Sfoglia il codice sorgente

Merge branch 'issue_o' of git.fafadiatech.com:harsh/pdf_parser into issue_o

Harsh Parikh 2 anni fa
parent
commit
2db275f44b
3 file modificati con 138 aggiunte e 1 eliminazione
  1. 4 1
      .gitignore
  2. 42 0
      expert/parse_expert_pdf.py
  3. 92 0
      expert/parse_expert_pdf_utils.py

+ 4 - 1
.gitignore

@@ -60,8 +60,11 @@ target/
 # Ignoring all pdfs and test files.
 *.pdf
 *.xlsx
+
 complaints/test.py
 *.csv
 foo.py
 .~lock.Document Parser Fields.xlsx#
-.DS_Store
+.DS_Store
+#pdf
+pdfs/

+ 42 - 0
expert/parse_expert_pdf.py

@@ -0,0 +1,42 @@
+from parse_expert_pdf_utils import (
+    case_number_extraction,
+    expert_name_extraction,
+    plaintiff_extraction,
+    defendent_extraction,
+    patent_extraction,
+    on_behalf_of_extraction,
+    acronym_extraction,
+)
+import tika
+from tika import parser
+import re
+import warnings
+warnings.filterwarnings('ignore')
+
+
+def main(path=None):
+    """Parse one expert-report PDF with Tika and print the acronyms found.
+
+    Args:
+        path: Location of the PDF to parse. When omitted, the first
+            command-line argument is used; if there is none, the original
+            development sample path is used so existing invocations keep
+            working.
+    """
+    import sys  # function-scope import keeps this change self-contained
+
+    if path is None:
+        default_path = "/home/omkardesai/Code/pdf_parser/pdfs/expert_report1.pdf"
+        path = sys.argv[1] if len(sys.argv) > 1 else default_path
+
+    tika.initVM()
+    parsed_pdf = parser.from_file(path)
+    print(parsed_pdf.keys())
+
+    # The 'content' entry may be missing or None when no text could be
+    # extracted; fall back to an empty string instead of crashing on .strip().
+    raw_text = parsed_pdf.get('content') or ''
+    # NOTE: newlines are removed outright (not replaced by a space), so the
+    # last word of one line is glued to the first word of the next.
+    content = raw_text.strip().replace('\n', '')
+
+    # Only acronym extraction is enabled here; the other imported extractors
+    # take the same `content` string.
+    acronym = acronym_extraction(content)
+    print(acronym)
+
+
+if __name__ == "__main__":
+    main()

+ 92 - 0
expert/parse_expert_pdf_utils.py

@@ -0,0 +1,92 @@
+from collections import defaultdict
+import re
+from distutils.filelist import findall
+
+
+def case_number_extraction(content):
+    """Return the distinct case numbers that follow "Case No." in *content*.
+
+    A case number is matched as: one digit, a colon, two digits, two word
+    characters, five digits and three word characters, separated by hyphens
+    (for example ``2:17-cv-00123-JRG``). Matching is case-insensitive.
+
+    Args:
+        content: Text to search.
+
+    Returns:
+        A list of unique case-number strings. The results pass through a
+        set, so their order is unspecified.
+    """
+    pattern = re.compile(
+        r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})",
+        re.IGNORECASE,
+    )
+    unique_numbers = {match.group(1) for match in pattern.finditer(content)}
+    return list(unique_numbers)
+
+
+def expert_name_extraction(content):
+    """Return the distinct names that follow "EXPERT REPORT OF" in *content*.
+
+    The name is the shortest run of text between "EXPERT REPORT OF " and the
+    next comma. Matching is case-insensitive. After the comma the pattern
+    greedily consumes the remainder of the line, so at most one name is
+    captured per line of input.
+
+    Args:
+        content: Text to search.
+
+    Returns:
+        A list of unique name strings in unspecified order.
+    """
+    pattern = re.compile(
+        r"\bEXPERT\sREPORT\sOF\s(.*?),.*(REGARDING|Invalidity)?",
+        re.IGNORECASE,
+    )
+    names = set()
+    for match in pattern.finditer(content):
+        names.add(match.group(1))
+    return list(names)
+
+
+def plaintiff_extraction(content):
+    """Return the text found between "DIVISION" and "Plaintiff," in *content*.
+
+    For every occurrence, the shortest span (newlines included) between a
+    word-initial "DIVISION" and the following "Plaintiff," is captured and
+    stripped of surrounding whitespace. Matching is case-insensitive.
+
+    Args:
+        content: Text to search.
+
+    Returns:
+        A list of captured strings in order of appearance; duplicates are
+        kept.
+    """
+    pattern = re.compile(r"\bDIVISION([\s\S]*?)Plaintiff\,", re.IGNORECASE)
+    return [match.group(1).strip() for match in pattern.finditer(content)]
+
+
+def defendent_extraction(content):
+    """Return the text found between "v." and "Defendant" in *content*.
+
+    A match starts at "Plaintiff", skips greedily along the same line to a
+    "v.", and then captures the shortest span (newlines included) up to the
+    next "Defendant". Captured text is stripped of surrounding whitespace.
+    Matching is case-insensitive.
+
+    Args:
+        content: Text to search.
+
+    Returns:
+        A list of captured strings in order of appearance; duplicates are
+        kept.
+    """
+    pattern = re.compile(r"Plaintiff.*v\.([\s\S]*?)Defendant", re.IGNORECASE)
+    found = []
+    for match in pattern.finditer(content):
+        captured = match.group(1)
+        found.append(captured.strip())
+    return found
+
+
+def patent_extraction(content):
+    """Return every "U.S. PATENT NO. d,ddd,ddd" reference found in *content*.
+
+    The number may start with one or two digits before the first comma, so
+    both seven-digit (``7,654,321``) and eight-digit (``10,123,456``) patent
+    numbers are recognised. Matching is case-sensitive. The character after
+    "NO" is deliberately left as "any character" to stay compatible with the
+    inputs the previous pattern accepted.
+
+    Args:
+        content: Text to search.
+
+    Returns:
+        A list of the full matched strings in order of appearance;
+        duplicates are kept.
+    """
+    regex = r"(U\.S\.\sPATENT\sNO.\s\d{1,2}\,\d{3}\,\d{3})"
+    return re.findall(regex, content)
+
+
+def law_firm_extraction(content):
+    """Return law-firm names found in *content* (pattern not yet written).
+
+    The search pattern is still an empty placeholder. An empty pattern
+    matches at every position and has no capture group, so running it would
+    raise IndexError on the first match; until a real pattern is supplied
+    the function reports no results instead.
+
+    Args:
+        content: Text to search.
+
+    Returns:
+        A list of stripped captured strings in order of appearance; always
+        empty while the pattern is a placeholder.
+    """
+    regex = r""  # TODO: supply a pattern with one capture group
+    if not regex:
+        return []
+    firm = re.compile(regex, re.IGNORECASE)
+    return [match.group(1).strip() for match in firm.finditer(content)]
+
+
+def on_behalf_of_extraction(content):
+    """Return the text between "on behalf of" and the next "Case"/"case".
+
+    Only the first occurrence is considered, and the captured text must sit
+    on a single line. The pattern is a raw string so that ``\\s`` reaches the
+    regex engine without triggering an invalid-escape warning.
+
+    Args:
+        content: Text to search.
+
+    Returns:
+        The captured text stripped of surrounding whitespace, or an empty
+        string when the phrase does not occur (previously this case raised
+        AttributeError).
+    """
+    regex = r"on\sbehalf\sof(.*?)(C|c)ase"
+    match = re.search(regex, content)
+    if match is None:
+        return ""
+    return match.group(1).strip()
+
+
+def hourly_compensation(content):
+    """Placeholder: not implemented yet; ignores *content* and returns None."""
+    return None
+
+
+def ref_patents(content):
+    """Placeholder: not implemented yet; ignores *content* and returns None."""
+    return None
+
+
+def acronym_extraction(content):
+    """Return the distinct parenthesised upper-case tokens in *content*.
+
+    A token is an opening parenthesis, one or more ASCII capital letters and
+    a closing parenthesis, e.g. ``(PTO)``. The parentheses are part of each
+    returned string.
+
+    Args:
+        content: Text to search.
+
+    Returns:
+        A list of unique tokens in unspecified order.
+    """
+    pattern = re.compile(r"\([A-Z]+\)")
+    unique_tokens = {match.group(0) for match in pattern.finditer(content)}
+    return list(unique_tokens)