Просмотр исходного кода

updated parsers for expert_report

Harsh Parikh 2 года назад
Родитель
Коммит
5680d46a1b
2 изменённых файла: 70 добавлений и 59 удалений
  1. 25 18
      expert_report/parse_expert_pdf.py
  2. 45 41
      expert_report/parse_expert_pdf_utils.py

+ 25 - 18
expert_report/parse_expert_pdf.py

@@ -1,11 +1,15 @@
 from parse_expert_pdf_utils import (
-    date_extraction,
-    address_extraction,
-    refer_exteraction,
+    defendent_extraction,
+    plaintiff_extraction,
+    defendent_extraction,
+    expert_name_extraction,
+    case_number_extraction,
 )
 import tika
 import os
 from tika import parser
+import pandas as pd
+from collections import defaultdict
 
 tika.initVM()
 import warnings
@@ -13,25 +17,28 @@ import warnings
 warnings.filterwarnings("ignore")
 
 
-def main(path):
+def main(path: str) -> pd.DataFrame:
+    """
+    The functions iterates through all the given files and gathers the data in the \
+    form of a dataframe
+    """
+    data = defaultdict(list)
     required_files = [file for file in os.listdir(path) if file.find(".pdf") != -1]
-    for idx, file in enumerate(required_files):
-        print(idx, file)
+    for file in required_files:
         print(file)
-        content = parser.from_file(file)["content"].strip().replace("\n", "")
+        content = parser.from_file(file)["content"]
+        data["file"].append(file)
+        data["full_name"].append(expert_name_extraction(content))
+        data["plaintiff"].append(plaintiff_extraction(content))
+        data["defendant"].append(defendent_extraction(content))
+        data["case_number"].append(case_number_extraction(content))
 
-        # content = parsed_pdf["content"].strip().replace("\n", "")
-        # print(content)
-        date = date_extraction(content)
-        print(date)
-        address = address_extraction(content)
-        print(address)
-        refer = refer_exteraction(content)
-        print(refer)
+    df = pd.DataFrame(data)
+    return df
 
 
 if __name__ == "__main__":
     HOME_DIR = os.path.expanduser("~")
-    BASE_DIR = "Code/pdf_parser/pdfs"
-    path = os.path.join(HOME_DIR, BASE_DIR)
-    main(path)
+    BASE_DIR = "Code/pdf_parser/expert_report"
+    PATH = os.path.join(HOME_DIR, BASE_DIR)
+    print(main(PATH))

+ 45 - 41
expert_report/parse_expert_pdf_utils.py

@@ -16,9 +16,9 @@ def address_extraction(content):
     """
     extracts address from the documents.
     """
-    regex = r"Address\:\s([\s\S].*)www"
+    regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
     try:
-        print(re.search(regex, content).groups()[0])
+        return re.search(regex_address, content).group(0)
     except:
         return "None"
 
@@ -38,59 +38,60 @@ def refer_exteraction(content):
 
 
 def case_number_extraction(content):
-    # dict_case_numbers = defaultdict(int)
-    # case_number_info = re.findall("Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", content)
-    # case_number = ""
-    # for element in case_number_info:
-    #     dict_case_numbers[element] += 1
-    # for mykey, value in dict_case_numbers.items():
-    #     case_number = mykey
-    # return case_number
+    """
+    Extracts the case number from the documents.
+    """
     regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})"  # Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})
-    results = []
+    results = set()
     case_number = re.compile(regex, re.IGNORECASE)
     for current in case_number.finditer(content):
-        results.append(current.groups()[0])
-    return list(set(results))
+        results.add(current.groups()[0])
+    return list(results)
 
 
 def expert_name_extraction(content):
-    regex = r"\bEXPERT\sREPORT\sOF\s(.*?),.*(REGARDING|Invalidity)?"  # "\bEXPERT\sREPORT\sOF\s(.+?),"
-    results = []
-    expert_name = re.compile(regex, re.IGNORECASE)
-    for current in expert_name.finditer(content):
-        results.append(current.groups()[0])
-    return list(set(results))
+    """
+    Extracts the name of the expert from the document.
+    """
+    regex = r"(REPORT|DECLARATION)\sOF(\s(DR.)?\s?\w+\s(.*?\.)?\s?\w+)"
+    try:
+        return re.search(regex, content).group(2)
+    except:
+        return "None"
 
 
 def plaintiff_extraction(content):
-    regex = r"\bDIVISION([\s\S]*?)Plaintiff\,"  # "OF\s\w+(\s.*?)\,.*Plaintiff"  # "(.*)\s\nPlaintiff,"
-    results = []
-    plaintiff = re.compile(regex, re.IGNORECASE)
-    for current in plaintiff.finditer(content):
-        results.append(current.groups()[0].strip())
-    return results
+    """
+    Extracts the plaintiff from the document
+    """
+    regex = r"(\w.*)\n\s?\n?\s?(Plaintiffs?|Petitioner)"
+    try:
+        return re.search(regex, content).group(1)
+    except:
+        return "None"
 
 
 def defendent_extraction(content):
-    # "Plaintiff.*\n.*v\.([\s\S]*?)Defendant"   # "\bv\.([\s\S]*?)Defendant"
-    regex = r"Plaintiff.*v\.([\s\S]*?)Defendant"
-    results = []
-    defendent = re.compile(regex, re.IGNORECASE)
-    for current in defendent.finditer(content):
-        results.append(current.groups()[0].strip())
-    return results
+    """
+    Extracts the defendant from the document
+    """
+    regex = r"(\w.*?)\n\s?\n?\s\s?(Defendants|Patent\sOwners?)"
+    try:
+        return re.search(regex, content).group(1)
+    except:
+        return "None"
 
 
 def patent_extraction(content):
-    regex = r"(U\.S\.\sPATENT\sNO.\s\d\,\d{3}\,\d{3})"  # "U\.S\.\sPATENT\sNO.*\d{1,2}\,\d{3}\,\d{3}"
-    # results = []
-    # patent = re.compile(regex, re.IGNORECASE)
-    # for current in patent.finditer(content):
-    #     results.append(current.groups())
-    # patent_info = re.search(regex, content).groups()[0].strip()
-    patent_info = re.findall(regex, content)
-    return patent_info
+    """
+    Extracts patent numbers from the document
+    """
+    regex = r"\d{1,3}\,\d{1,3}\,\d{3}\,?"
+    results = []
+    patent = re.compile(regex, re.IGNORECASE)
+    for current in patent.finditer(content):
+        results.append(current.group().replace(",", ""))
+    return list(set(results))
 
 
 def law_firm_extraction(content):
@@ -109,7 +110,10 @@ def on_behalf_of_extraction(content):
 
 
 def hourly_compensation(content):
-    regex = "\$\s\d+"
+    """
+    Returns the hourly compensation of the expert.
+    """
+    regex = "\$\s?\d+"
     pay = re.findall(regex, content)
     return pay