Quellcode durchsuchen

added more parsers for expert report

Harsh Parikh vor 2 Jahren
Ursprung
Commit
decf3ff912
2 geänderte Dateien mit 13 neuen und 6 gelöschten Zeilen
  1. 5 0
      expert_report/parse_expert_pdf.py
  2. 8 6
      expert_report/parse_expert_pdf_utils.py

+ 5 - 0
expert_report/parse_expert_pdf.py

@@ -8,6 +8,8 @@ from parse_expert_pdf_utils import (
     defendent_extraction,
     expert_name_extraction,
     case_number_extraction,
+    patent_extraction,
+    hourly_compensation,
 )
 
 
@@ -26,11 +28,14 @@ def main(path: str) -> pd.DataFrame:
         content = parser.from_file(file)["content"]
         data["file"].append(file)
         data["full_name"].append(expert_name_extraction(content))
+        data["hourly_pay"].append(hourly_compensation(content))
         data["plaintiff"].append(plaintiff_extraction(content))
         data["defendant"].append(defendent_extraction(content))
         data["case_number"].append(case_number_extraction(content))
+        data["patents"].append(patent_extraction(content))
 
     data_expert = pd.DataFrame(data)
+    data_expert.to_csv("required_data.csv")
     return data_expert
 
 

+ 8 - 6
expert_report/parse_expert_pdf_utils.py

@@ -87,11 +87,11 @@ def patent_extraction(content):
     Extracts patent numbers from the document
     """
     regex = r"\d{1,3}\,\d{1,3}\,\d{3}\,?"
-    results = []
+    result = set()
     patent = re.compile(regex, re.IGNORECASE)
     for current in patent.finditer(content):
-        results.append(current.group().replace(",", ""))
-    return list(set(results))
+        result.add(current.group().replace(",", ""))
+    return list(result)
 
 
 def law_firm_extraction(content):
@@ -113,9 +113,11 @@ def hourly_compensation(content):
     """
     Returns the hourly compensation of the expert.
     """
-    regex = "\$\s?\d+"
-    pay = re.findall(regex, content)
-    return pay
+    regex = "\$\s?\d{3,4}"
+    try:
+        return re.search(regex, content).group(0)
+    except:
+        return "None"
 
 
 def ref_patents(content):