Selaa lähdekoodia

ref #1: fixed merge conflict

Omkar Desai 2 vuotta sitten
vanhempi
commit
0277ca0712

+ 6 - 1
.gitignore

@@ -57,9 +57,14 @@ docs/_build/
 
 # PyBuilder
 target/
-# Ignoring all pdfs
+# Ignoring all pdfs and test files.
 *.pdf
 *.xlsx
 
+complaints/test.py
+*.csv
+foo.py
+.~lock.Document Parser Fields.xlsx#
+.DS_Store
 #pdf
 pdfs/

+ 1 - 1
LICENSE

@@ -1,5 +1,5 @@
 MIT License
-Copyright (c) <year> <copyright holders>
+Copyright (c) 2022 Fafadiatech
 
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
 

+ 10 - 8
complaints/parse_pdf.py

@@ -1,6 +1,6 @@
 from collections import defaultdict
-from IPython.display import display
 import os
+from IPython.display import display
 import tika
 import pandas as pd
 
@@ -14,6 +14,7 @@ from parse_pdf_utils import (
     extract_firm_name,
     extract_patent_number,
     extract_plaintiff,
+    extract_attorney_name,
 )
 
 
@@ -26,16 +27,17 @@ def extract_all(PATH):
     for idx, file in enumerate(required_files):
         print(idx, file)
         parse_file = parser.from_file(file)["content"].strip().replace("\n", "")
-        # data_dict["case_number_list"].append(extract_case_number(content=parse_file))
-        # data_dict["filing_date_list"].append(extract_filing_date(content=parse_file))
+        data_dict["case_number_list"].append(extract_case_number(content=parse_file))
+        data_dict["filing_date_list"].append(extract_filing_date(content=parse_file))
         data_dict["plaintiff_list"].append(extract_plaintiff(content=parse_file))
-        # data_dict["defendent_list"].append(extract_defendent(content=parse_file))
-        # data_dict["acronyms_list"].append(extract_acronyms(content=parse_file))
-        # data_dict["firm_name_list"].append(extract_firm_name(content=parse_file))
-        # data_dict["patent_list"].append(extract_patent_number(content=parse_file))
+        data_dict["defendent_list"].append(extract_defendent(content=parse_file))
+        data_dict["acronyms_list"].append(extract_acronyms(content=parse_file))
+        data_dict["firm_name_list"].append(extract_firm_name(content=parse_file))
+        data_dict["attorney"].append(extract_attorney_name(content=parse_file))
+        data_dict["patent_list"].append(extract_patent_number(content=parse_file))
 
     data = pd.DataFrame(data_dict)
-    # data.to_csv("required_data.csv", index=False)
+    data.to_csv("required_data.csv", index=False)
     return data
 
 

+ 19 - 14
complaints/parse_pdf_utils.py

@@ -35,17 +35,6 @@ def extract_hourly_compensation(content):
     return list(set(results))
 
 
-def extract_expert_name(content):
-    """
-    Returns the name of the expert
-    """
-    results = []
-    exp_name = re.compile(r"\b(REPORT OF ).*\S[.]")
-    for current in exp_name.finditer(content):
-        results.append(current.group().replace(",", ""))
-    return list(set(results))
-
-
 def extract_plaintiff(content):
     """
     Returns the name of the plaintiff
@@ -61,8 +50,9 @@ def extract_plaintiff(content):
 def extract_defendent(content):
     """
     Returns the name of the defendant
+    Plaintiff[s]?.*v[s]?\.(.*?)Defendant[s]?\.
     """
-    regex = r"Plaintiff[s]?.*v[s]?\.(.*?)Defendant[s]?\."
+    regex = r"Plaintiff[s]?\,.*?[v|V]\.(.*?)Defendant[s]?\.?"
     try:
         return re.search(regex, content).groups()[0]
     except:
@@ -72,12 +62,15 @@ def extract_defendent(content):
 def extract_acronyms(content):
     """
     Returns the list of all the acronyms present
+    \(["|“](\w{1,10})
     """
-    regex = r"\(“(\w{3})”\)"
+    regex = r'\(["|“](\w{1,10})["|”]\)'
     results = []
     plaintiff = re.compile(regex, re.IGNORECASE)
     for current in plaintiff.finditer(content):
-        results.append(current.group().replace(",", ""))
+        results.append(current.groups()[0].replace(",", ""))
+    if len(results) == 0:
+        return "None"
     return list(set(results))
 
 
@@ -101,3 +94,15 @@ def extract_filing_date(content):
         return re.search(r"(Filed)\s(\d{2}\/\d{2}\/\d{2})", content).groups()[1]
     except:
         return "None"
+
+
def extract_attorney_name(content):
    """
    Return the unique attorney signature lines found in the document.

    Matches electronic-signature markers of the form
    "/s/ First M. Last" (slash-s-slash, first name, middle initial
    with a period, last name).
    """
    signature = re.compile(r"\/s\/\s\w+\s\w\.\s\w+", re.IGNORECASE)
    names = {hit.group().replace(",", "") for hit in signature.finditer(content)}
    return list(names)

+ 0 - 20
complaints/test.py

@@ -1,20 +0,0 @@
-import re
-import tika
-from tika import parser
-
-tika.initVM()
-
-
-if __name__ == "__main__":
-    data = (
-        parser.from_file(
-            "/home/ftech/Desktop/harsh_parikh_codes/PDF_Scrapper/Complaint/document.pdf"
-        )["content"]
-        .strip()
-        .replace("\n", "")
-    )
-    print(data)
-    # pattern = r"OF\s\w+(\s.*?\,).*?Plaintiff"
-    # print(re.search(pattern, data).groups()[0])
-    # pattern = r"Plaintiff[s]?.*v[s]?\.(.*)Defendant\."
-    # re.search(pattern, data).group()

+ 45 - 0
expert_resume/parse_expert_resume.py

@@ -0,0 +1,45 @@
+from collections import defaultdict
+import os
+from IPython.display import display
+import tika
+import pandas as pd
+
+tika.initVM()
+from tika import parser
+
+from parse_resume_utils import (
+    extract_email,
+    extract_phone,
+    extract_zipcode,
+    extract_case_numbers,
+    extract_litigation_experience,
+    extract_patents_issued,
+)
+
+
def main(PATH):
    """
    Parse every PDF resume under PATH and return the extracted fields
    (email, phone, zipcode, case numbers, litigation experience,
    patents issued) as a pandas DataFrame — one row per file.
    """
    data_dict = defaultdict(list)
    # endswith() is stricter than the previous `file.find(".pdf") != -1`,
    # which also matched names like "notes.pdf.bak".
    required_files = [file for file in os.listdir(PATH) if file.endswith(".pdf")]
    for idx, file in enumerate(required_files):
        print(idx, file)
        # Bug fix: join the filename with PATH — the bare name returned by
        # os.listdir only resolved when the current working directory
        # happened to be PATH.
        parse_content = (
            parser.from_file(os.path.join(PATH, file))["content"]
            .strip()
            .replace("\n", "")
        )
        data_dict["email"].append(extract_email(parse_content))
        data_dict["phone"].append(extract_phone(parse_content))
        data_dict["zipcode"].append(extract_zipcode(parse_content))
        data_dict["cases"].append(extract_case_numbers(parse_content))
        data_dict["litigation_experience"].append(
            extract_litigation_experience(parse_content)
        )
        data_dict["patents_issued"].append(extract_patents_issued(parse_content))

    return pd.DataFrame(data_dict)
+
+
+if __name__ == "__main__":
+    HOME_DIR = os.path.expanduser("~")
+    BASE_DIR = "Code/pdf_parser/expert_resume"
+    PATH = os.path.join(HOME_DIR, BASE_DIR)
+    display(main(PATH))

+ 68 - 0
expert_resume/parse_resume_utils.py

@@ -0,0 +1,68 @@
+import re
+
+
def extract_email(content):
    """
    Extract the expert's email address from the resume text.

    Returns the first email-like token found, or the string "None" when
    no address is present (the no-match sentinel used by the module's
    other extractors).
    """
    # Broadened from [a-z0-9]+@[a-z]+\.[a-z]{2,3}: the old pattern missed
    # dots/underscores/plus/hyphens in the local part, subdomains,
    # uppercase letters, and TLDs longer than three characters.
    try:
        pattern = r"([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"
        return re.search(pattern, content).groups()[0]
    except AttributeError:  # re.search returned None -> no match
        return "None"
+
+
def extract_zipcode(content):
    """
    Extract the location token from the resume.

    NOTE(review): the pattern actually captures two word characters
    followed by five digits (e.g. "TX 75201"), i.e. state code + ZIP,
    not the ZIP alone — kept as-is to preserve downstream output.
    Returns the string "None" when nothing matches.
    """
    try:
        pattern = r"(\w{2}\s\d{5})"
        return re.search(pattern, content).groups()[0]
    # Narrowed from a bare `except:`, which silently swallowed every
    # error (including SystemExit/KeyboardInterrupt).
    except AttributeError:  # no match
        return "None"
+
+
def extract_phone(content):
    """
    Extract the expert's phone number (US formats such as
    "(555) 123-4567" or "555-123-4567").

    Returns the string "None" when no phone number is present.
    """
    try:
        pattern = r"(\(?\d{3}\)?\-?\s?\d{3}\-\d{4})"
        return re.search(pattern, content).group()
    # Narrowed from a bare `except:` so real errors are not hidden.
    except AttributeError:  # no match
        return "None"
+
+
def extract_case_numbers(content):
    """
    Return the unique court case numbers found in the resume,
    e.g. "21-cv-01234-AB-CD", with any commas stripped.
    """
    pattern = re.compile(r"\d{2}\-\w+\-\d+\-\w+\-?\w+", re.IGNORECASE)
    unique = {hit.group().replace(",", "") for hit in pattern.finditer(content)}
    return list(unique)
+
+
def extract_litigation_experience(content):
    """
    Extract the litigation-experience phrase, e.g. "15 years" or
    "twenty years".

    Returns the string "None" when no "<token> years" phrase exists.
    """
    # (\d+|\w+) grabs whichever word immediately precedes "years" —
    # a digit count or a spelled-out number (any word matches, though).
    try:
        pattern = r"(\d+|\w+)\s?years"
        return re.search(pattern, content).group()
    # Narrowed from a bare `except:` so real errors are not hidden.
    except AttributeError:  # no match
        return "None"
+
+
def extract_patents_issued(content):
    """
    Return the unique patent numbers found in the resume, with the
    thousands-separator commas stripped (e.g. "7,654,321" -> "7654321").
    """
    pattern = re.compile(r"\d{1,3}\,\d{1,3}\,\d{3}\,?", re.IGNORECASE)
    unique = {hit.group().replace(",", "") for hit in pattern.finditer(content)}
    return list(unique)