Selaa lähdekoodia

fixed regex for defendant and modified .gitignore to ignore csv files

Harsh Parikh 2 vuotta sitten
vanhempi
commit
dcceba8965
4 muutettua tiedostoa jossa 13 lisäystä ja 29 poistoa
  1. 4 1
      .gitignore
  2. 7 7
      complaints/parse_pdf.py
  3. 2 1
      complaints/parse_pdf_utils.py
  4. 0 20
      complaints/test.py

+ 4 - 1
.gitignore

@@ -57,6 +57,9 @@ docs/_build/
 
 # PyBuilder
 target/
-# Ignoring all pdfs
+# Ignoring all pdfs and test files.
 *.pdf
 *.xlsx
+complaints/test.py
+*.csv
+foo.py

+ 7 - 7
complaints/parse_pdf.py

@@ -26,16 +26,16 @@ def extract_all(PATH):
     for idx, file in enumerate(required_files):
         print(idx, file)
         parse_file = parser.from_file(file)["content"].strip().replace("\n", "")
-        # data_dict["case_number_list"].append(extract_case_number(content=parse_file))
-        # data_dict["filing_date_list"].append(extract_filing_date(content=parse_file))
+        data_dict["case_number_list"].append(extract_case_number(content=parse_file))
+        data_dict["filing_date_list"].append(extract_filing_date(content=parse_file))
         data_dict["plaintiff_list"].append(extract_plaintiff(content=parse_file))
-        # data_dict["defendent_list"].append(extract_defendent(content=parse_file))
-        # data_dict["acronyms_list"].append(extract_acronyms(content=parse_file))
-        # data_dict["firm_name_list"].append(extract_firm_name(content=parse_file))
-        # data_dict["patent_list"].append(extract_patent_number(content=parse_file))
+        data_dict["defendent_list"].append(extract_defendent(content=parse_file))
+        data_dict["acronyms_list"].append(extract_acronyms(content=parse_file))
+        data_dict["firm_name_list"].append(extract_firm_name(content=parse_file))
+        data_dict["patent_list"].append(extract_patent_number(content=parse_file))
 
     data = pd.DataFrame(data_dict)
-    # data.to_csv("required_data.csv", index=False)
+    data.to_csv("required_data.csv", index=False)
     return data
 
 

+ 2 - 1
complaints/parse_pdf_utils.py

@@ -61,8 +61,9 @@ def extract_plaintiff(content):
 def extract_defendent(content):
     """
     Returns the name of the defendant
+    Previous pattern: Plaintiff[s]?.*v[s]?\.(.*?)Defendant[s]?\.
     """
-    regex = r"Plaintiff[s]?.*v[s]?\.(.*?)Defendant[s]?\."
+    regex = r"Plaintiff[s]?\,.*?[v|V]\.(.*?)Defendant[s]?\.?"
     try:
         return re.search(regex, content).groups()[0]
     except:

+ 0 - 20
complaints/test.py

@@ -1,20 +0,0 @@
-import re
-import tika
-from tika import parser
-
-tika.initVM()
-
-
-if __name__ == "__main__":
-    data = (
-        parser.from_file(
-            "/home/ftech/Desktop/harsh_parikh_codes/PDF_Scrapper/Complaint/document.pdf"
-        )["content"]
-        .strip()
-        .replace("\n", "")
-    )
-    print(data)
-    # pattern = r"OF\s\w+(\s.*?\,).*?Plaintiff"
-    # print(re.search(pattern, data).groups()[0])
-    # pattern = r"Plaintiff[s]?.*v[s]?\.(.*)Defendant\."
-    # re.search(pattern, data).group()