Bladeren bron

Added a regex for address extraction and removed .strip() and .replace("\n", "") from the content-extraction line

Harsh Parikh 2 jaar geleden
bovenliggende
commit
904c83576e
2 gewijzigde bestanden met 3 toevoegingen en 3 verwijderingen
  1. 1 1
      pdfs/pdf_parser.py
  2. 2 2
      pdfs/utils.py

+ 1 - 1
pdfs/pdf_parser.py

@@ -21,7 +21,7 @@ def main(PATH):
     data_dict = defaultdict(list)
     required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
     for file in required_files:
-        content = parser.from_file(file)["content"].strip().replace("\n", "")
+        content = parser.from_file(file)["content"]
         data_dict["document_name"].append(file)
         data_dict["filing_date"].append(extract_filing_date(content))
         data_dict["address"].append(address_extraction(content))

+ 2 - 2
pdfs/utils.py

@@ -17,9 +17,9 @@ def address_extraction(content):
     """
     extracts address from the documents.
     """
-    regex = r"(OfficeAddress\:\s([\s\S].*)www|(A|a)ddress.+?\sof.+?Fax\:\s\(\d{3}\)\s\d{3}\-\d+\s)"
+    regex_address = r"\w+.*\n\w+.*\n\w+.*\w\.?{2,4}\s\d{5}"
     try:
-        re.search(regex, content).groups()[0]
+        re.search(regex_address, content).group(0)
     except:
         return "None"