Преглед изворног кода

updated address code and changed function name

Harsh Parikh пре 2 година
родитељ
комит
41c5679d47
2 измењена фајла са 5 додатих и 5 уклоњених редова
  1. 2 2
      pdfs/pdf_parser.py
  2. 3 3
      pdfs/utils.py

+ 2 - 2
pdfs/pdf_parser.py

@@ -6,7 +6,7 @@ from tika import parser
 from utils import (
     extract_filing_date,
     address_extraction,
-    refer_exteraction,
+    extract_reference,
     email_extraction,
     telephone_number_extraction,
 )
@@ -25,7 +25,7 @@ def main(PATH):
         data_dict["document_name"].append(file)
         data_dict["filing_date"].append(extract_filing_date(content))
         data_dict["address"].append(address_extraction(content))
-        data_dict["refer"].append(refer_exteraction(content))
+        data_dict["references"].append(extract_reference(content))
         data_dict["email"].append(email_extraction(content))
         data_dict["telephone_number"].append(telephone_number_extraction(content))
 

+ 3 - 3
pdfs/utils.py

@@ -17,14 +17,14 @@ def address_extraction(content):
     """
     extracts address from the documents.
     """
-    regex_address = r"\w+.*\n\w+.*\n\w+.*\w\.?{2,4}\s\d{5}"
+    regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
     try:
-        re.search(regex_address, content).group(0)
+        return re.search(regex_address, content).group(0)
     except:
         return "None"
 
 
-def refer_exteraction(content):
+def extract_reference(content):
     """
     extract referals from the documents.
     """