преди 2 години · 2352bd3c3c
--- a/.gitignore
+++ b/.gitignore
@@ -59,3 +59,4 @@ docs/_build/
 
				 target/
			
 
				 # Ignoring all pdfs
			
 
				 *.pdf
			
 
				+*.xlsx
			
--- a/README.md
+++ b/README.md
@@ -1,3 +1,37 @@
 
				 # pdf_parser
			
 
				+## All code related to PDF parsing
			
 
				 
			
 
				-All codes related to pdf parsing.
			
 
				+### The following elements are to be parsed from documents.
			
 
				+1. Documents
			
 
				+    1. Extracting dates from documents
			
 
				+    1. Classification Tags
			
 
				+    1. Extracting Key Entities from documents
			
 
				+        1. Patents
			
 
				+        1. References
			
 
				+        1. Entities
			
 
				+            1. Names
			
 
				+            1. Addresses
			
 
				+            1. Law Firms
			
 
				+            1. Contact Numbers
			
 
				+            1. Emails
			
 
				+    1. Association with Cases
			
 
				+
			
 
				+### Setting up the code base.
			
 
				+1. Launch the terminal.
			
 
				+1. Enter the following command to go to the base directory:
			
 
				+   ```bash
			
 
				+   cd ~
			
 
				+   ```
			
 
				+1. Make a new directory `Code` by using the following command:
			
 
				+   ```bash
			
 
				+   mkdir Code
			
 
				+   ```
			
 
				+1. Clone the current repository into `Code/pdf_parser` by entering the following command:
			
 
				+   ```bash
			
 
				+   git clone gogs@git.fafadiatech.com:harsh/pdf_parser.git Code/pdf_parser
			
 
				+   ```
			
 
				+
			
 
				+### TODO LIST:
			
 
				+1. Implementing OCR on tika.
			
 
				+1. Dockerising the whole apache tika with ocr.
			
 
				+1. Testing the regular expressions on scanned PDFs.
			
--- a/complaints/parse_pdf.py
+++ b/complaints/parse_pdf.py
@@ -0,0 +1,46 @@
 
				+from collections import defaultdict
			
 
				+from IPython.display import display
			
 
				+import os
			
 
				+import tika
			
 
				+import pandas as pd
			
 
				+
			
 
				+tika.initVM()
			
 
				+from tika import parser
			
 
				+from parse_pdf_utils import (
			
 
				+    extract_acronyms,
			
 
				+    extract_case_number,
			
 
				+    extract_defendent,
			
 
				+    extract_filing_date,
			
 
				+    extract_firm_name,
			
 
				+    extract_patent_number,
			
 
				+    extract_plaintiff,
			
 
				+)
			
 
				+
			
 
				+
			
 
				+def extract_all(PATH):
			
 
				+    """
			
 
				+    Returns all the required data from the pdfs in a dataframe format.
			
 
				+    """
			
 
				+    data_dict = defaultdict(list)
			
 
				+    required_files = [file for file in os.listdir(PATH) if file.find(".pdf") != -1]
			
 
				+    for idx, file in enumerate(required_files):
			
 
				+        print(idx, file)
			
 
				+        parse_file = parser.from_file(file)["content"].strip().replace("\n", "")
			
 
				+        # data_dict["case_number_list"].append(extract_case_number(content=parse_file))
			
 
				+        # data_dict["filing_date_list"].append(extract_filing_date(content=parse_file))
			
 
				+        data_dict["plaintiff_list"].append(extract_plaintiff(content=parse_file))
			
 
				+        # data_dict["defendent_list"].append(extract_defendent(content=parse_file))
			
 
				+        # data_dict["acronyms_list"].append(extract_acronyms(content=parse_file))
			
 
				+        # data_dict["firm_name_list"].append(extract_firm_name(content=parse_file))
			
 
				+        # data_dict["patent_list"].append(extract_patent_number(content=parse_file))
			
 
				+
			
 
				+    data = pd.DataFrame(data_dict)
			
 
				+    # data.to_csv("required_data.csv", index=False)
			
 
				+    return data
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    BASE_DIR = "Code/pdf_parser/complaints"
			
 
				+    HOME_DIR = os.path.expanduser("~")
			
 
				+    PATH = os.path.join(HOME_DIR, BASE_DIR)
			
 
				+    display(extract_all(PATH))
			
--- a/complaints/parse_pdf_utils.py
+++ b/complaints/parse_pdf_utils.py
@@ -0,0 +1,103 @@
 
				+import re
			
 
				+
			
 
				+
			
 
				+def extract_patent_number(content):
			
 
				+    """
			
 
				+    Returns the list of unique patent_numbers in the document
			
 
				+    """
			
 
				+    regex = r"\d{1,3}\,\d{1,3}\,\d{3}"
			
 
				+    results = []
			
 
				+    patent = re.compile(regex, re.IGNORECASE)
			
 
				+    for current in patent.finditer(content):
			
 
				+        results.append(current.group().replace(",", ""))
			
 
				+    return list(set(results))
			
 
				+
			
 
				+
			
 
				+def extract_case_number(content):
			
 
				+    """
			
 
				+    Returns the lisr of unique case_numbers
			
 
				+    """
			
 
				+    regex = r"(\d{1,})\:(\d{1,}\-\w{1,}\-\d{5,}\-\w+)+"
			
 
				+    try:
			
 
				+        return re.search(regex, content).groups()[1]
			
 
				+    except:
			
 
				+        return "None"
			
 
				+
			
 
				+
			
 
				+def extract_hourly_compensation(content):
			
 
				+    """
			
 
				+    Returns hourly compensation.
			
 
				+    """
			
 
				+    results = []
			
 
				+    hourly_comp_re = re.compile("\$\d{1,20}", re.IGNORECASE)
			
 
				+    for current in hourly_comp_re.finditer(content):
			
 
				+        results.append(current.group().replace(",", ""))
			
 
				+    return list(set(results))
			
 
				+
			
 
				+
			
 
				+def extract_expert_name(content):
			
 
				+    """
			
 
				+    Returns the name of the expert
			
 
				+    """
			
 
				+    results = []
			
 
				+    exp_name = re.compile(r"\b(REPORT OF ).*\S[.]")
			
 
				+    for current in exp_name.finditer(content):
			
 
				+        results.append(current.group().replace(",", ""))
			
 
				+    return list(set(results))
			
 
				+
			
 
				+
			
 
				+def extract_plaintiff(content):
			
 
				+    """
			
 
				+    Returns the name of the plaintiff
			
 
				+    previous = (OF\s\w{1,})(.*)Plaintiff[s]?,
			
 
				+    """
			
 
				+    regex = r"OF\s\w+(\s.*?\,).*?Plaintiff"
			
 
				+    try:
			
 
				+        return re.search(regex, content).groups()[0]
			
 
				+    except:
			
 
				+        return "None"
			
 
				+
			
 
				+
			
 
				+def extract_defendent(content):
			
 
				+    """
			
 
				+    Returns the name of the defendant
			
 
				+    """
			
 
				+    regex = r"Plaintiff[s]?.*v[s]?\.(.*?)Defendant[s]?\."
			
 
				+    try:
			
 
				+        return re.search(regex, content).groups()[0]
			
 
				+    except:
			
 
				+        return "None"
			
 
				+
			
 
				+
			
 
				+def extract_acronyms(content):
			
 
				+    """
			
 
				+    Returns the list of all the acronyms present
			
 
				+    """
			
 
				+    regex = r"\(“(\w{3})”\)"
			
 
				+    results = []
			
 
				+    plaintiff = re.compile(regex, re.IGNORECASE)
			
 
				+    for current in plaintiff.finditer(content):
			
 
				+        results.append(current.group().replace(",", ""))
			
 
				+    return list(set(results))
			
 
				+
			
 
				+
			
 
				+def extract_firm_name(content):
			
 
				+    """
			
 
				+    Returns the list of firm names present in the documents.
			
 
				+    """
			
 
				+    regex = r"(\(Firm\sName\,\sAddress\,\sand\sTelephone\sNumber\))([\r\n]+([^\r\n]+))"
			
 
				+    results = []
			
 
				+    firm = re.compile(regex, re.IGNORECASE)
			
 
				+    for current in firm.finditer(content):
			
 
				+        results.append(current.group().replace(",", ""))
			
 
				+    return list(set(results))
			
 
				+
			
 
				+
			
 
				+def extract_filing_date(content):
			
 
				+    """
			
 
				+    Returns the filing date.
			
 
				+    """
			
 
				+    try:
			
 
				+        return re.search(r"(Filed)\s(\d{2}\/\d{2}\/\d{2})", content).groups()[1]
			
 
				+    except:
			
 
				+        return "None"
			
--- a/complaints/pdf_parser.py
+++ b/complaints/pdf_parser.py
--- a/complaints/test.py
+++ b/complaints/test.py
@@ -0,0 +1,20 @@
 
				+import re
			
 
				+import tika
			
 
				+from tika import parser
			
 
				+
			
 
				+tika.initVM()
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__":
			
 
				+    data = (
			
 
				+        parser.from_file(
			
 
				+            "/home/ftech/Desktop/harsh_parikh_codes/PDF_Scrapper/Complaint/document.pdf"
			
 
				+        )["content"]
			
 
				+        .strip()
			
 
				+        .replace("\n", "")
			
 
				+    )
			
 
				+    print(data)
			
 
				+    # pattern = r"OF\s\w+(\s.*?\,).*?Plaintiff"
			
 
				+    # print(re.search(pattern, data).groups()[0])
			
 
				+    # pattern = r"Plaintiff[s]?.*v[s]?\.(.*)Defendant\."
			
 
				+    # re.search(pattern, data).group()
			
--- a/docker/docker-compose.yml
+++ b/docker/docker-compose.yml
@@ -0,0 +1,24 @@
 
				+version: "3.8"
			
 
				+services:
			
 
				+
			
 
				+  ## Apache Tika Server 
			
 
				+  tika:
			
 
				+    image: apache/tika:${TAG}-full
			
 
				+    # Override default so we can add configuration on classpath
			
 
				+    entrypoint:
			
 
				+      [
			
 
				+        "/bin/sh",
			
 
				+        "-c",
			
 
				+        "exec java -cp /customocr:/${TIKA_JAR}-${TAG}.jar org.apache.tika.server.TikaServerCli -h 0.0.0.0 $$0 $$@"
			
 
				+      ]
			
 
				+    # Kept command as example but could be added to entrypoint too
			
 
				+    command: -c /tika-config.xml
			
 
				+    restart: on-failure
			
 
				+    ports:
			
 
				+      - "9998:9998"
			
 
				+    volumes:
			
 
				+      # Choose the configuration you want, or add your own custom one
			
 
				+      # -  ./sample-configs/customocr/tika-config-inline.xml:/tika-config.xml
			
 
				+      - ./sample-configs/customocr/tika-config-rendered.xml:/tika-config.xml
			
 
				+
			
 
				+# source: https://github.com/apache/tika-docker/blob/master/docker-compose-tika-customocr.yml