added the initial codebase for parsing documents

Harsh Parikh 2 years ago
parent
commit
2352bd3c3c
7 changed files with 229 additions and 1 deletion
  1. .gitignore (+1 -0)
  2. README.md (+35 -1)
  3. complaints/parse_pdf.py (+46 -0)
  4. complaints/parse_pdf_utils.py (+103 -0)
  5. complaints/pdf_parser.py (+0 -0)
  6. complaints/test.py (+20 -0)
  7. docker/docker-compose.yml (+24 -0)

+ 1 - 0
.gitignore

@@ -59,3 +59,4 @@ docs/_build/
 target/
 # Ignoring all pdfs
 *.pdf
+*.xlsx

+ 35 - 1
README.md

@@ -1,3 +1,37 @@
 # pdf_parser
+## All the code related to PDF parsing
 
-All codes related to pdf parsing.
+### The following elements are to be parsed from documents.
+1. Documents
+    1. Extracting dates from documents
+    1. Classification Tags
+    1. Extracting Key Entities from documents
+        1. Patents
+        1. References
+        1. Entities
+            1. Names
+            1. Addresses
+            1. Law Firms
+            1. Contact Numbers
+            1. Emails
+    1. Association with Cases
+
+### Setting up the codebase.
+1. Launch the terminal.
+1. Enter the following command to go to the base directory:
+   ```bash
+   cd ~
+   ```
+1. Make a new directory `Code` and move into it by using the following command:
+   ```bash
+   mkdir Code && cd Code
+   ```
+1. Clone the repository by entering the following command:
+   ```bash
+   git clone gogs@git.fafadiatech.com:harsh/pdf_parser.git
+   ```
+
+### TODO LIST:
+1. Implement OCR on Tika.
+1. Dockerise Apache Tika with OCR.
+1. Test the regexes on scanned PDFs (a test sketch follows this diff).
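
For the third TODO item, a minimal pytest sketch (a hypothetical `complaints/test_parse_pdf_utils.py`; the sample strings are invented) could exercise the regex helpers added in `complaints/parse_pdf_utils.py` below:

```python
# Hypothetical complaints/test_parse_pdf_utils.py; the sample strings are
# invented and only cover clean (non-scanned) text.
from parse_pdf_utils import extract_filing_date, extract_patent_number


def test_extract_filing_date():
    assert extract_filing_date("Document 1 Filed 03/15/21 Page 1 of 9") == "03/15/21"


def test_extract_patent_number_strips_commas():
    assert extract_patent_number("asserts U.S. Patent No. 7,654,321") == ["7654321"]
```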

+ 46 - 0
complaints/parse_pdf.py

@@ -0,0 +1,46 @@
+from collections import defaultdict
+from IPython.display import display
+import os
+import tika
+import pandas as pd
+
+tika.initVM()
+from tika import parser
+from parse_pdf_utils import (
+    extract_acronyms,
+    extract_case_number,
+    extract_defendent,
+    extract_filing_date,
+    extract_firm_name,
+    extract_patent_number,
+    extract_plaintiff,
+)
+
+
+def extract_all(PATH):
+    """
+    Returns the required data extracted from the PDFs in PATH as a pandas DataFrame.
+    """
+    data_dict = defaultdict(list)
+    required_files = [file for file in os.listdir(PATH) if file.endswith(".pdf")]
+    for idx, file in enumerate(required_files):
+        print(idx, file)
+        # os.listdir returns bare filenames, so join with PATH before parsing
+        parse_file = parser.from_file(os.path.join(PATH, file))["content"].strip().replace("\n", "")
+        # data_dict["case_number_list"].append(extract_case_number(content=parse_file))
+        # data_dict["filing_date_list"].append(extract_filing_date(content=parse_file))
+        data_dict["plaintiff_list"].append(extract_plaintiff(content=parse_file))
+        # data_dict["defendent_list"].append(extract_defendent(content=parse_file))
+        # data_dict["acronyms_list"].append(extract_acronyms(content=parse_file))
+        # data_dict["firm_name_list"].append(extract_firm_name(content=parse_file))
+        # data_dict["patent_list"].append(extract_patent_number(content=parse_file))
+
+    data = pd.DataFrame(data_dict)
+    # data.to_csv("required_data.csv", index=False)
+    return data
+
+
+if __name__ == "__main__":
+    BASE_DIR = "Code/pdf_parser/complaints"
+    HOME_DIR = os.path.expanduser("~")
+    PATH = os.path.join(HOME_DIR, BASE_DIR)
+    display(extract_all(PATH))
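
The commented-out fields above could be toggled through a single mapping instead of per-line comments; a sketch of that refactor (the `EXTRACTORS` name is hypothetical, everything else comes from the imports already in this file):

```python
# Hypothetical refactor: map DataFrame columns to extractor callables so a
# field is enabled or disabled in one place.
EXTRACTORS = {
    "plaintiff_list": extract_plaintiff,
    # "case_number_list": extract_case_number,
    # "filing_date_list": extract_filing_date,
}

for idx, file in enumerate(required_files):
    content = parser.from_file(os.path.join(PATH, file))["content"].strip().replace("\n", "")
    for column, extract in EXTRACTORS.items():
        data_dict[column].append(extract(content=content))
```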

+ 103 - 0
complaints/parse_pdf_utils.py

@@ -0,0 +1,103 @@
+import re
+
+
+def extract_patent_number(content):
+    """
+    Returns the list of unique patent_numbers in the document
+    """
+    regex = r"\d{1,3}\,\d{1,3}\,\d{3}"
+    results = []
+    patent = re.compile(regex, re.IGNORECASE)
+    for current in patent.finditer(content):
+        results.append(current.group().replace(",", ""))
+    return list(set(results))
+
+
+def extract_case_number(content):
+    """
+    Returns the case number if one is found, else "None"
+    """
+    regex = r"(\d{1,})\:(\d{1,}\-\w{1,}\-\d{5,}\-\w+)+"
+    try:
+        return re.search(regex, content).groups()[1]
+    except AttributeError:
+        return "None"
+
+
+def extract_hourly_compensation(content):
+    """
+    Returns hourly compensation.
+    """
+    results = []
+    hourly_comp_re = re.compile(r"\$\d{1,20}", re.IGNORECASE)
+    for current in hourly_comp_re.finditer(content):
+        results.append(current.group().replace(",", ""))
+    return list(set(results))
+
+
+def extract_expert_name(content):
+    """
+    Returns the name of the expert
+    """
+    results = []
+    exp_name = re.compile(r"\b(REPORT OF ).*\S[.]")
+    for current in exp_name.finditer(content):
+        results.append(current.group().replace(",", ""))
+    return list(set(results))
+
+
+def extract_plaintiff(content):
+    """
+    Returns the name of the plaintiff
+    previous = (OF\s\w{1,})(.*)Plaintiff[s]?,
+    """
+    regex = r"OF\s\w+(\s.*?\,).*?Plaintiff"
+    try:
+        return re.search(regex, content).groups()[0]
+    except AttributeError:
+        return "None"
+
+
+def extract_defendent(content):
+    """
+    Returns the name of the defendant
+    """
+    regex = r"Plaintiff[s]?.*v[s]?\.(.*?)Defendant[s]?\."
+    try:
+        return re.search(regex, content).groups()[0]
+    except AttributeError:
+        return "None"
+
+
+def extract_acronyms(content):
+    """
+    Returns the list of all the acronyms present
+    """
+    regex = r"\(“(\w{3})”\)"
+    results = []
+    plaintiff = re.compile(regex, re.IGNORECASE)
+    for current in plaintiff.finditer(content):
+        results.append(current.group().replace(",", ""))
+    return list(set(results))
+
+
+def extract_firm_name(content):
+    """
+    Returns the list of firm names present in the documents.
+    """
+    regex = r"(\(Firm\sName\,\sAddress\,\sand\sTelephone\sNumber\))([\r\n]+([^\r\n]+))"
+    results = []
+    firm = re.compile(regex, re.IGNORECASE)
+    for current in firm.finditer(content):
+        results.append(current.group().replace(",", ""))
+    return list(set(results))
+
+
+def extract_filing_date(content):
+    """
+    Returns the filing date.
+    """
+    try:
+        return re.search(r"(Filed)\s(\d{2}\/\d{2}\/\d{2})", content).groups()[1]
+    except AttributeError:
+        return "None"
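
A quick sanity check of the search-based helpers against a synthetic caption line (the sample text is invented; real complaints flatten to similar one-line text after the `.strip().replace("\n", "")` step in `parse_pdf.py`):

```python
# Invented sample caption for a smoke check of the regex helpers.
from parse_pdf_utils import extract_case_number, extract_defendent, extract_plaintiff

sample = (
    "DISTRICT OF DELAWARE ACME CORP., Plaintiff, "
    "v. WIDGET LLC, Defendant. Case 2:21-cv-00123-JRG"
)

print(extract_case_number(sample))      # 21-cv-00123-JRG
print(repr(extract_plaintiff(sample)))  # ' ACME CORP.,'
print(repr(extract_defendent(sample)))  # ' WIDGET LLC, '
```

Note that `extract_plaintiff` and `extract_defendent` keep the surrounding whitespace and punctuation of their capture groups; callers may want to `.strip(" ,")` the results.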

+ 0 - 0
complaints/pdf_parser.py


+ 20 - 0
complaints/test.py

@@ -0,0 +1,20 @@
+import re
+import tika
+from tika import parser
+
+tika.initVM()
+
+
+if __name__ == "__main__":
+    data = (
+        parser.from_file(
+            "/home/ftech/Desktop/harsh_parikh_codes/PDF_Scrapper/Complaint/document.pdf"
+        )["content"]
+        .strip()
+        .replace("\n", "")
+    )
+    print(data)
+    # pattern = r"OF\s\w+(\s.*?\,).*?Plaintiff"
+    # print(re.search(pattern, data).groups()[0])
+    # pattern = r"Plaintiff[s]?.*v[s]?\.(.*)Defendant\."
+    # re.search(pattern, data).group()

+ 24 - 0
docker/docker-compose.yml

@@ -0,0 +1,24 @@
+version: "3.8"
+services:
+
+  ## Apache Tika Server 
+  tika:
+    image: apache/tika:${TAG}-full
+    # Override default so we can add configuration on classpath
+    entrypoint:
+      [
+        "/bin/sh",
+        "-c",
+        "exec java -cp /customocr:/${TIKA_JAR}-${TAG}.jar org.apache.tika.server.TikaServerCli -h 0.0.0.0 $$0 $$@"
+      ]
+    # Kept command as example but could be added to entrypoint too
+    command: -c /tika-config.xml
+    restart: on-failure
+    ports:
+      - "9998:9998"
+    volumes:
+      # Choose the configuration you want, or add your own custom one
+      # -  ./sample-configs/customocr/tika-config-inline.xml:/tika-config.xml
+      - ./sample-configs/customocr/tika-config-rendered.xml:/tika-config.xml
+
+# source: https://github.com/apache/tika-docker/blob/master/docker-compose-tika-customocr.yml
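
Once the stack is up (`docker-compose up` from the `docker/` directory, with `TAG` and `TIKA_JAR` set), the server can be smoke-tested over HTTP; a minimal sketch, assuming a local `sample.pdf`:

```python
# Send a PDF to the dockerised Tika server and print the extracted text.
# Assumes the compose stack above is running on localhost:9998 and that
# sample.pdf exists in the working directory.
import requests

with open("sample.pdf", "rb") as f:
    resp = requests.put(
        "http://localhost:9998/tika",      # Tika server extraction endpoint
        data=f,
        headers={"Accept": "text/plain"},  # request plain-text output
    )
resp.raise_for_status()
print(resp.text[:500])  # first 500 characters of extracted text
```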