ソースを参照

ref #1: made expert parsing files

Omkar Desai 2 年 前
コミット
ba03a089ab
3 ファイル変更、61 行追加、0 行削除
  1. 3 0
      .gitignore
  2. 30 0
      expert/parse_expert_pdf.py
  3. 28 0
      expert/parse_expert_pdf_utils.py

+ 3 - 0
.gitignore

@@ -60,3 +60,6 @@ target/
 # Ignoring all pdfs
 *.pdf
 *.xlsx
+
+#pdf
+pdfs/

+ 30 - 0
expert/parse_expert_pdf.py

@@ -0,0 +1,30 @@
+from parse_expert_pdf_utils import (
+    case_number_extraction,
+    expert_name_extraction,
+)
+import tika
+from tika import parser
+import re
+import warnings
+warnings.filterwarnings('ignore')
+
+
# Sample report the script was originally hard-wired to; kept as the default
# so calling main() with no arguments behaves as before.
DEFAULT_PDF_PATH = "/home/omkardesai/Code/pdf_parser/pdfs/expert_parse_pdf.pdf"


def main(path=DEFAULT_PDF_PATH):
    """Parse an expert-report PDF with Tika and print the extracted fields.

    Prints, in order: the keys of the parsed result, the extracted case
    number, and the extracted expert names.

    Args:
        path: Location of the PDF to parse. Defaults to DEFAULT_PDF_PATH.
    """
    tika.initVM()
    parsed_pdf = parser.from_file(path)
    print(parsed_pdf.keys())

    # tika-python can report 'content' as None (e.g. when no text could be
    # extracted); fall back to an empty string so the regex helpers do not
    # fail with TypeError.
    content = parsed_pdf.get('content') or ""

    print(case_number_extraction(content))
    print(expert_name_extraction(content))


if __name__ == "__main__":
    main()

+ 28 - 0
expert/parse_expert_pdf_utils.py

@@ -0,0 +1,28 @@
+from collections import defaultdict
+import re
+from distutils.filelist import findall
+
+
def case_number_extraction(content):
    """Return the case number that appears most often in *content*.

    A case number is the text captured after "Case No. " in the form
    ``D:DD-WW-DDDDD-WWW`` (for example ``1:20-cv-12345-ABC``).

    Args:
        content: Text to search. ``None`` or an empty string is treated
            as "nothing to search".

    Returns:
        The most frequently occurring case number. When several are tied,
        the one that appears first in the text wins. Returns ``""`` when
        no case number is found.
    """
    if not content:
        return ""

    # Raw string: the pattern is unchanged, but "\s", "\d", "\:" and "\-" are
    # no longer invalid string escapes that trigger a warning.
    matches = re.findall(
        r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})", content
    )
    if not matches:
        return ""

    occurrences = defaultdict(int)
    for case_number in matches:
        occurrences[case_number] += 1

    # Use the tally: dicts keep insertion order and max() returns the first
    # maximum it sees, so ties resolve to the earliest case number in the text.
    return max(occurrences, key=occurrences.get)
+
+
def expert_name_extraction(content):
    """Return the distinct expert names found in "EXPERT REPORT OF <name>," headings.

    The heading is matched case-insensitively; the name is the text between
    "EXPERT REPORT OF " and the first following comma on the same line.

    Args:
        content: Text to search. ``None`` or an empty string is treated
            as "nothing to search".

    Returns:
        A list of unique names in order of first appearance. Empty when no
        heading is found.
    """
    if not content:
        return []

    heading = re.compile(r"\bEXPERT\sREPORT\sOF\s(.+?),", re.IGNORECASE)

    # group(1) is the captured name only; the whole match would also carry the
    # "EXPERT REPORT OF " prefix and the trailing comma.
    names = (match.group(1).strip() for match in heading.finditer(content))

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is stable between runs (a set has no guaranteed order).
    return list(dict.fromkeys(name for name in names if name))
+
+
def plaintiff_extraction(content):
    """Placeholder for plaintiff extraction; not implemented yet.

    Args:
        content: Text that a future implementation would search.

    Returns:
        Always ``None``; no plaintiff pattern has been defined.
    """
    # TODO: define the plaintiff regex and return its matches. The previous
    # placeholder ran re.findall with an empty pattern, which only built a
    # discarded list of empty strings, so that call has been dropped.
    return None