|
@@ -1,3 +1,6 @@
|
|
|
+""" This module contains the functions to parse elements from the expert PDFs.
|
|
|
+"""
|
|
|
+
|
|
|
import re
|
|
|
|
|
|
|
|
@@ -52,7 +55,7 @@ def case_number_extraction(content: str) -> str:
|
|
|
"""
|
|
|
Extracts the case number from the documents.
|
|
|
"""
|
|
|
- regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})" # Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})
|
|
|
+ regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})"
|
|
|
results = set()
|
|
|
case_number = re.compile(regex, re.IGNORECASE)
|
|
|
for current in case_number.finditer(content):
|
|
@@ -133,15 +136,10 @@ def hourly_compensation(content: str) -> str:
|
|
|
return "None"
|
|
|
|
|
|
|
|
|
-def ref_patents(content: str) -> str:
|
|
|
- return
|
|
|
-
|
|
|
-
|
|
|
def acronym_extraction(content: str) -> str:
|
|
|
- regex = r"\([A-Z]+\)"
|
|
|
- # results = []
|
|
|
- # acronym = re.compile(regex, re.IGNORECASE)
|
|
|
- # for current in acronym.finditer(content: str) -> str:
|
|
|
- # results.append(current)
|
|
|
- acronym = re.findall(regex, content)
|
|
|
- return list(set(acronym))
|
|
|
+ regex = r"\(\“([A-Z]{3,4})\”\)"
|
|
|
+ results = set()
|
|
|
+ acronym = re.compile(regex, re.IGNORECASE)
|
|
|
+ for current in acronym.finditer(content):
|
|
|
+ results.add(current.group(1))
|
|
|
+ return list(results)
|