harsh
/
pdf_parser


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
							import re


def date_extraction(content: str) -> str:
    """
    Extracts the filing date from the document.

    Four layouts are recognised, tried in this order at each position:
    ``FILING DATE ... m/d/yy(yy)``, ``Date: Month d, yyyy``,
    ``Entered: ... d, yyyy`` and ``Filed ... m/d/yy(yy)``.

    Args:
        content: Text of the document to search.

    Returns:
        The first matching span, including its leading label (for example
        ``"Date: March 3, 2020"``), or the string ``"None"`` when nothing
        matches or ``content`` is not a string.
    """
    # The year of the "Filed" alternative accepts up to four digits; with
    # only two, "Filed 12/01/2020" was cut short to "Filed 12/01/20".
    pattern = (
        r"((FILING\sDATE.*?)\d{1,2}\/\d{1,2}\/\d{2,4}"
        r"|(Date\:\s)\w{1,9}\s\d{1,2}\,\s\d{4}"
        r"|(Entered\:\s.*?)\w.*?\d{1,2}\,\s\d{4}"
        r"|(Filed.*?)\d{1,2}\/\d{1,2}\/\d{1,4})"
    )
    # Non-text input is reported as "nothing found" rather than raising.
    if not isinstance(content, str):
        return "None"
    match = re.search(pattern, content)
    if match is None:
        return "None"
    # Group 1 wraps the whole alternation, so it equals the full match.
    return match.group(1)


def extract_email(content: str) -> str:
    """
    Extracts the first e-mail address from the document.

    The match is case-sensitive: the part before ``@`` is taken as a run of
    lowercase letters and digits only, and the domain must start with a
    lowercase letter.

    Args:
        content: Text of the document to search.

    Returns:
        The first matching address, or the string ``"None"`` when nothing
        matches or ``content`` is not a string.
    """
    pattern = r"[a-z0-9]+@+[a-z].*?\.\w+\.?\w+"
    # Non-text input is reported as "nothing found" rather than raising.
    if not isinstance(content, str):
        return "None"
    match = re.search(pattern, content)
    return "None" if match is None else match.group()


def address_extraction(content: str) -> str:
    """
    Extracts a three-line postal address from the document.

    The pattern looks for three consecutive lines that each start with a
    word character, where the third line ends in a 2-4 character word
    followed by whitespace and five digits (e.g. ``IL 62704``).

    Args:
        content: Text of the document to search.

    Returns:
        The first matching three-line block, newlines included, or the
        string ``"None"`` when nothing matches or ``content`` is not a
        string.
    """
    regex_address = r"\w+.*\n\w+.*\n\w+.*\w{2,4}\s\d{5}"
    # Non-text input is reported as "nothing found" rather than raising.
    if not isinstance(content, str):
        return "None"
    match = re.search(regex_address, content)
    return "None" if match is None else match.group(0)


def refer_exteraction(content: str) -> str:
    """
    Extracts a referral from the document.

    Looks for a phrase of the form ``<word> refer(s) to <something> as``.

    Args:
        content: Text of the document to search.

    Returns:
        The word immediately before "refer(s) to" in the first such phrase,
        or the string ``"None"`` when nothing matches or ``content`` is not
        a string.
    """
    # "refers?" accepts both "refer" and "refers"; the earlier "refer?s"
    # put the optional marker on the wrong letter.
    regex = r"(\w+)\srefers?\sto(.*?)as\s"
    # Other referral phrasings not handled yet:
    # 1. by reference
    # 2. In re
    # 3. in quotes ""
    if not isinstance(content, str):
        return "None"
    match = re.search(regex, content)
    if match is None:
        return "None"
    # Return the capture instead of printing it, so callers receive a value.
    return match.group(1)


def case_number_extraction(content: str) -> list:
    """
    Extracts the case numbers from the document.

    A case number is text shaped like ``1:20-cv-01234-ABC`` that follows the
    label ``Case No.`` (label matched case-insensitively).

    Args:
        content: Text of the document to search.

    Returns:
        A list of the distinct case-number strings, in order of first
        appearance. The list is empty when none are found.
    """
    regex = r"Case\sNo\.\s(\d\:\d{2}\-\w{2}\-\d{5}\-\w{3})"
    case_number = re.compile(regex, re.IGNORECASE)
    # dict keys de-duplicate while keeping insertion order, so the result is
    # stable between runs (a set would yield an arbitrary order).
    unique = dict.fromkeys(
        current.group(1) for current in case_number.finditer(content)
    )
    return list(unique)


def expert_name_extraction(content: str) -> str:
    """
    Extracts the name of the expert from the document.

    The name is taken from a heading of the form ``REPORT OF <name>`` or
    ``DECLARATION OF <name>`` (uppercase keywords; the match is
    case-sensitive).

    Args:
        content: Text of the document to search.

    Returns:
        The text captured after ``OF``, which still carries the whitespace
        character that separated it from ``OF``, or the string ``"None"``
        when nothing matches or ``content`` is not a string.
    """
    regex = r"(REPORT|DECLARATION)\sOF(\s(DR.)?\s?\w+\s(.*?\.)?\s?\w+)"
    # Non-text input is reported as "nothing found" rather than raising.
    if not isinstance(content, str):
        return "None"
    match = re.search(regex, content)
    return "None" if match is None else match.group(2)


def plaintiff_extraction(content: str) -> str:
    """
    Extracts the plaintiff from the document.

    The plaintiff is taken to be the line that sits directly above (allowing
    for one blank line) a line starting with ``Plaintiff``, ``Plaintiffs``
    or ``Petitioner``.

    Args:
        content: Text of the document to search.

    Returns:
        That line, from its first word character to the end of the line, or
        the string ``"None"`` when nothing matches or ``content`` is not a
        string.
    """
    regex = r"(\w.*)\n\s?\n?\s?(Plaintiffs?|Petitioner)"
    # Non-text input is reported as "nothing found" rather than raising.
    if not isinstance(content, str):
        return "None"
    match = re.search(regex, content)
    return "None" if match is None else match.group(1)


def defendent_extraction(content: str) -> str:
    """
    Extracts the defendant from the document.

    The defendant is taken to be the line that sits directly above (allowing
    for one blank line) a line starting with ``Defendant``, ``Defendants``,
    ``Patent Owner`` or ``Patent Owners``.

    Args:
        content: Text of the document to search.

    Returns:
        That line, from its first word character to the end of the line, or
        the string ``"None"`` when nothing matches or ``content`` is not a
        string.
    """
    # "Defendants?" also accepts the singular form, mirroring how
    # plaintiff_extraction accepts both "Plaintiff" and "Plaintiffs".
    regex = r"(\w.*?)\n\s?\n?\s?\s?(Defendants?|Patent\sOwners?)"
    # Non-text input is reported as "nothing found" rather than raising.
    if not isinstance(content, str):
        return "None"
    match = re.search(regex, content)
    return "None" if match is None else match.group(1)


def patent_extraction(content: str) -> list:
    """
    Extracts patent numbers from the document.

    A patent number is recognised as comma-grouped digits such as
    ``7,123,456`` or ``10,234,567``.

    Args:
        content: Text of the document to search.

    Returns:
        A list of the distinct numbers with the commas removed (e.g.
        ``"7123456"``), in order of first appearance. The list is empty when
        none are found.
    """
    regex = r"\d{1,3}\,\d{1,3}\,\d{3}\,?"
    # dict keys de-duplicate while keeping insertion order, so the result is
    # stable between runs (a set would yield an arbitrary order).
    unique = dict.fromkeys(
        current.group().replace(",", "")
        for current in re.finditer(regex, content)
    )
    return list(unique)


def law_firm_extraction(content: str) -> list:
    """
    Extracts law firm names from the document.

    NOTE: the search pattern has not been written yet (``regex`` is empty),
    so this currently always returns an empty list. Once filled in, the
    pattern must contain one capture group holding the firm name.

    Args:
        content: Text of the document to search.

    Returns:
        A list of the captured names with surrounding whitespace stripped,
        in order of appearance.
    """
    regex = r""
    # An empty pattern matches at every position and has no capture group,
    # so reading group 1 would raise IndexError on every call; report
    # "nothing found" until a real pattern is supplied.
    if not regex:
        return []
    firm = re.compile(regex, re.IGNORECASE)
    return [current.group(1).strip() for current in firm.finditer(content)]


def on_behalf_of_extraction(content: str) -> "re.Match | None":
    """
    Finds the party a document was filed on behalf of.

    Searches for ``on behalf of <text> Case`` (or ``case``) on a single line.

    Args:
        content: Text of the document to search.

    Returns:
        The match object of the first occurrence, whose group 1 is the text
        between "on behalf of" and "Case"/"case", or ``None`` when there is
        no occurrence. Unlike the other extractors in this module, the raw
        match is returned rather than a string.
    """
    # Raw string: "\s" inside a normal string literal is an invalid escape
    # sequence and is flagged by recent Python versions.
    regex = r"on\sbehalf\sof(.*?)(C|c)ase"
    return re.search(regex, content)


def hourly_compensation(content: str) -> str:
    """
    Returns the hourly compensation of the expert.

    The first dollar amount of three or four digits is used, e.g. ``$450``
    or ``$ 1200``.

    Args:
        content: Text of the document to search.

    Returns:
        The matched amount including the dollar sign (and the optional
        whitespace character after it), or the string ``"None"`` when
        nothing matches or ``content`` is not a string.
    """
    # Raw string: "\$", "\s" and "\d" inside a normal string literal are
    # invalid escape sequences and are flagged by recent Python versions.
    regex = r"\$\s?\d{3,4}"
    # Non-text input is reported as "nothing found" rather than raising.
    if not isinstance(content, str):
        return "None"
    match = re.search(regex, content)
    return "None" if match is None else match.group(0)


def ref_patents(content: str) -> str:
    """
    Placeholder with no implementation yet.

    The argument is ignored and ``None`` is always returned.
    """
    return None


def acronym_extraction(content: str) -> list:
    """
    Extracts parenthesised acronyms from the document.

    An acronym is a run of uppercase ASCII letters enclosed in parentheses,
    such as ``(PTAB)``.

    Args:
        content: Text of the document to search.

    Returns:
        A list of the distinct acronyms, parentheses included, in order of
        first appearance. The list is empty when none are found.
    """
    regex = r"\([A-Z]+\)"
    # dict keys de-duplicate while keeping insertion order, so the result is
    # stable between runs (a set would yield an arbitrary order).
    return list(dict.fromkeys(re.findall(regex, content)))