diff --git a/requirements.txt b/requirements.txt
index 02626cd..4d79602 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
--e git+https://github.com/Wazzabeee/slate3k#egg=slate3k
 beautifulsoup4==4.10.0
 nltk==3.6.6
 odfpy==1.4.1
-pdfplumber==0.5.28
 tabulate==0.8.9
-tqdm==4.66.3
\ No newline at end of file
+tqdm==4.66.3
+pdfminer.six==20200517
\ No newline at end of file
diff --git a/scripts/processing_files.py b/scripts/processing_files.py
index 455841d..4db6f31 100644
--- a/scripts/processing_files.py
+++ b/scripts/processing_files.py
@@ -4,10 +4,9 @@ import zipfile
 
 from os import path
 
-import pdfplumber
-import slate3k as slate
 from odf import text, teletype
 from odf.opendocument import load
+from pdfminer.high_level import extract_text
 
 
 def get_file_extension(filepath: str) -> str:
@@ -39,38 +38,19 @@ def file_extension_call(file: str) -> list:
 
 
 def get_words_from_pdf_file(pdf_path: str) -> list:
-    """Return list of words from pdf file at specified path"""
+    """Return list of words from pdf file at specified path using pdfminer.six."""
 
-    with open(pdf_path, "rb") as file:
-        extracted_text = slate.PDF(file)
+    # Extract text from the PDF file using pdfminer
+    extracted_text = extract_text(pdf_path)
 
-    nested_lists_length_sum = sum(len(temp) for temp in extracted_text)
-    count_line_return = sum(string.count("\n") for string in extracted_text)
+    # Clean up the extracted text
+    cleaned_text = re.sub(r"\s+", " ", extracted_text)
+    cleaned_text = re.sub(r"<(.|\n)*?>", "", cleaned_text)
 
-    # Check \n ratio compared to length of text
-    if nested_lists_length_sum / count_line_return > 10:
-        for i, _ in enumerate(extracted_text):
-            extracted_text[i] = extracted_text[i].replace("\n", " ")
-            extracted_text[i] = re.sub("<(.|\n)*?>", "", str(extracted_text[i]))
-            extracted_text[i] = re.findall(r"\w+", extracted_text[i].lower())
+    # Extract words from the cleaned text
+    words = re.findall(r"\w+", cleaned_text.lower())
 
-        return [item for sublist in extracted_text for item in sublist]
-
-    # Pdf format is not readable by Slate library
-    return get_words_from_special_pdf(pdf_path)
-
-
-def get_words_from_special_pdf(pdf_path: str) -> list:
-    """Return list of words from a PDF file when the Slate library can't scrape it"""
-
-    with pdfplumber.open(pdf_path) as file:
-        concat_string = ""
-        for page in file.pages:
-            text_page = page.extract_text() + "\n"
-            concat_string += text_page
-
-    # Split the string into words and return as a list
-    return concat_string.replace("\xa0", " ").strip().split()
+    return words
 
 
 def get_words_from_txt_file(txt_path: str) -> list:
diff --git a/setup.py b/setup.py
index a5744ec..ba359f7 100644
--- a/setup.py
+++ b/setup.py
@@ -14,10 +14,9 @@ def get_version():
         "beautifulsoup4==4.10.0",
         "nltk==3.6.6",
         "odfpy==1.4.1",
-        "pdfplumber==0.5.28",
         "tabulate==0.8.9",
         "tqdm==4.66.3",
-        "slate3k @ git+https://github.com/Wazzabeee/slate3k#egg=slate3k",
+        "pdfminer.six==20200517",
     ],
     extras_require={
         "lint": ["pylint==3.0.2", "mypy==1.7.1", "flake8==6.1.0", "black==24.3.0", "types-tabulate"],