Skip to content

Commit

Permalink
feat: add tqdm and custom errors (#13)
Browse files Browse the repository at this point in the history
  • Loading branch information
Wazzabeee authored May 4, 2024
1 parent 16f8f52 commit f11d2fc
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 62 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ nltk==3.6.6
odfpy==1.4.1
pdfplumber==0.5.28
tabulate==0.8.9
tqdm==4.66.2
146 changes: 84 additions & 62 deletions scripts/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,38 @@
It can also use Jaccard Similarity, words counting, overlapping words for similarity
"""
import sys
import webbrowser
from datetime import datetime
from os import listdir, path
from typing import List

from tqdm import tqdm

from scripts.html_writing import add_links_to_html_table, results_to_html, papers_comparison
from scripts.html_utils import writing_results
from scripts.processing_files import file_extension_call
from scripts.similarity import difflib_overlap
from scripts.utils import wait_for_file, get_student_names, parse_options


class MinimumFilesError(Exception):
    """Raised when there are fewer than two files for comparison.

    A comparison needs at least two inputs; this error signals that the
    input directory did not provide enough candidate entries.
    """


class UnsupportedFileError(Exception):
    """Raised when there are unsupported files in the input directory.

    Signals that at least one file could not be processed into words, so
    the run is aborted instead of comparing a partial set of documents.
    """


class PathNotFoundError(Exception):
    """Raised when the specified input directory path does not exist.

    The offending path is expected to be included in the exception message
    by the caller that raises it.
    """


def main() -> None:
"""
Main function to process and compare text files.
Expand All @@ -36,69 +55,72 @@ def main() -> None:
args = parse_options()
in_dir, out_dir, block_size = args.in_dir, args.out_dir, args.block_size

if path.exists(in_dir): # Check if specified path exists
if not path.isabs(in_dir):
in_dir = path.abspath(in_dir)
if len(listdir(in_dir)) > 1: # Check if there are at least 2 files at specified path
filenames, processed_files = [], []
students_names = get_student_names(in_dir)
for ind, direc in enumerate(listdir(in_dir)):
if path.isdir(path.join(in_dir, direc)):
for file in listdir(path.join(in_dir, direc)):
file_words = file_extension_call(str(path.join(in_dir, direc, file)))

if file_words: # If all files have supported format
processed_files.append(file_words)
filenames.append(students_names[ind])
else: # At least one file was not supported
print("Remove files which are not txt, pdf, docx or odt and run the script again.")
sys.exit()
if out_dir is not None and path.exists(out_dir):
if not path.isabs(out_dir):
out_dir = path.abspath(out_dir)
results_directory = out_dir
else:
# Create new directory for storing html files
results_directory = writing_results(datetime.now().strftime("%Y%m%d_%H%M%S"))

difflib_scores: List[List[float]] = [[] for _ in range(len(processed_files))]
file_ind = 0

for i, text in enumerate(processed_files):
for j, text_bis in enumerate(processed_files):
if i != j:
# Append to the list the similarity score between text and text_bis
difflib_scores[i].append(difflib_overlap(text, text_bis))

# Write text with matching blocks colored in results directory
papers_comparison(
results_directory,
file_ind,
text,
text_bis,
(filenames[i], filenames[j]),
block_size,
)
file_ind += 1
else:
difflib_scores[i].append(-1)

results_directory = path.join(results_directory, "_results.html")
print(results_directory)

results_to_html(difflib_scores, filenames, results_directory)

if wait_for_file(results_directory, 60): # Wait for file to be created
add_links_to_html_table(results_directory)
webbrowser.open(results_directory) # Open results HTML table
if not path.exists(in_dir):
raise PathNotFoundError(f"The specified path does not exist: {in_dir}")

if not path.isabs(in_dir):
in_dir = path.abspath(in_dir)

files = [
f for f in listdir(in_dir) if path.isdir(path.join(in_dir, f)) or f.endswith(("txt", "pdf", "docx", "odt"))
]

if len(files) < 2:
raise MinimumFilesError(
"Minimum number of files is not present. Please check that there are at least two files to compare."
)

filenames, processed_files = [], []
students_names = get_student_names(in_dir)

for ind, direc in enumerate(tqdm(listdir(in_dir), desc="Processing Directories")):
if path.isdir(path.join(in_dir, direc)):
for file in listdir(path.join(in_dir, direc)):
file_words = file_extension_call(str(path.join(in_dir, direc, file)))
if file_words: # If all files have supported format
processed_files.append(file_words)
filenames.append(students_names[ind])
else:
raise UnsupportedFileError(
"Remove files which are not txt, pdf, docx, or odt and run the script again."
)

if out_dir is not None and path.exists(out_dir):
if not path.isabs(out_dir):
out_dir = path.abspath(out_dir)
results_directory = out_dir
else:
results_directory = writing_results(datetime.now().strftime("%Y%m%d_%H%M%S"))

difflib_scores: List[List[float]] = [[] for _ in range(len(processed_files))]
file_ind = 0

for i, text in enumerate(tqdm(processed_files, desc="Comparing Files")):
for j, text_bis in enumerate(processed_files):
if i != j:
difflib_scores[i].append(difflib_overlap(text, text_bis))
papers_comparison(
results_directory,
file_ind,
text,
text_bis,
(filenames[i], filenames[j]),
block_size,
)
file_ind += 1
else:
print("Results file was not created...")
else:
print("Minimum number of files is not present. Please check that there are at least two files to compare.")
sys.exit()
difflib_scores[i].append(-1)

results_directory = path.join(results_directory, "_results.html")
print(f"Results saved at: {results_directory}")

results_to_html(difflib_scores, filenames, results_directory)

if wait_for_file(results_directory, 60): # Wait for file to be created
add_links_to_html_table(results_directory)
webbrowser.open(results_directory) # Open results HTML table
else:
print("The specified path does not exist : " + in_dir)
sys.exit()
raise RuntimeError("Results file was not created...")


if __name__ == "__main__":
Expand Down

0 comments on commit f11d2fc

Please sign in to comment.