diff --git a/physics_collection/analysis_not_complete.py b/physics_collection/analysis_not_complete.py
new file mode 100644
index 0000000..a0ee80c
--- /dev/null
+++ b/physics_collection/analysis_not_complete.py
@@ -0,0 +1,24 @@
+import os,sys
+import json
+from tqdm.auto import tqdm
+current_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
+print(current_dir)
+sys.path.append(current_dir)
+from batch_running_task.get_data_utils import *
+with open("physics_collection/analysis/not_complete_pdf_page_id.pairlist",'w') as file:
+    with open("physics_collection/analysis/whole_layout_complete.filelist",'r') as f:
+        alllines = f.readlines()
+    for line in tqdm(alllines):
+        line = line.strip().split()
+        jsonl_path = line[0]
+        bulk_status = json.loads(" ".join(line[1:]))
+        # jsonl_path = data['file']
+        # status = data['status']
+        pdf_id_and_page_id_pair = []
+        for track_id,pdf_status,page_status_list in bulk_status:
+            for page_id, status in enumerate(page_status_list):
+                if status in {page_status.none, page_status.layout_complete_and_ocr_only_for_mfd}:
+                    pdf_id_and_page_id_pair.append((track_id, page_id))
+        if len(pdf_id_and_page_id_pair)>0:
+
+            file.write(f"{jsonl_path} {json.dumps(pdf_id_and_page_id_pair)}"+'\n')
diff --git a/physics_collection/check_the_detected_row_is_part_of_one_category.py b/physics_collection/check_the_detected_row_is_part_of_one_category.py
new file mode 100644
index 0000000..4a703af
--- /dev/null
+++ b/physics_collection/check_the_detected_row_is_part_of_one_category.py
@@ -0,0 +1,51 @@
+import os
+import json
+from tqdm.auto import tqdm
+from batch_running_task.get_data_utils import *
+import json
+from tqdm.auto import tqdm
+
+from simple_parsing import ArgumentParser
+from batch_running_task.batch_run_utils import obtain_processed_filelist, process_files,save_analysis, BatchModeConfig
+PageInformationROOT="opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared/page_num_map"
+OriginDATAROOT="opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub"
+
+### form all physics doi set
+with open("physics_collection/must_be_physics.doilist",'r') as f:
+    physics_doilist = f.readlines()
+    physics_doilist = [x.strip() for x in physics_doilist]
+physics_doilist = set(physics_doilist)
+
+client = build_client()
+def process_file(metadata_file, args:BatchModeConfig):
+    physics_collection = []
+    metadata_list = read_json_from_path(metadata_file,client)
+    for metadata in metadata_list:
+        doi = metadata['remark']['original_file_id']
+        if doi in physics_doilist:
+            physics_collection.append(metadata)
+    return physics_collection
+
+def process_one_file_wrapper(args):
+    arxiv_path, args = args
+    return process_file(arxiv_path,args)
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_arguments(BatchModeConfig, dest="config")
+    args = parser.parse_args()
+    args = args.config
+    args.task_name = "scan"
+    alread_processing_file_list = obtain_processed_filelist(args)
+    results = process_files(process_one_file_wrapper, alread_processing_file_list, args)
+    whole_physics_collection = []
+    for result in results:
+        whole_physics_collection.extend(result)
+    fold = os.path.join(args.savepath,f"physics_collection.metadata.split")
+    os.makedirs(fold,exist_ok=True)
+    savepath = os.path.join(fold,f"{args.start_index:07d}-{args.end_index:07d}")
+    with open(savepath,'w') as f:
+        for metadata in whole_physics_collection:
+            f.write(json.dumps(metadata)+'\n')
+
+
\ No newline at end of file
diff --git a/physics_collection/collect_and_upload_to_ceph.py b/physics_collection/collect_and_upload_to_ceph.py
new file mode 100644
index 0000000..6de68ca
--- /dev/null
+++ b/physics_collection/collect_and_upload_to_ceph.py
@@ -0,0 +1,44 @@
+import os
+import json
+## redirect package root
+import sys
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from tqdm.auto import tqdm
+from batch_running_task.get_data_utils import *
+from batch_running_task.batch_run_utils import obtain_processed_filelist, process_files,save_analysis, BatchModeConfig
+from simple_parsing import ArgumentParser
+ROOT="physics_collection/physics_collection.metadata.split"
+#SAVE="physics_collection/physics_collection.metadata.minibatch.split"
+SAVE="opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared/physics_part/result/"
+client = build_client()
+#client = None
+def process_file(filename, args:BatchModeConfig):
+    chunk = 1000
+    metadata_file= os.path.join(ROOT, filename)
+    filename = filename.replace(".jsonl","")
+    metadata = read_json_from_path(metadata_file,client)
+    chunk_num= int(np.ceil(len(metadata)/chunk))
+    for i in tqdm(range(chunk_num),position=1, leave=False):
+        start = i*chunk
+        end = min((i+1)*chunk, len(metadata))
+        save_path = os.path.join(SAVE, f"(unknown).{start:05d}_{end:05d}.jsonl")
+        if os.path.exists(save_path):
+            continue
+
+        minibatch = metadata[start:end]
+
+        write_jsonl_to_path(minibatch, save_path, client)
+
+def process_one_file_wrapper(args):
+    arxiv_path, args = args
+    return process_file(arxiv_path,args)
+
+if __name__ == '__main__':
+    parser = ArgumentParser()
+    parser.add_arguments(BatchModeConfig, dest="config")
+    args = parser.parse_args()
+    args = args.config
+    args.task_name = "scan"
+    alread_processing_file_list = obtain_processed_filelist(args)
+    results = process_files(process_one_file_wrapper, alread_processing_file_list, args)
+
\ No newline at end of file
diff --git a/physics_collection/filte_out_left_namelist.py b/physics_collection/filte_out_left_namelist.py
new file mode 100644
index 0000000..81eefaf
--- /dev/null
+++ b/physics_collection/filte_out_left_namelist.py
@@ -0,0 +1,48 @@
+import os
+from typing import List
+def obtain_pdfpath_from_jsonl(jsonlpath_list:List[str]):
+
+    Already_Done = {}
+
+    empty_filelist=[]
+    if not isinstance(jsonlpath_list, list):
+        jsonlpath_list = [jsonlpath_list]
+    for jsonlpath in jsonlpath_list:
+        with open(jsonlpath, "r") as f:
+            for line in f:
+                line = line.strip().split()
+                if len(line)<4:continue
+                size = int(line[-2])
+                filename = line[-1]
+                if size == 0:
+                    empty_filelist.append(NamePathMap[filename])
+                    continue
+                Already_Done[filename]=NamePathMap[filename]
+    return Already_Done, empty_filelist
+ROOT="finished"
+NamePathMap={}
+with open('physics_collection/physics.files.final.checklist.filelist','r') as f:
+    for line in f:
+        path = line.strip()
+        name = os.path.basename(path)
+        NamePathMap[name] = path
+
+#Already_Done, empty_filelist = obtain_pdfpath_from_jsonl('physics_collection/finished.rec.filelist')
+Already_Done, empty_filelist = obtain_pdfpath_from_jsonl(['physics_collection/physics.files.final.filelist',
+                                                          ])
+
+Already_Done_path = "physics_collection/sci_index_files.finished.filelist"
+with open(Already_Done_path, "w") as f:
+    for name, path in Already_Done.items():
+        path = f"opendata:{path}" if not path.startswith("opendata:") else path
+        f.write(f"{path}\n")
+with open("physics_collection/sci_index_files.redo.filelist","w") as f:
+    for name in empty_filelist:
+        f.write(name+'\n')
+
+Should_do = set(NamePathMap.keys()) - set(Already_Done.keys())
+remain_file_path = "physics_collection/sci_index_files.remain.filelist"
+print(remain_file_path)
+with open(remain_file_path,'w') as f:
+    for name in Should_do:
+        f.write(f"{NamePathMap[name]}\n")
\ No newline at end of file
diff --git a/scihub_collection/analysis_not_complete.py b/scihub_collection/analysis_not_complete.py
new file mode 100644
index 0000000..52c1e06
--- /dev/null
+++ b/scihub_collection/analysis_not_complete.py
@@ -0,0 +1,29 @@
+import os
+import json
+from tqdm.auto import tqdm
+reason_code = {
+    "complete": "P", #<--- layout + mfd + ocr_det
+    "only_have_15": "I", #<--- layout + mfd
+    "only_have_012467": "K", #<--- layout
+    "no012467": "A",
+    "none": "N"
+
+}
+
+
+with open("scihub_collection/analysis/not_complete_pdf_page_id.pairlist",'w') as file:
+    with open("scihub_collection/analysis/not_complete.filelist",'r') as f:
+        alllines = f.readlines()
+    for line in tqdm(alllines):
+        data = json.loads(line.strip())
+        jsonl_path = data['file']
+        status = data['status']
+        pdf_id_and_page_id_pair = []
+        for track_id,page_status in status:
+            # pdf_id: int
+            # page_status: List[int]
+            for page_id, status in enumerate(page_status):
+                if status in {'N'}:
+                    pdf_id_and_page_id_pair.append((track_id, page_id))
+        if len(pdf_id_and_page_id_pair)>0:
+            file.write(f"{jsonl_path} {json.dumps(pdf_id_and_page_id_pair)}"+'\n')
diff --git a/scihub_collection/filte_out_left_namelist.py b/scihub_collection/filte_out_left_namelist.py
new file mode 100644
index 0000000..3820d84
--- /dev/null
+++ b/scihub_collection/filte_out_left_namelist.py
@@ -0,0 +1,41 @@
+import os
+ROOT="finished"
+Already_Done = {}
+empty_filelist=[]
+for name in os.listdir(ROOT):
+    print(f"read {name}")
+    version = name.split(".")[1]
+    if not name.endswith(".filelist"):continue
+    with open(os.path.join(ROOT, name), "r") as f:
+        for line in f:
+            line = line.strip().split()
+            if len(line)<4:continue
+            size = int(line[-2])
+            filename = line[-1]
+            abspath = f"s3://llm-pdf-text/pdf_gpu_output/scihub_shared/{version}/result/(unknown)"
+            if size == 0:
+                empty_filelist.append(f"opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub/"+filename)
+                continue
+
+
+            Already_Done[filename]=abspath
+
+Already_Done_path = "sci_index_files.finished.filelist"
+with open(Already_Done_path, "w") as f:
+    for name, path in Already_Done.items():
+        f.write(f"opendata:{path}\n")
+with open("sci_index_files.redo.filelist","w") as f:
+    for name in empty_filelist:
+        f.write(name+'\n')
+
+Should_do = []
+with open('sci_index_files.namelist','r') as f:
+    for line in f:
+        name = line.strip()
+        if name in Already_Done:
+            continue
+        Should_do.append(name)
+print(f"write to sci_index_files.remain.filelist")
+with open('sci_index_files.remain.filelist','w') as f:
+    for name in Should_do:
+        f.write("opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub/"+name+'\n')
\ No newline at end of file
diff --git a/scihub_collection/filte_out_left_page_map.py b/scihub_collection/filte_out_left_page_map.py
new file mode 100644
index 0000000..2a76c98
--- /dev/null
+++ b/scihub_collection/filte_out_left_page_map.py
@@ -0,0 +1,21 @@
+import os
+Already_Done = []
+with open("page_num_map.filelist", "r") as f:
+    for line in f:
+        line = line.strip().split()
+        if len(line)<4:continue
+        name = line[-1]
+        Already_Done.append(name)
+
+Already_Done = set(Already_Done)
+print(f"read sci_index_files.namelist")
+Should_do = []
+with open('sci_index_files.namelist','r') as f:
+    for line in f:
+        name = line.strip()[:-1]
+        if name in Already_Done:continue
+        Should_do.append(name)
+print(f"write to page_num_map.remain.filelist")
+with open('page_num_map.remain.filelist','w') as f:
+    for name in Should_do:
+        f.write("opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub/"+name+'l\n')
\ No newline at end of file
diff --git a/scihub_collection/filte_out_missing_part.py b/scihub_collection/filte_out_missing_part.py
new file mode 100644
index 0000000..bc1e787
--- /dev/null
+++ b/scihub_collection/filte_out_missing_part.py
@@ -0,0 +1,33 @@
+import os
+from tqdm import tqdm
+ROOT='scihub_collection/analysis'
+ROOT='physics_collection/analysis'
+print("collect already done jsonl name")
+Already_Done = []
+with open("scan_finished.missingpart", "r") as f:
+    for line in f:
+        line = line.strip().split('/')
+        name = line[-1]
+        Already_Done.append(name)
+print(Already_Done[:2])
+Already_Done = set(Already_Done)
+
+print(f"collect should do jsonl name")
+Should_do = {}
+with open(f'{ROOT}/not_complete_pdf_page_id.pairlist.filelist','r') as f:
+    for inputs_line in tqdm(f):
+        splited_line = inputs_line.split()
+        inputs_path = splited_line[0]
+        line = inputs_path.strip().split('/')
+        name = line[-1]
+
+        Should_do[name]=inputs_line
+
+Remain_jsonl_name = set(Should_do.keys()) - Already_Done
+print("=================")
+print(f"Remain_jsonl_name: {len(Remain_jsonl_name)}")
+print(f"Already_Done: {len(Already_Done)}")
+print(f"Should_do: {len(Should_do)}")
+with open(f'{ROOT}/not_complete_pdf_page_id.pairlist.remain.filelist','w') as f:
+    for name in Remain_jsonl_name:
+        f.write(Should_do[name])
\ No newline at end of file
diff --git a/scihub_collection/gather_whole_finished_filelist.py b/scihub_collection/gather_whole_finished_filelist.py
new file mode 100644
index 0000000..d6d8e53
--- /dev/null
+++ b/scihub_collection/gather_whole_finished_filelist.py
@@ -0,0 +1,24 @@
+import os
+filepathlist= [
+"layoutV1",
+"layoutV2",
+"layoutV3",
+"layoutV5",
+"layoutV6",
+]
+too_small_files = {}
+with open("scihub_collection/sci_hub.finished.filelist",'w') as file:
+    for filepart in filepathlist:
+        filepath = f"scihub_collection/sci_hub.finished.{filepart}.filelist"
+        with open(filepath, 'r') as f:
+            for line in f:
+                date, time, size, name = line.strip().split()
+                abspath = f"s3://llm-pdf-text/pdf_gpu_output/scihub_shared/{filepart}/result/{name}"
+                if int(size) < 1000:
+                    too_small_files[name] = abspath
+                    continue
+                if name in too_small_files:
+                    ##remove the file from the set
+                    del too_small_files[name]
+                file.write(abspath+'\n')
+print(f"Too small files num = {len(too_small_files)}=>{too_small_files.values()}")
\ No newline at end of file