-
Notifications
You must be signed in to change notification settings - Fork 412
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
315 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import os,sys | ||
import json | ||
from tqdm.auto import tqdm | ||
current_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) | ||
print(current_dir) | ||
sys.path.append(current_dir) | ||
from batch_running_task.get_data_utils import * | ||
# Scan the layout-status listing and record, for every jsonl file, the
# (track_id, page_id) pairs whose page status is one of the "unfinished" values.
# Each output line is "<jsonl_path> <JSON list of [track_id, page_id] pairs>";
# files with no such page are not written at all.
with open("physics_collection/analysis/not_complete_pdf_page_id.pairlist", 'w') as file:
    with open("physics_collection/analysis/whole_layout_complete.filelist", 'r') as f:
        alllines = f.readlines()
    # `page_status` is expected to come from the wildcard import of
    # batch_running_task.get_data_utils; the set is loop-invariant, so build it once.
    unfinished_status = {page_status.none, page_status.layout_complete_and_ocr_only_for_mfd}
    for line in tqdm(alllines):
        line = line.strip()
        if not line:
            # A blank line has no path field; indexing it used to raise IndexError.
            continue
        # Split once only, so whitespace inside the JSON payload is left untouched.
        # A line that carries a path but no payload raises ValueError here.
        jsonl_path, payload = line.split(maxsplit=1)
        bulk_status = json.loads(payload)
        # Each entry of bulk_status unpacks as (track_id, pdf_status, page_status_list);
        # pdf_status is not used.
        pdf_id_and_page_id_pair = [
            (track_id, page_id)
            for track_id, pdf_status, page_status_list in bulk_status
            for page_id, status in enumerate(page_status_list)
            if status in unfinished_status
        ]
        if pdf_id_and_page_id_pair:
            file.write(f"{jsonl_path} {json.dumps(pdf_id_and_page_id_pair)}" + '\n')
51 changes: 51 additions & 0 deletions
51
physics_collection/check_the_detected_row_is_part_of_one_category.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import os | ||
import json | ||
from tqdm.auto import tqdm | ||
from batch_running_task.get_data_utils import * | ||
import json | ||
from tqdm.auto import tqdm | ||
|
||
from simple_parsing import ArgumentParser | ||
from batch_running_task.batch_run_utils import obtain_processed_filelist, process_files,save_analysis, BatchModeConfig | ||
# Remote roots; judging by the names, the page-information map and the
# original SciHub index (neither is referenced in the code shown below).
PageInformationROOT = "opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared/page_num_map"
OriginDATAROOT = "opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub"

# Set of all physics DOIs: one stripped entry per line of the DOI list.
with open("physics_collection/must_be_physics.doilist", 'r') as f:
    physics_doilist = {raw_line.strip() for raw_line in f}

# Client passed to read_json_from_path by process_file.
client = build_client()
def process_file(metadata_file, args:BatchModeConfig):
    """Return the records of ``metadata_file`` whose DOI is in the physics set.

    Records are loaded with ``read_json_from_path`` using the module-level
    ``client``. A record is kept when ``record['remark']['original_file_id']``
    is a member of the module-level ``physics_doilist``. ``args`` is not used
    by this function.
    """
    return [
        record
        for record in read_json_from_path(metadata_file, client)
        if record['remark']['original_file_id'] in physics_doilist
    ]
|
||
def process_one_file_wrapper(args):
    """Unpack a ``(path, config)`` pair and forward both to ``process_file``."""
    path, config = args
    return process_file(path, config)
|
||
if __name__ == '__main__':
    # Build the BatchModeConfig from the command line and force the task name.
    parser = ArgumentParser()
    parser.add_arguments(BatchModeConfig, dest="config")
    args = parser.parse_args().config
    args.task_name = "scan"

    # Run process_file over the file list and flatten the per-file results.
    pending_files = obtain_processed_filelist(args)
    results = process_files(process_one_file_wrapper, pending_files, args)
    whole_physics_collection = [record for result in results for record in result]

    # Write one JSON record per line into a file named after the index range.
    fold = os.path.join(args.savepath, "physics_collection.metadata.split")
    os.makedirs(fold, exist_ok=True)
    savepath = os.path.join(fold, f"{args.start_index:07d}-{args.end_index:07d}")
    with open(savepath, 'w') as f:
        f.writelines(json.dumps(record) + '\n' for record in whole_physics_collection)
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
import os | ||
import json | ||
## redirect package root | ||
import sys | ||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) | ||
from tqdm.auto import tqdm | ||
from batch_running_task.get_data_utils import * | ||
from batch_running_task.batch_run_utils import obtain_processed_filelist, process_files,save_analysis, BatchModeConfig | ||
from simple_parsing import ArgumentParser | ||
# Directory holding the metadata split files that process_file reads.
ROOT = "physics_collection/physics_collection.metadata.split"
# Destination prefix for the minibatch files. Commented-out local alternative:
#   "physics_collection/physics_collection.metadata.minibatch.split"
SAVE = "opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared/physics_part/result/"
# Client passed to read_json_from_path / write_jsonl_to_path
# (a commented-out alternative set it to None).
client = build_client()
def process_file(filename, args:BatchModeConfig):
    """Split one metadata file under ``ROOT`` into minibatch files under ``SAVE``.

    The records of ``ROOT/filename`` are loaded with ``read_json_from_path`` and
    written, in slices of at most 1000 records, to
    ``SAVE/<stem>.<start>_<end>.jsonl``. ``<stem>`` is ``filename`` with every
    ``".jsonl"`` removed; ``<start>`` and ``<end>`` are the zero-padded record
    offsets of the slice. ``args`` is not used by this function.

    Returns ``None``.
    """
    chunk = 1000
    metadata_file = os.path.join(ROOT, filename)
    stem = filename.replace(".jsonl", "")
    metadata = read_json_from_path(metadata_file, client)
    total = len(metadata)
    # Step through the slice offsets directly rather than deriving a chunk
    # count with np.ceil: `np` was only reachable through a wildcard import.
    for start in tqdm(range(0, total, chunk), position=1, leave=False):
        end = min(start + chunk, total)
        save_path = os.path.join(SAVE, f"{stem}.{start:05d}_{end:05d}.jsonl")
        # NOTE(review): os.path.exists only inspects the local filesystem; with
        # the "opendata:s3://" SAVE prefix this skip presumably never triggers.
        # Confirm, and use the storage client's existence check if one exists.
        if os.path.exists(save_path):
            continue
        write_jsonl_to_path(metadata[start:end], save_path, client)
|
||
def process_one_file_wrapper(args):
    """Unpack a ``(filename, config)`` pair and forward both to ``process_file``."""
    filename, config = args
    return process_file(filename, config)
|
||
if __name__ == '__main__':
    # Build the BatchModeConfig from the command line and force the task name.
    parser = ArgumentParser()
    parser.add_arguments(BatchModeConfig, dest="config")
    args = parser.parse_args().config
    args.task_name = "scan"
    # Run process_file over every entry of the obtained file list.
    file_list = obtain_processed_filelist(args)
    results = process_files(process_one_file_wrapper, file_list, args)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import os | ||
from typing import List | ||
def obtain_pdfpath_from_jsonl(jsonlpath_list:List[str]):
    """Sort the files named in listing files into finished and empty ones.

    ``jsonlpath_list`` holds paths of listing files; a value that is not a
    list is wrapped into a one-element list. Every listing line is split on
    whitespace and skipped when it has fewer than four fields. The second to
    last field is parsed as an integer size, the last field is the file name.

    Returns ``(finished, empty)``: a dict mapping each file name with a
    non-zero size to ``NamePathMap[name]``, and a list of ``NamePathMap[name]``
    for every file whose size is 0. Raises ``KeyError`` when a name is not in
    the module-level ``NamePathMap``.
    """
    finished = {}
    empty = []
    listings = jsonlpath_list if isinstance(jsonlpath_list, list) else [jsonlpath_list]
    for listing in listings:
        with open(listing, "r") as handle:
            for raw in handle:
                fields = raw.strip().split()
                if len(fields) < 4:
                    continue
                size = int(fields[-2])
                name = fields[-1]
                if size == 0:
                    empty.append(NamePathMap[name])
                else:
                    finished[name] = NamePathMap[name]
    return finished, empty
ROOT = "finished"

# Map the base name of every checklist entry to its full path.
NamePathMap = {}
with open('physics_collection/physics.files.final.checklist.filelist', 'r') as f:
    for line in f:
        path = line.strip()
        NamePathMap[os.path.basename(path)] = path

# Commented-out alternative input: 'physics_collection/finished.rec.filelist'
Already_Done, empty_filelist = obtain_pdfpath_from_jsonl(
    ['physics_collection/physics.files.final.filelist']
)

# Finished files, one path per line, each carrying the "opendata:" prefix.
Already_Done_path = "physics_collection/sci_index_files.finished.filelist"
with open(Already_Done_path, "w") as f:
    for path in Already_Done.values():
        if not path.startswith("opendata:"):
            path = f"opendata:{path}"
        f.write(f"{path}\n")

# Paths of the zero-size results.
with open("physics_collection/sci_index_files.redo.filelist", "w") as f:
    f.writelines(entry + '\n' for entry in empty_filelist)

# Checklist names that have no finished result.
Should_do = set(NamePathMap.keys()) - set(Already_Done.keys())
remain_file_path = "physics_collection/sci_index_files.remain.filelist"
print(remain_file_path)
with open(remain_file_path, 'w') as f:
    for name in Should_do:
        f.write(f"{NamePathMap[name]}\n")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
import os | ||
import json | ||
from tqdm.auto import tqdm | ||
# One-letter page status codes.
reason_code = {
    "complete": "P",          # layout + mfd + ocr_det
    "only_have_15": "I",      # layout + mfd
    "only_have_012467": "K",  # layout
    "no012467": "A",
    "none": "N",
}

# For every jsonl file in the listing, record the (track_id, page_id) pairs
# whose page code is the "none" code. Each output line is
# "<jsonl_path> <JSON list of [track_id, page_id] pairs>"; files without such
# a page are not written.
with open("scihub_collection/analysis/not_complete_pdf_page_id.pairlist", 'w') as file:
    with open("scihub_collection/analysis/not_complete.filelist", 'r') as f:
        alllines = f.readlines()
    # Take the code from the table above instead of repeating the literal.
    wanted_codes = {reason_code["none"]}
    for line in tqdm(alllines):
        line = line.strip()
        if not line:
            # json.loads("") raises, so blank lines are skipped.
            continue
        data = json.loads(line)
        jsonl_path = data['file']
        # Distinct names for the per-file status list and the per-page code:
        # the original rebound `status` while it was being iterated.
        pdf_id_and_page_id_pair = [
            (track_id, page_id)
            for track_id, page_codes in data['status']
            for page_id, code in enumerate(page_codes)
            if code in wanted_codes
        ]
        if pdf_id_and_page_id_pair:
            file.write(f"{jsonl_path} {json.dumps(pdf_id_and_page_id_pair)}" + '\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import os | ||
ROOT = "finished"
# Prefix put in front of every file name that still has to be (re)processed.
origin_prefix = "opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub/"

# Read every "*.filelist" listing under ROOT. The second dot-separated part of
# the listing name is used as the version segment of the result path.
Already_Done = {}
empty_filelist = []
for name in os.listdir(ROOT):
    # Filter before parsing the name: split(".")[1] on an entry without a dot
    # raised IndexError before the suffix check was ever reached.
    if not name.endswith(".filelist"):
        continue
    print(f"read {name}")
    version = name.split(".")[1]
    with open(os.path.join(ROOT, name), "r") as f:
        for line in f:
            fields = line.split()
            if len(fields) < 4:
                continue
            size = int(fields[-2])
            filename = fields[-1]
            if size == 0:
                # Zero-size result: schedule the original file for a redo.
                empty_filelist.append(origin_prefix + filename)
                continue
            Already_Done[filename] = (
                f"s3://llm-pdf-text/pdf_gpu_output/scihub_shared/{version}/result/{filename}"
            )

# Finished results, one "opendata:"-prefixed path per line.
Already_Done_path = "sci_index_files.finished.filelist"
with open(Already_Done_path, "w") as f:
    for path in Already_Done.values():
        f.write(f"opendata:{path}\n")
with open("sci_index_files.redo.filelist", "w") as f:
    for entry in empty_filelist:
        f.write(entry + '\n')

# Names from the index list that have no finished result.
with open('sci_index_files.namelist', 'r') as f:
    Should_do = [
        name for name in (line.strip() for line in f) if name not in Already_Done
    ]
print(f"write to sci_index_files.remain.filelist")
with open('sci_index_files.remain.filelist', 'w') as f:
    for name in Should_do:
        f.write(origin_prefix + name + '\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
import os | ||
# Names present in the page_num_map listing: the last field of every line
# that has at least four whitespace-separated fields.
Already_Done = set()
with open("page_num_map.filelist", "r") as f:
    for line in f:
        fields = line.strip().split()
        if len(fields) >= 4:
            Already_Done.add(fields[-1])

print(f"read sci_index_files.namelist")
# Each index name is compared with its final character removed; an 'l' is
# appended again on output (presumably ".jsonl" vs ".json" names - confirm).
with open('sci_index_files.namelist', 'r') as f:
    Should_do = [
        trimmed
        for trimmed in (line.strip()[:-1] for line in f)
        if trimmed not in Already_Done
    ]
print(f"write to page_num_map.remain.filelist")
with open('page_num_map.remain.filelist', 'w') as f:
    for name in Should_do:
        f.write("opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub/" + name + 'l\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
import os | ||
from tqdm import tqdm | ||
# Analysis directory in effect. The original assigned
# 'scihub_collection/analysis' first and immediately overrode it with this value.
ROOT = 'physics_collection/analysis'

print("collect already done jsonl name")
# Base name (text after the last '/') of every line in the finished list.
with open("scan_finished.missingpart", "r") as f:
    Already_Done = [line.strip().split('/')[-1] for line in f]
print(Already_Done[:2])
Already_Done = set(Already_Done)

print(f"collect should do jsonl name")
# Map the base name of each pairlist entry's first field to its full input line.
Should_do = {}
with open(f'{ROOT}/not_complete_pdf_page_id.pairlist.filelist', 'r') as f:
    for inputs_line in tqdm(f):
        inputs_path = inputs_line.split()[0]
        Should_do[inputs_path.strip().split('/')[-1]] = inputs_line

Remain_jsonl_name = set(Should_do.keys()) - Already_Done
print("=================")
print(f"Remain_jsonl_name: {len(Remain_jsonl_name)}")
print(f"Already_Done: {len(Already_Done)}")
print(f"Should_do: {len(Should_do)}")
# The stored lines keep their own trailing newline, so they are written verbatim.
with open(f'{ROOT}/not_complete_pdf_page_id.pairlist.remain.filelist', 'w') as f:
    for name in Remain_jsonl_name:
        f.write(Should_do[name])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import os | ||
# Listing parts to merge, processed in this order.
filepathlist = [
    "layoutV1",
    "layoutV2",
    "layoutV3",
    "layoutV5",
    "layoutV6",
]

# Merge the per-part listings into a single finished list. Entries smaller
# than 1000 (size field) are withheld and collected in too_small_files.
too_small_files = {}
with open("scihub_collection/sci_hub.finished.filelist", 'w') as file:
    for filepart in filepathlist:
        filepath = f"scihub_collection/sci_hub.finished.{filepart}.filelist"
        with open(filepath, 'r') as f:
            for line in f:
                fields = line.split()
                if len(fields) < 4:
                    # Blank or short lines used to crash the 4-way tuple unpack.
                    continue
                # Size and name are the last two fields, as in the sibling scripts.
                size, name = fields[-2], fields[-1]
                abspath = f"s3://llm-pdf-text/pdf_gpu_output/scihub_shared/{filepart}/result/{name}"
                if int(size) < 1000:
                    too_small_files[name] = abspath
                    continue
                # A large-enough entry seen after a too-small one clears the flag.
                # NOTE(review): a too-small entry seen AFTER a good one stays
                # flagged even though the good path was already written - confirm.
                too_small_files.pop(name, None)
                file.write(abspath + '\n')
print(f"Too small files num = {len(too_small_files)}=>{too_small_files.values()}")