Skip to content

Commit

Permalink
add some script
Browse files Browse the repository at this point in the history
  • Loading branch information
veya2ztn committed Oct 9, 2024
1 parent 9ca6096 commit 38df666
Show file tree
Hide file tree
Showing 9 changed files with 315 additions and 0 deletions.
24 changes: 24 additions & 0 deletions physics_collection/analysis_not_complete.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import os, sys
import json
from tqdm.auto import tqdm

# Make the repository root importable so `batch_running_task` resolves
# when this script is run from the repo checkout.
current_dir = os.path.abspath(os.path.dirname(os.path.dirname(__file__)))
print(current_dir)
sys.path.append(current_dir)
from batch_running_task.get_data_utils import *

# Scan the "whole layout complete" file list and record, for every jsonl,
# the (track_id, page_id) pairs whose page still needs OCR so a later pass
# can redo exactly those pages.
with open("physics_collection/analysis/not_complete_pdf_page_id.pairlist", 'w') as file:
    with open("physics_collection/analysis/whole_layout_complete.filelist", 'r') as f:
        alllines = f.readlines()
    for line in tqdm(alllines):
        # Each line is "<jsonl_path> <json payload>".  Split only at the first
        # whitespace run so the JSON payload reaches json.loads verbatim —
        # re-joining a full split() would collapse repeated whitespace inside
        # the payload and could corrupt string values.
        jsonl_path, payload = line.strip().split(maxsplit=1)
        bulk_status = json.loads(payload)
        pdf_id_and_page_id_pair = []
        for track_id, pdf_status, page_status_list in bulk_status:
            for page_id, status in enumerate(page_status_list):
                # Pages that never ran, or that only ran the MFD-restricted
                # OCR pass, still need processing.
                if status in {page_status.none, page_status.layout_complete_and_ocr_only_for_mfd}:
                    pdf_id_and_page_id_pair.append((track_id, page_id))
        if len(pdf_id_and_page_id_pair) > 0:
            file.write(f"{jsonl_path} {json.dumps(pdf_id_and_page_id_pair)}" + '\n')
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import os
import json
from tqdm.auto import tqdm

from simple_parsing import ArgumentParser
from batch_running_task.get_data_utils import *
from batch_running_task.batch_run_utils import obtain_processed_filelist, process_files, save_analysis, BatchModeConfig

# Remote roots used by this pipeline stage.
PageInformationROOT = "opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared/page_num_map"
OriginDATAROOT = "opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub"

# Build the set of DOIs confirmed to be physics papers; set membership is the
# filter applied to every metadata record below.
with open("physics_collection/must_be_physics.doilist", 'r') as f:
    physics_doilist = {entry.strip() for entry in f}

client = build_client()
def process_file(metadata_file, args: BatchModeConfig):
    """Read one metadata file and keep only records whose DOI is in the
    physics set.

    `args` is unused here but required by the process_files worker contract.
    """
    metadata_list = read_json_from_path(metadata_file, client)
    return [metadata for metadata in metadata_list
            if metadata['remark']['original_file_id'] in physics_doilist]

def process_one_file_wrapper(args):
    """Adapter for process_files: unpack the (path, config) tuple handed to
    each worker and delegate to process_file."""
    metadata_path, cfg = args
    return process_file(metadata_path, cfg)

if __name__ == '__main__':
    # Parse the batch configuration and force this run into "scan" mode.
    parser = ArgumentParser()
    parser.add_arguments(BatchModeConfig, dest="config")
    args = parser.parse_args().config
    args.task_name = "scan"

    alread_processing_file_list = obtain_processed_filelist(args)
    results = process_files(process_one_file_wrapper, alread_processing_file_list, args)

    # Flatten the per-file result lists into one metadata collection.
    whole_physics_collection = [metadata for batch in results for metadata in batch]

    # Write one jsonl shard named after the processed index range.
    fold = os.path.join(args.savepath, f"physics_collection.metadata.split")
    os.makedirs(fold, exist_ok=True)
    savepath = os.path.join(fold, f"{args.start_index:07d}-{args.end_index:07d}")
    with open(savepath, 'w') as f:
        f.writelines(json.dumps(metadata) + '\n' for metadata in whole_physics_collection)


44 changes: 44 additions & 0 deletions physics_collection/collect_and_upload_to_ceph.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import os
import json
## redirect package root
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from tqdm.auto import tqdm
from batch_running_task.get_data_utils import *
from batch_running_task.batch_run_utils import obtain_processed_filelist, process_files,save_analysis, BatchModeConfig
from simple_parsing import ArgumentParser
# Local directory holding the split metadata files produced by the scan step.
ROOT="physics_collection/physics_collection.metadata.split"
#SAVE="physics_collection/physics_collection.metadata.minibatch.split"
# Ceph/S3 destination prefix for the re-chunked minibatch jsonl files.
SAVE="opendata:s3://llm-pdf-text/pdf_gpu_output/scihub_shared/physics_part/result/"
client = build_client()
#client = None
def process_file(filename, args: BatchModeConfig):
    """Split one metadata jsonl file into 1000-record chunks and upload each
    chunk under the SAVE prefix.

    `args` is unused here but required by the process_files worker contract.
    """
    chunk = 1000
    metadata_file = os.path.join(ROOT, filename)
    filename = filename.replace(".jsonl", "")
    metadata = read_json_from_path(metadata_file, client)
    # Ceiling division in plain integers: the original used np.ceil, but
    # numpy is never imported explicitly in this file (it would only exist
    # via the wildcard import above).  This form has no such dependency and
    # yields the same chunk count for every length, including 0.
    chunk_num = (len(metadata) + chunk - 1) // chunk
    for i in tqdm(range(chunk_num), position=1, leave=False):
        start = i * chunk
        end = min((i + 1) * chunk, len(metadata))
        save_path = os.path.join(SAVE, f"(unknown).{start:05d}_{end:05d}.jsonl")
        # NOTE(review): os.path.exists cannot see "opendata:s3://..." objects,
        # so this skip only ever fires for local paths — confirm whether a
        # client-side existence check was intended here.
        if os.path.exists(save_path):
            continue

        minibatch = metadata[start:end]

        write_jsonl_to_path(minibatch, save_path, client)

def process_one_file_wrapper(args):
    """Adapter for process_files: unpack (path, config) and run process_file."""
    path, cfg = args
    return process_file(path, cfg)

if __name__ == '__main__':
    # Parse the batch configuration and force this run into "scan" mode.
    parser = ArgumentParser()
    parser.add_arguments(BatchModeConfig, dest="config")
    args = parser.parse_args().config
    args.task_name = "scan"

    # Fan the upload work out over the configured file list; results are
    # ignored since process_file uploads as a side effect.
    alread_processing_file_list = obtain_processed_filelist(args)
    results = process_files(process_one_file_wrapper, alread_processing_file_list, args)

48 changes: 48 additions & 0 deletions physics_collection/filte_out_left_namelist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import os
from typing import List
def obtain_pdfpath_from_jsonl(jsonlpath_list: List[str]):
    """Read one or more `ls`-style listing files and split their entries into
    finished files and zero-byte files that must be redone.

    Returns (Already_Done, empty_filelist): a {filename: full_path} dict for
    non-empty results, and a list of full paths whose recorded size was 0.
    Paths are resolved through the module-level NamePathMap.
    """
    done_map = {}
    zero_sized = []
    if not isinstance(jsonlpath_list, list):
        jsonlpath_list = [jsonlpath_list]
    for jsonlpath in jsonlpath_list:
        with open(jsonlpath, "r") as f:
            for raw in f:
                fields = raw.strip().split()
                # Listing rows have at least 4 columns; skip anything else.
                if len(fields) < 4:
                    continue
                size = int(fields[-2])
                filename = fields[-1]
                if size == 0:
                    zero_sized.append(NamePathMap[filename])
                else:
                    done_map[filename] = NamePathMap[filename]
    return done_map, zero_sized
ROOT = "finished"

# Map basename -> full path for every file in the master checklist; the
# helper above resolves listing entries through this map.
NamePathMap = {}
with open('physics_collection/physics.files.final.checklist.filelist', 'r') as f:
    for raw in f:
        full_path = raw.strip()
        NamePathMap[os.path.basename(full_path)] = full_path

Already_Done, empty_filelist = obtain_pdfpath_from_jsonl(['physics_collection/physics.files.final.filelist',
])

# Finished files, each prefixed with "opendata:" exactly once.
Already_Done_path = "physics_collection/sci_index_files.finished.filelist"
with open(Already_Done_path, "w") as f:
    for name, path in Already_Done.items():
        if not path.startswith("opendata:"):
            path = f"opendata:{path}"
        f.write(f"{path}\n")

# Zero-byte results that need to be regenerated.
with open("physics_collection/sci_index_files.redo.filelist", "w") as f:
    for name in empty_filelist:
        f.write(name + '\n')

# Whatever is in the checklist but not finished still needs processing.
Should_do = set(NamePathMap.keys()) - set(Already_Done.keys())
remain_file_path = "physics_collection/sci_index_files.remain.filelist"
print(remain_file_path)
with open(remain_file_path, 'w') as f:
    for name in Should_do:
        f.write(f"{NamePathMap[name]}\n")
29 changes: 29 additions & 0 deletions scihub_collection/analysis_not_complete.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import os
import json
from tqdm.auto import tqdm

# Legend for the single-letter per-page status codes used in the input;
# kept as data for reference even though only 'N' is consulted below.
reason_code = {
    "complete": "P",         # layout + mfd + ocr_det
    "only_have_15": "I",     # layout + mfd
    "only_have_012467": "K", # layout
    "no012467": "A",
    "none": "N",
}

# For every jsonl in the not-complete list, collect the (track_id, page_id)
# pairs whose page was never processed ('N') so they can be rerun.
with open("scihub_collection/analysis/not_complete_pdf_page_id.pairlist", 'w') as out_file:
    with open("scihub_collection/analysis/not_complete.filelist", 'r') as f:
        alllines = f.readlines()
    for raw_line in tqdm(alllines):
        record = json.loads(raw_line.strip())
        jsonl_path = record['file']
        incomplete_pages = []
        for track_id, page_status in record['status']:
            # page_status is the per-page code list for this track.
            for page_id, code in enumerate(page_status):
                if code == 'N':
                    incomplete_pages.append((track_id, page_id))
        if incomplete_pages:
            out_file.write(f"{jsonl_path} {json.dumps(incomplete_pages)}" + '\n')
41 changes: 41 additions & 0 deletions scihub_collection/filte_out_left_namelist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import os

ROOT = "finished"
Already_Done = {}    # filename -> s3 result path
empty_filelist = []  # source paths whose result was zero bytes (redo)

for name in os.listdir(ROOT):
    print(f"read {name}")
    # Filter BEFORE splitting: the original split first, so any directory
    # entry without a '.' (e.g. a stray folder) raised IndexError.
    if not name.endswith(".filelist"):
        continue
    version = name.split(".")[1]
    with open(os.path.join(ROOT, name), "r") as f:
        for line in f:
            line = line.strip().split()
            if len(line) < 4:
                continue
            size = int(line[-2])
            filename = line[-1]
            abspath = f"s3://llm-pdf-text/pdf_gpu_output/scihub_shared/{version}/result/(unknown)"
            if size == 0:
                empty_filelist.append(f"opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub/" + filename)
                continue

            Already_Done[filename] = abspath

# Finished results, prefixed for the opendata client.
Already_Done_path = "sci_index_files.finished.filelist"
with open(Already_Done_path, "w") as f:
    for name, path in Already_Done.items():
        f.write(f"opendata:{path}\n")
# Zero-byte results to regenerate.
with open("sci_index_files.redo.filelist", "w") as f:
    for name in empty_filelist:
        f.write(name + '\n')

# Names in the master list that are not finished yet.
Should_do = []
with open('sci_index_files.namelist', 'r') as f:
    for line in f:
        name = line.strip()
        if name in Already_Done:
            continue
        Should_do.append(name)
print(f"write to sci_index_files.remain.filelist")
with open('sci_index_files.remain.filelist', 'w') as f:
    for name in Should_do:
        f.write("opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub/" + name + '\n')
21 changes: 21 additions & 0 deletions scihub_collection/filte_out_left_page_map.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os

# Names already present in the page-number-map listing (rows with >=4 columns).
Already_Done = []
with open("page_num_map.filelist", "r") as f:
    for raw in f:
        fields = raw.strip().split()
        if len(fields) >= 4:
            Already_Done.append(fields[-1])

Already_Done = set(Already_Done)

print(f"read sci_index_files.namelist")
Should_do = []
with open('sci_index_files.namelist', 'r') as f:
    for raw in f:
        # NOTE(review): the final character is dropped here and an 'l' is
        # re-appended on write below — presumably trimming/restoring the
        # ".jsonl" suffix's last letter; confirm this round-trip is intended.
        name = raw.strip()[:-1]
        if name not in Already_Done:
            Should_do.append(name)

print(f"write to page_num_map.remain.filelist")
with open('page_num_map.remain.filelist', 'w') as f:
    for name in Should_do:
        f.write("opendata:s3://llm-process-pperf/ebook_index_v4/scihub/v001/scihub/" + name + 'l\n')
33 changes: 33 additions & 0 deletions scihub_collection/filte_out_missing_part.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import os
from tqdm import tqdm

ROOT = 'scihub_collection/analysis'
ROOT = 'physics_collection/analysis'  # second assignment wins (kept from original)

# Basenames of jsonl files already handled by the missing-part scan.
print("collect already done jsonl name")
Already_Done = []
with open("scan_finished.missingpart", "r") as f:
    for raw in f:
        Already_Done.append(raw.strip().split('/')[-1])
print(Already_Done[:2])
Already_Done = set(Already_Done)

# Basename -> original input line for every candidate pairlist entry.
print(f"collect should do jsonl name")
Should_do = {}
with open(f'{ROOT}/not_complete_pdf_page_id.pairlist.filelist', 'r') as f:
    for inputs_line in tqdm(f):
        first_field = inputs_line.split()[0]
        jsonl_name = first_field.strip().split('/')[-1]

        Should_do[jsonl_name] = inputs_line

# Whatever is a candidate but not already done still needs a run; the
# original input lines are written back verbatim.
Remain_jsonl_name = set(Should_do.keys()) - Already_Done
print("=================")
print(f"Remain_jsonl_name: {len(Remain_jsonl_name)}")
print(f"Already_Done: {len(Already_Done)}")
print(f"Should_do: {len(Should_do)}")
with open(f'{ROOT}/not_complete_pdf_page_id.pairlist.remain.filelist', 'w') as f:
    for name in Remain_jsonl_name:
        f.write(Should_do[name])
24 changes: 24 additions & 0 deletions scihub_collection/gather_whole_finished_filelist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import os

# Result folders to merge, in chronological order; later folders supersede
# earlier ones when deciding whether a small file was eventually redone.
filepathlist = [
    "layoutV1",
    "layoutV2",
    "layoutV3",
    "layoutV5",
    "layoutV6",
]
too_small_files = {}  # name -> abspath of results under 1000 bytes
with open("scihub_collection/sci_hub.finished.filelist", 'w') as file:
    for filepart in filepathlist:
        filepath = f"scihub_collection/sci_hub.finished.{filepart}.filelist"
        with open(filepath, 'r') as f:
            for line in f:
                date, time, size, name = line.strip().split()
                abspath = f"s3://llm-pdf-text/pdf_gpu_output/scihub_shared/{filepart}/result/{name}"
                if int(size) < 1000:
                    too_small_files[name] = abspath
                    continue
                # A later full-sized copy clears the earlier too-small record.
                too_small_files.pop(name, None)
                file.write(abspath + '\n')
print(f"Too small files num = {len(too_small_files)}=>{too_small_files.values()}")

0 comments on commit 38df666

Please sign in to comment.