Skip to content

Commit

Permalink
Change v2d to metadata and transcript filepaths. Transcript is untested because we don't have the container working yet to use whisper (cry)
Browse files Browse the repository at this point in the history
  • Loading branch information
kdu4108 committed Aug 4, 2024
1 parent 7cdfe15 commit c25a2d5
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 26 deletions.
30 changes: 23 additions & 7 deletions pseudolabeling/v2d_to_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import shutil
import tarfile
import tempfile
from tqdm import tqdm
from datetime import timedelta

# FIXME: may need adaptation
Expand All @@ -24,7 +25,7 @@ def process_tar_files(source_directory, target_directory, dataset, skip_existing

os.makedirs(target_directory, exist_ok=True)

for tar_path in os.listdir(source_directory):
for tar_path in tqdm(os.listdir(source_directory)):
if tar_path.endswith(".tar"):
shard_name = "shard-" + os.path.splitext(tar_path)[0] + ".tar"
target_tar_path = os.path.join(target_directory, shard_name)
Expand Down Expand Up @@ -94,33 +95,48 @@ def process_json_file(json_file_path, output_dir, dataset):


def main(args):
output_dir = (
args.output_dir
if args.output_dir is not None
else os.path.join(args.input_dir.replace("filtered_raw", "4m"), "video_metadata")
)
process_tar_files(
source_directory=args.data_root,
target_directory=os.path.join(args.data_root, "..", "video_metadata"),
source_directory=args.input_dir,
target_directory=output_dir,
dataset=args.dataset,
skip_existing=args.skip_existing,
)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Process tarfiles containing JSONs and convert to structured JSONL format."
description="Process tarfiles from `filtered_raw` format containing JSONs and extract relevant metadata into the `video_metadata` modality."
)

parser.add_argument(
"--data_root",
"-I",
"--input_dir",
type=str,
default="/store/swissai/a08/data/4m/video_rgb",
default="/store/swissai/a08/data/filtered_raw/howto100m/v2d_5000/",
# default="/cluster/work/cotterell/mm_swissai/raw/v2d_500/howto100m",
help="Dir containing the JSON files to process.",
help="A `filtered_raw` dir containing the JSON files to process.",
)
parser.add_argument(
"-O",
"--output_dir",
type=str,
default=None,
help="Output dir to save the pseudolabeled metadata.",
)
parser.add_argument(
"-S",
"--skip_existing",
default=False, # FIXME
help="Skip tarfiles already processed (exist in the target directory).",
)
# TODO: can the dataset name be derived from the directory structure, or does it have to be passed explicitly like this?
parser.add_argument(
"-D",
"--dataset",
type=str,
required=True,
Expand Down
45 changes: 26 additions & 19 deletions pseudolabeling/v2d_to_transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@ def timestamp_to_frames(timestamp, fps):

def process_tar_files(source_directory, target_directory, skip_existing=True):
"""Extract, process, and re-package JSON files in TAR archives."""
# TODO: this path
# source_directory = os.path.join(source_directory, "video_rgb")
target_directory = os.path.join(target_directory, "video_transcript")

os.makedirs(target_directory, exist_ok=True)

for tar_path in os.listdir(source_directory):
Expand Down Expand Up @@ -107,37 +103,48 @@ def process_json_file(json_file_path, output_dir):


def main(args):
for folder in os.listdir(args.data_root):
if folder in ["train", "val", "test"]:
current_folder = os.path.join(args.data_root, folder, args.whisper_dir)
print(f"Processing {current_folder}.")
process_tar_files(
source_directory=current_folder,
target_directory=os.path.join(args.data_root, folder),
skip_existing=args.skip_existing,
)
current_folder = os.path.join(args.input_dir, args.whisper_dir)
output_dir = (
args.output_dir
if args.output_dir is not None
else os.path.join(args.input_dir.replace("filtered_raw", "4m"), "video_transcript")
)
print(f"Processing {current_folder}.")
process_tar_files(
source_directory=current_folder,
target_directory=output_dir,
skip_existing=args.skip_existing,
)


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Process tarfiles containing JSONs and convert to structured JSONL format."
)

parser.add_argument(
"--data_root",
"-I",
"--input_dir",
type=str,
default="/store/swissai/a08/data/filtered_raw/howto100m/v2d_5000/",
# default="/cluster/work/cotterell/mm_swissai/raw/v2d_500/howto100m",
help="A `filtered_raw` dir containing the JSON files to process.",
)
parser.add_argument(
"-O",
"--output_dir",
type=str,
# FIXME: default dir
# default="/store/swissai/a08/data/4m-data/train/DEBUG/v2d_40k",
default="/cluster/work/cotterell/mm_swissai/raw/v2d_500/howto100m",
help="Dir containing the JSON files to process.",
default=None,
help="Output dir to save the pseudolabeled transcripts.",
)
parser.add_argument(
"-W",
"--whisper_dir",
type=str,
default="whisperx",
help="Dir containing the WhisperX transcripts.",
)
parser.add_argument(
"-S",
"--skip_existing",
default=False, # FIXME
help="Skip tarfiles already processed (exist in the target directory).",
Expand Down

0 comments on commit c25a2d5

Please sign in to comment.