From 8f787c6aaff7d298b836bff78aadfd56b7a7c19e Mon Sep 17 00:00:00 2001
From: markus583
Date: Tue, 18 Jun 2024 13:15:11 +0000
Subject: [PATCH] git rm

---
 calc_compression_rate.py | 31 ---------------
 commands.txt             | 24 ------------
 run.sh                   |  2 -
 run_adapter.sh           |  1 -
 run_eval.sh              | 18 ---------
 run_eval_kmer.sh         | 22 -----------
 tpu_START.sh             |  9 -----
 tpu_starter.sh           |  4 --
 xla_spawn.py             | 83 ----------------------------------------
 9 files changed, 194 deletions(-)
 delete mode 100644 calc_compression_rate.py
 delete mode 100644 commands.txt
 delete mode 100755 run.sh
 delete mode 100755 run_adapter.sh
 delete mode 100755 run_eval.sh
 delete mode 100755 run_eval_kmer.sh
 delete mode 100755 tpu_START.sh
 delete mode 100755 tpu_starter.sh
 delete mode 100644 xla_spawn.py

diff --git a/calc_compression_rate.py b/calc_compression_rate.py
deleted file mode 100644
index 9acc5c1a..00000000
--- a/calc_compression_rate.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from datasets import load_dataset
-from transformers import XLMRobertaTokenizer
-
-def calculate_compression_rate(dataset_name):
-    # Load the dataset
-    dataset = load_dataset(dataset_name, split='train')
-
-    # Initialize the tokenizer
-    tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
-
-    total_chars = 0
-    total_tokens = 0
-
-    # Iterate over the dataset
-    for sample in dataset:
-        text = sample['text']
-        total_chars += len(text)
-
-        # Tokenize the text
-        tokens = tokenizer.tokenize(text)
-        total_tokens += len(tokens)
-
-    # Calculate the average compression rate
-    avg_compression_rate = total_chars / total_tokens if total_tokens > 0 else 0
-
-    return avg_compression_rate
-
-# Example dataset
-dataset_name = "markus583/mC4-TEST"
-compression_rate = calculate_compression_rate(dataset_name)
-print(compression_rate)
diff --git a/commands.txt b/commands.txt
deleted file mode 100644
index 3c702478..00000000
--- a/commands.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-# .bashrc
-export PATH=$PATH:~/.local/bin
-
-export XRT_TPU_CONFIG="localservice;0;localhost:51011"
-
-export XLA_USE_BF16=0
-
-export TPU_NUM_DEVICES=8
-
-export HF_DATASETS_CACHE=/dev/shm/cache
-
-# data
-gcloud auth login
-gsutil -m cp -r gs://trc-transfer-data/sentence/data/eval.pth data/
-
-# cleanup
-pkill -e python3
-(until no more)
-or
-watch -n1 pkill -e python3
-
-# for debugging:
-
-os.environ["PJRT_DEVICE"] = "None"
diff --git a/run.sh b/run.sh
deleted file mode 100755
index d3ba4a89..00000000
--- a/run.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-# TODO: cleanup in case of no .arrow files but cache-* files available.
-python3 ~/wtpsplit/xla_spawn.py --num_cores ${TPU_NUM_DEVICES} wtpsplit/train/train.py $1
\ No newline at end of file
diff --git a/run_adapter.sh b/run_adapter.sh
deleted file mode 100755
index e0d58697..00000000
--- a/run_adapter.sh
+++ /dev/null
@@ -1 +0,0 @@
-python3 ~/wtpsplit/xla_spawn.py --num_cores ${TPU_NUM_DEVICES} wtpsplit/train/train_adapter_parallel.py $1
\ No newline at end of file
diff --git a/run_eval.sh b/run_eval.sh
deleted file mode 100755
index 208e8187..00000000
--- a/run_eval.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-
-# Check if sufficient arguments are provided
-if [[ $# -lt 2 ]]; then
-    echo "Usage: $0 MODEL_PATH 'threshold_list'"
-    echo "Example: $0 /path/to/model '0.1 0.2 0.3'"
-    exit 1
-fi
-
-# Assign arguments to variables
-MODEL_PATH="$1"
-threshold_list=($2)
-
-# Loop over threshold_list
-for threshold in "${threshold_list[@]}"; do
-    # Execute the Python script
-    python3 wtpsplit/evaluation/intrinsic.py --model_path "$MODEL_PATH" --threshold "$threshold" --keep_logits
-done
\ No newline at end of file
diff --git a/run_eval_kmer.sh b/run_eval_kmer.sh
deleted file mode 100755
index 3fbc473f..00000000
--- a/run_eval_kmer.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/bin/bash
-
-# Check if sufficient arguments are provided
-if [[ $# -lt 3 ]]; then
-    echo "Usage: $0 MODEL_PATH 'k_list' 'threshold_list'"
-    echo "Example: $0 /path/to/model '1 2 3' '0.1 0.2 0.3'"
-    exit 1
-fi
-
-# Assign arguments to variables
-MODEL_PATH="$1"
-k_list=($2)
-threshold_list=($3)
-
-# Loop over k_list
-for k in "${k_list[@]}"; do
-    # Loop over threshold_list
-    for threshold in "${threshold_list[@]}"; do
-        # Execute the Python script
-        python3 wtpsplit/evaluation/intrinsic_pairwise.py --model_path "$MODEL_PATH" --k "$k" --threshold "$threshold" --keep_logits
-    done
-done
\ No newline at end of file
diff --git a/tpu_START.sh b/tpu_START.sh
deleted file mode 100755
index cc04404b..00000000
--- a/tpu_START.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-
-TPU_VM_NAME="v3-8_4-1.13" # Name of the TPU VM
-ZONE="europe-west4-a" # Zone
-
-# Create the TPU VM, retry if it fails
-until gcloud compute tpus tpu-vm start "$TPU_VM_NAME" --zone="$ZONE"; do
-    sleep 1
-done
\ No newline at end of file
diff --git a/tpu_starter.sh b/tpu_starter.sh
deleted file mode 100755
index f8d5ba55..00000000
--- a/tpu_starter.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-for var in "$@"
-do
-    until gcloud compute tpus tpu-vm create $var --zone=europe-west4-a --accelerator-type=v3-8 --version=tpu-vm-pt-1.13; do sleep 3; done
-done
\ No newline at end of file
diff --git a/xla_spawn.py b/xla_spawn.py
deleted file mode 100644
index 5df6bfa2..00000000
--- a/xla_spawn.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" -A simple launcher script for TPU training - -Inspired by https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py - -:: - >>> python xla_spawn.py --num_cores=NUM_CORES_YOU_HAVE - YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other - arguments of your training script) - -""" - - -import importlib -import sys -from argparse import REMAINDER, ArgumentParser -from pathlib import Path - -import torch_xla.distributed.xla_multiprocessing as xmp - - -def parse_args(): - """ - Helper function parsing the command line options - @retval ArgumentParser - """ - parser = ArgumentParser( - description=( - "PyTorch TPU distributed training launch helper utility that will spawn up multiple distributed processes" - ) - ) - - # Optional arguments for the launch helper - parser.add_argument("--num_cores", type=int, default=1, help="Number of TPU cores to use (1 or 8).") - - # positional - parser.add_argument( - "training_script", - type=str, - help=( - "The full path to the single TPU training " - "program/script to be launched in parallel, " - "followed by all the arguments for the " - "training script" - ), - ) - - # rest from the training program - parser.add_argument("training_script_args", nargs=REMAINDER) - - return parser.parse_args() - - -def main(): - args = parse_args() - - # Import training_script as a module. - script_fpath = Path(args.training_script) - sys.path.append(str(script_fpath.parent.resolve())) - mod_name = script_fpath.stem - mod = importlib.import_module(mod_name) - - # Patch sys.argv - sys.argv = [args.training_script] + args.training_script_args + ["--tpu_num_cores", str(args.num_cores)] - - xmp.spawn(mod._mp_fn, args=(), nprocs=args.num_cores) - - -if __name__ == "__main__": - main()