fix

segment-any-text · May 10, 2024 · commit 983ff29
1 parent bad5b8f
commit 983ff29
Show file tree

Hide file tree

Showing 2 changed files with 4 additions and 1 deletion.
diff --git a/wtpsplit/train/train_adapter.py b/wtpsplit/train/train_adapter.py
@@ -132,6 +132,9 @@ def prepare_dataset(
                         # )
                         processed_dataset.append(processed_chunk)
                     dataset = datasets.Dataset.from_list(processed_dataset)
+                    if subsample:
+                        # 10k sentences -> 1k documents.
+                        subsample *= 0.1
 
                 else:
                     dataset = datasets.Dataset.from_list(

diff --git a/wtpsplit/utils.py b/wtpsplit/utils.py
@@ -120,7 +120,7 @@ def get_subword_label_dict(label_args, tokenizer):
     for i, c in enumerate(Constants.PUNCTUATION_CHARS):
         token_id = tokenizer.convert_tokens_to_ids(c)
         label_dict[token_id] = 1 + Constants.AUX_OFFSET + i
-        logger.info(
+        # logger.info(
         #     f"auxiliary character {c} has token ID {token_id} and label {label_dict[token_id]}, decoded: {tokenizer.decode([token_id])}"
         # )
         if token_id == tokenizer.unk_token_id: