Skip to content

Commit

Permalink
fix
Browse files (browse the repository at this point in the history)
  • Loading branch information
markus583 committed May 10, 2024
1 parent bad5b8f commit 983ff29
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 1 deletion.
3 changes: 3 additions & 0 deletions wtpsplit/train/train_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,9 @@ def prepare_dataset(
# )
processed_dataset.append(processed_chunk)
dataset = datasets.Dataset.from_list(processed_dataset)
+ if subsample:
+ # 10k sentences -> 1k documents.
+ subsample *= 0.1

else:
dataset = datasets.Dataset.from_list(
Expand Down
2 changes: 1 addition & 1 deletion wtpsplit/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ def get_subword_label_dict(label_args, tokenizer):
for i, c in enumerate(Constants.PUNCTUATION_CHARS):
token_id = tokenizer.convert_tokens_to_ids(c)
label_dict[token_id] = 1 + Constants.AUX_OFFSET + i
- logger.info(
+ # logger.info(
# f"auxiliary character {c} has token ID {token_id} and label {label_dict[token_id]}, decoded: {tokenizer.decode([token_id])}"
# )
if token_id == tokenizer.unk_token_id:
Expand Down

0 comments on commit 983ff29

Please sign in to comment.