Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prepare oasst2 dataset #3736

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#
# /WARNING!

exclude: build|stubs|^bot/templates/$|openassistant/templates|docs/docs/api/openapi.json|scripts/postprocessing/regex_pii_detector.py
exclude: build|stubs|^bot/templates/$|openassistant/templates|docs/docs/api/openapi.json|scripts/postprocessing/regex_pii_detector.py|oasst-data/examples/clean_dataset.py|oasst-data/examples/tree_to_messages.py

default_language_version:
python: python3
Expand Down
32 changes: 28 additions & 4 deletions oasst-data/examples/clean_dataset.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
"""
Example usage:

python clean_dataset.py \
"2023-11-05_oasst_all.jsonl" \
"2023-11-05_oasst_all.clean.jsonl" \
--instructions "instructions.xlsx"
"""
import argparse
from collections import OrderedDict

Expand Down Expand Up @@ -59,25 +67,36 @@ def delete_message(msg: ExportMessageNode):
print(f"Tree deleted: {msg.message_id}")
else:
parent_msg = message_by_id[msg.parent_id]
parent_msg.replies.remove(msg)
print(f"Branch deleted: {msg.message_id} ({count_descendants(msg)} messages)")
try:
parent_msg.replies.remove(msg)
print(f"Branch deleted: {msg.message_id} ({count_descendants(msg)} messages)")
except ValueError:
print(f"Message not found: {msg.message_id}")

# cleaning
print("Cleaning...")
for index, row in instructions_df.iterrows():
id = row["UUID"]
print(f"Cleaning id={id}")
msg = message_by_id.get(id)
if msg is None:
print(f"Not found: {id}")
print(f"Skipping instructions for : {id}")
continue

action = row["Action"]
print(f"Action={action}")

# Delete
if action == "Delete":
print(f"deleting: {id}")
delete_message(msg)
# Replace
elif action == "Replace":
print(f"replace: {id}")
replace = row["Replace"]
msg.text = replace
# Edit
elif action == "Edit":
print(f"edit: {id}")
if row["Category"] == "Copy Code":
Expand All @@ -86,8 +105,13 @@ def delete_message(msg: ExportMessageNode):
else:
find = row["Find"]
replace = row["Replace"]
msg.text.index(find) # make sure text is present
msg.text = msg.text.replace(find, replace)
try:
msg.text.index(find) # make sure text is present
msg.text = msg.text.replace(find, replace)
except ValueError as e:
print(e)
# print(f"find not found: {find}")
continue
else:
print(f"Unsupported action {action}")

Expand Down
7 changes: 6 additions & 1 deletion oasst-data/examples/filter_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,12 @@ def approve_message(msg: ExportMessageNode) -> bool:
):
return False

if exclude_normal is True and not msg.deleted and not msg.synthetic and msg.review_result:
if (
exclude_normal is True
and not msg.deleted
and not msg.synthetic
and msg.review_result
):
return False

if spam is not None and spam != (not msg.review_result):
Expand Down
9 changes: 9 additions & 0 deletions oasst-data/examples/filter_trees.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
"""
Example usage:

python filter_trees.py \
"2023-11-05_oasst_all.jsonl" \
"2023-11-05_oasst_all.clean.jsonl" \
--states "ready_for_export"
"""

import argparse

from oasst_data import read_message_trees, write_message_trees
Expand Down
9 changes: 9 additions & 0 deletions oasst-data/examples/split_dataset.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
"""
Example usage:

python split_dataset.py \
"2023-11-05_oasst_all.messages.jsonl" \
--val_output "2023-11-05_oasst_all.messages.validation.jsonl" \
--train_output "2023-11-05_oasst_all.messages.train.jsonl"
"""

import argparse
import random

Expand Down
8 changes: 8 additions & 0 deletions oasst-data/examples/tree_to_messages.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
"""
Example usage:

python tree_to_messages.py \
"2023-11-05_oasst_all.jsonl" \
"2023-11-05_oasst_all.messages.jsonl"
"""

import argparse

from oasst_data import ExportMessageNode, read_message_trees, visit_messages_depth_first, write_messages
Expand Down
Loading
Loading