From 3a9d1241da3d21f24d70c2e2963f89c55bb53edd Mon Sep 17 00:00:00 2001 From: Andrew Maguire Date: Wed, 8 Nov 2023 16:24:37 +0000 Subject: [PATCH 1/6] Add example usage and instructions for cleaning dataset --- oasst-data/examples/clean_dataset.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/oasst-data/examples/clean_dataset.py b/oasst-data/examples/clean_dataset.py index 92a9f3e8f2..72b1ed64ce 100644 --- a/oasst-data/examples/clean_dataset.py +++ b/oasst-data/examples/clean_dataset.py @@ -1,3 +1,8 @@ +""" +Example usage: + + python clean_dataset.py "C:/Users/andre/Downloads/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl" "C:/Users/andre/Downloads/tmp.jsonl" --instructions "C:/Users/andre/Downloads/instructions.xlsx" +""" import argparse from collections import OrderedDict @@ -60,7 +65,9 @@ def delete_message(msg: ExportMessageNode): else: parent_msg = message_by_id[msg.parent_id] parent_msg.replies.remove(msg) - print(f"Branch deleted: {msg.message_id} ({count_descendants(msg)} messages)") + print( + f"Branch deleted: {msg.message_id} ({count_descendants(msg)} messages)" + ) # cleaning print("Cleaning...") From ab5148f87409e72a08b75b59dd0bf3b63951cb09 Mon Sep 17 00:00:00 2001 From: Andrew Maguire Date: Mon, 4 Dec 2023 14:06:40 +0000 Subject: [PATCH 2/6] add notebook for oasst2 generation --- .pre-commit-config.yaml | 2 +- oasst-data/examples/clean_dataset.py | 26 +- oasst-data/examples/tree_to_messages.py | 8 + oasst-data/generate_oasst2.ipynb | 783 ++++++++++++++++++++++++ 4 files changed, 812 insertions(+), 7 deletions(-) create mode 100644 oasst-data/generate_oasst2.ipynb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 979e8466bc..ebf58cb04d 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -26,7 +26,7 @@ # # /WARNING! -exclude: build|stubs|^bot/templates/$|openassistant/templates|docs/docs/api/openapi.json|scripts/postprocessing/regex_pii_detector.py +exclude: build|stubs|^bot/templates/$|openassistant/templates|docs/docs/api/openapi.json|scripts/postprocessing/regex_pii_detector.py|oasst-data/examples/clean_dataset.py|oasst-data/examples/tree_to_messages.py default_language_version: python: python3 diff --git a/oasst-data/examples/clean_dataset.py b/oasst-data/examples/clean_dataset.py index 72b1ed64ce..28492d7db7 100644 --- a/oasst-data/examples/clean_dataset.py +++ b/oasst-data/examples/clean_dataset.py @@ -1,7 +1,10 @@ """ Example usage: - python clean_dataset.py "C:/Users/andre/Downloads/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl" "C:/Users/andre/Downloads/tmp.jsonl" --instructions "C:/Users/andre/Downloads/instructions.xlsx" + python clean_dataset.py / + "2023-11-05_oasst_all.jsonl" / + "2023-11-05_oasst_all.clean.jsonl" / + --instructions "instructions.xlsx" """ import argparse from collections import OrderedDict @@ -65,26 +68,32 @@ def delete_message(msg: ExportMessageNode): else: parent_msg = message_by_id[msg.parent_id] parent_msg.replies.remove(msg) - print( - f"Branch deleted: {msg.message_id} ({count_descendants(msg)} messages)" - ) + print(f"Branch deleted: {msg.message_id} ({count_descendants(msg)} messages)") # cleaning print("Cleaning...") for index, row in instructions_df.iterrows(): id = row["UUID"] + print(f"Cleaning id={id}") msg = message_by_id.get(id) if msg is None: print(f"Not found: {id}") + print(f"Skipping instructions for : {id}") + continue action = row["Action"] + print(f"Action={action}") + + # Delete if action == "Delete": print(f"deleting: {id}") delete_message(msg) + # Replace elif action == "Replace": print(f"replace: {id}") replace = row["Replace"] msg.text = replace + # Edit elif action == "Edit": print(f"edit: {id}") if row["Category"] == "Copy Code": @@ -93,8 +102,13 @@ def delete_message(msg: ExportMessageNode): else: find = row["Find"] replace = row["Replace"] - msg.text.index(find) # make sure text is present - msg.text = msg.text.replace(find, replace) + try: + msg.text.index(find) # make sure text is present + msg.text = msg.text.replace(find, replace) + except ValueError as e: + print(e) + # print(f"find not found: {find}") + continue else: print(f"Unsupported action {action}") diff --git a/oasst-data/examples/tree_to_messages.py b/oasst-data/examples/tree_to_messages.py index fe4a8d42f9..924a329dd2 100644 --- a/oasst-data/examples/tree_to_messages.py +++ b/oasst-data/examples/tree_to_messages.py @@ -1,3 +1,11 @@ +""" +Example usage: + + python tree_to_messages.py / + "2023-11-05_oasst_all.jsonl" / + "2023-11-05_oasst_all.messages.jsonl" +""" + import argparse from oasst_data import ExportMessageNode, read_message_trees, visit_messages_depth_first, write_messages diff --git a/oasst-data/generate_oasst2.ipynb b/oasst-data/generate_oasst2.ipynb new file mode 100644 index 0000000000..be7bdc27a3 --- /dev/null +++ b/oasst-data/generate_oasst2.ipynb @@ -0,0 +1,783 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# inputs\n", + "data_dir = \"C:/Users/andre/Downloads\"\n", + "data_out_dir = f\"{data_dir}/oasst2\"\n", + "raw_input_data_path = f\"{data_dir}/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl\"\n", + "instructions_path = f\"{data_dir}/instructions.xlsx\"\n", + "trees_filename = f\"2023-11-05_oasst_all.trees.jsonl\"\n", + "messages_filename = f\"2023-11-05_oasst_all.messages.jsonl\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# make data_out_dir if it doesn't exist\n", + "if not os.path.exists(data_out_dir):\n", + " os.makedirs(data_out_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading: C:/Users/andre/Downloads/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl\n", + "Loaded 70673 trees with 208686 messages.\n", + "Cleaning...\n", + "Cleaning id=36be40c8-2451-4b92-99b9-97f425f6955b\n", + "Action=Edit\n", + "edit: 36be40c8-2451-4b92-99b9-97f425f6955b\n", + "Cleaning id=e6933f01-4183-45bf-892c-31bdf778eee0\n", + "Action=Delete\n", + "deleting: e6933f01-4183-45bf-892c-31bdf778eee0\n", + "Tree deleted: e6933f01-4183-45bf-892c-31bdf778eee0\n", + "Cleaning id=449a995f-29a4-4d04-aa56-2dab1747a417\n", + "Action=Delete\n", + "deleting: 449a995f-29a4-4d04-aa56-2dab1747a417\n", + "Tree deleted: 449a995f-29a4-4d04-aa56-2dab1747a417\n", + "Cleaning id=4b6c83d4-b6c1-452e-b57a-a09bec46a887\n", + "Action=Delete\n", + "deleting: 4b6c83d4-b6c1-452e-b57a-a09bec46a887\n", + "Tree deleted: 4b6c83d4-b6c1-452e-b57a-a09bec46a887\n", + "Cleaning id=af60f432-7fb2-4f63-b73a-6092a35bf21b\n", + "Action=Delete\n", + "deleting: af60f432-7fb2-4f63-b73a-6092a35bf21b\n", + "Tree deleted: af60f432-7fb2-4f63-b73a-6092a35bf21b\n", + "Cleaning id=5ab3a8bd-bf74-4331-9564-c7e9a663de8e\n", + "Action=Edit\n", + "edit: 5ab3a8bd-bf74-4331-9564-c7e9a663de8e\n", + "Cleaning id=1c001f37-ae16-49fd-9877-48ffcc8091c4\n", + "Action=Edit\n", + "edit: 1c001f37-ae16-49fd-9877-48ffcc8091c4\n", + "Cleaning id=4fedd950-75b6-44f7-a908-3eac7d6b6056\n", + "Action=Delete\n", + "deleting: 4fedd950-75b6-44f7-a908-3eac7d6b6056\n", + "Tree deleted: 4fedd950-75b6-44f7-a908-3eac7d6b6056\n", + "Cleaning id=15b74473-55d4-41de-bdba-6aef17b08cc2\n", + "Action=Delete\n", + "deleting: 15b74473-55d4-41de-bdba-6aef17b08cc2\n", + "Tree deleted: 15b74473-55d4-41de-bdba-6aef17b08cc2\n", + "Cleaning id=e921db26-b14e-4a98-b854-ee8c07fef2d7\n", + "Action=Delete\n", + "deleting: e921db26-b14e-4a98-b854-ee8c07fef2d7\n", + "Tree deleted: e921db26-b14e-4a98-b854-ee8c07fef2d7\n", + "Cleaning id=65a641e2-bfd2-4481-b5fa-5ea4bffb7867\n", + "Action=Delete\n", + "deleting: 65a641e2-bfd2-4481-b5fa-5ea4bffb7867\n", + "Tree deleted: 65a641e2-bfd2-4481-b5fa-5ea4bffb7867\n", + "Cleaning id=ea20c005-fd42-4b1f-9089-f0545d4f379c\n", + "Action=Replace\n", + "replace: ea20c005-fd42-4b1f-9089-f0545d4f379c\n", + "Cleaning id=10dd0c91-f372-423b-96f5-045ca304166f\n", + "Action=Replace\n", + "replace: 10dd0c91-f372-423b-96f5-045ca304166f\n", + "Cleaning id=be6bb6d1-60ee-4418-ba29-5d61ba979935\n", + "Action=Delete\n", + "deleting: be6bb6d1-60ee-4418-ba29-5d61ba979935\n", + "Tree deleted: be6bb6d1-60ee-4418-ba29-5d61ba979935\n", + "Cleaning id=29bc510e-bba5-45a5-9005-d091f95351fb\n", + "Action=Delete\n", + "deleting: 29bc510e-bba5-45a5-9005-d091f95351fb\n", + "Tree deleted: 29bc510e-bba5-45a5-9005-d091f95351fb\n", + "Cleaning id=b60c32ae-2cc3-4896-9f92-347b5e65bce7\n", + "Action=Delete\n", + "deleting: b60c32ae-2cc3-4896-9f92-347b5e65bce7\n", + "Tree deleted: b60c32ae-2cc3-4896-9f92-347b5e65bce7\n", + "Cleaning id=18b6e6e2-f392-4bd4-9d12-ccd706dccbb6\n", + "Action=Delete\n", + "deleting: 18b6e6e2-f392-4bd4-9d12-ccd706dccbb6\n", + "Tree deleted: 18b6e6e2-f392-4bd4-9d12-ccd706dccbb6\n", + "Cleaning id=89141477-b899-4357-ae7f-50c2d33a119b\n", + "Action=Delete\n", + "deleting: 89141477-b899-4357-ae7f-50c2d33a119b\n", + "Tree deleted: 89141477-b899-4357-ae7f-50c2d33a119b\n", + "Cleaning id=ba17e6e7-bde1-41b3-99c8-e5b537f5abbe\n", + "Action=Delete\n", + "deleting: ba17e6e7-bde1-41b3-99c8-e5b537f5abbe\n", + "Tree deleted: ba17e6e7-bde1-41b3-99c8-e5b537f5abbe\n", + "Cleaning id=7e93d535-dd90-48a2-beeb-b71917912e90\n", + "Action=Delete\n", + "deleting: 7e93d535-dd90-48a2-beeb-b71917912e90\n", + "Tree deleted: 7e93d535-dd90-48a2-beeb-b71917912e90\n", + "Cleaning id=ede2cfed-22d2-421d-b218-adc7402f8a5a\n", + "Action=Delete\n", + "deleting: ede2cfed-22d2-421d-b218-adc7402f8a5a\n", + "Tree deleted: ede2cfed-22d2-421d-b218-adc7402f8a5a\n", + "Cleaning id=64bfc00b-9c53-4a5d-8187-91cd2c581e78\n", + "Action=Delete\n", + "deleting: 64bfc00b-9c53-4a5d-8187-91cd2c581e78\n", + "Tree deleted: 64bfc00b-9c53-4a5d-8187-91cd2c581e78\n", + "Cleaning id=80904428-577f-4cc1-95f7-c8383b093e49\n", + "Action=Delete\n", + "deleting: 80904428-577f-4cc1-95f7-c8383b093e49\n", + "Tree deleted: 80904428-577f-4cc1-95f7-c8383b093e49\n", + "Cleaning id=89a37680-a7b6-4437-bd24-ecfe8395eb29\n", + "Action=Delete\n", + "deleting: 89a37680-a7b6-4437-bd24-ecfe8395eb29\n", + "Branch deleted: 89a37680-a7b6-4437-bd24-ecfe8395eb29 (1 messages)\n", + "Cleaning id=549a7cf2-f6bc-4b13-855f-601c714bb033\n", + "Action=Delete\n", + "deleting: 549a7cf2-f6bc-4b13-855f-601c714bb033\n", + "Tree deleted: 549a7cf2-f6bc-4b13-855f-601c714bb033\n", + "Cleaning id=e5823ee5-ffd3-4604-8973-325aa1f3b2f1\n", + "Action=Delete\n", + "deleting: e5823ee5-ffd3-4604-8973-325aa1f3b2f1\n", + "Tree deleted: e5823ee5-ffd3-4604-8973-325aa1f3b2f1\n", + "Cleaning id=6b665310-180c-4a4b-95e0-b4cc4a0ff501\n", + "Action=Delete\n", + "deleting: 6b665310-180c-4a4b-95e0-b4cc4a0ff501\n", + "Branch deleted: 6b665310-180c-4a4b-95e0-b4cc4a0ff501 (1 messages)\n", + "Cleaning id=12bdba1b-e7be-432e-8619-4428e98dc144\n", + "Action=Delete\n", + "deleting: 12bdba1b-e7be-432e-8619-4428e98dc144\n", + "Branch deleted: 12bdba1b-e7be-432e-8619-4428e98dc144 (1 messages)\n", + "Cleaning id=4b0a0f5e-8e75-4917-b9de-5b7f0f651b2f\n", + "Action=Delete\n", + "deleting: 4b0a0f5e-8e75-4917-b9de-5b7f0f651b2f\n", + "Branch deleted: 4b0a0f5e-8e75-4917-b9de-5b7f0f651b2f (1 messages)\n", + "Cleaning id=5719d9f1-1cb7-4e83-bd36-d9728cefe2a3\n", + "Action=Delete\n", + "deleting: 5719d9f1-1cb7-4e83-bd36-d9728cefe2a3\n", + "Branch deleted: 5719d9f1-1cb7-4e83-bd36-d9728cefe2a3 (1 messages)\n", + "Cleaning id=b38fbad4-b43f-42f2-8561-1523165939a9\n", + "Action=Delete\n", + "deleting: b38fbad4-b43f-42f2-8561-1523165939a9\n", + "Branch deleted: b38fbad4-b43f-42f2-8561-1523165939a9 (1 messages)\n", + "Cleaning id=f3b49fe6-1ce5-44ae-b78f-95b4572a161d\n", + "Action=Delete\n", + "deleting: f3b49fe6-1ce5-44ae-b78f-95b4572a161d\n", + "Branch deleted: f3b49fe6-1ce5-44ae-b78f-95b4572a161d (1 messages)\n", + "Cleaning id=7dac84b7-407b-4b8a-8b77-ef384b2c117e\n", + "Action=Delete\n", + "deleting: 7dac84b7-407b-4b8a-8b77-ef384b2c117e\n", + "Branch deleted: 7dac84b7-407b-4b8a-8b77-ef384b2c117e (1 messages)\n", + "Cleaning id=3de67512-a8bd-4c0b-8b3a-c868fb868ee4\n", + "Action=Delete\n", + "deleting: 3de67512-a8bd-4c0b-8b3a-c868fb868ee4\n", + "Branch deleted: 3de67512-a8bd-4c0b-8b3a-c868fb868ee4 (1 messages)\n", + "Cleaning id=7effedf7-4857-4d60-b690-34814d9c9086\n", + "Action=Delete\n", + "deleting: 7effedf7-4857-4d60-b690-34814d9c9086\n", + "Branch deleted: 7effedf7-4857-4d60-b690-34814d9c9086 (3 messages)\n", + "Cleaning id=496ee685-a186-4498-ab9f-71b7602b9594\n", + "Action=Delete\n", + "deleting: 496ee685-a186-4498-ab9f-71b7602b9594\n", + "Branch deleted: 496ee685-a186-4498-ab9f-71b7602b9594 (1 messages)\n", + "Cleaning id=0a4d14a7-6831-4fa2-b04e-ff1186651200\n", + "Action=Delete\n", + "deleting: 0a4d14a7-6831-4fa2-b04e-ff1186651200\n", + "Branch deleted: 0a4d14a7-6831-4fa2-b04e-ff1186651200 (1 messages)\n", + "Cleaning id=0ed9a702-199c-4fc9-9dcc-1343e4140f88\n", + "Action=Delete\n", + "deleting: 0ed9a702-199c-4fc9-9dcc-1343e4140f88\n", + "Branch deleted: 0ed9a702-199c-4fc9-9dcc-1343e4140f88 (1 messages)\n", + "Cleaning id=803b00c5-cd83-41fd-a3f5-8c88220bfdfe\n", + "Action=Delete\n", + "deleting: 803b00c5-cd83-41fd-a3f5-8c88220bfdfe\n", + "Branch deleted: 803b00c5-cd83-41fd-a3f5-8c88220bfdfe (1 messages)\n", + "Cleaning id=eb2b4c9b-c040-41a8-a67c-dac68c020b0a\n", + "Action=Delete\n", + "deleting: eb2b4c9b-c040-41a8-a67c-dac68c020b0a\n", + "Tree deleted: eb2b4c9b-c040-41a8-a67c-dac68c020b0a\n", + "Cleaning id=61e0411e-27d4-4778-80f0-2501b1a36786\n", + "Action=Delete\n", + "deleting: 61e0411e-27d4-4778-80f0-2501b1a36786\n", + "Tree deleted: 61e0411e-27d4-4778-80f0-2501b1a36786\n", + "Cleaning id=cc193de3-0ccc-4844-b2fc-8b67d0cbf89c\n", + "Action=Delete\n", + "deleting: cc193de3-0ccc-4844-b2fc-8b67d0cbf89c\n", + "Branch deleted: cc193de3-0ccc-4844-b2fc-8b67d0cbf89c (1 messages)\n", + "Cleaning id=994141fc-e7b6-462f-9b64-305379038be1\n", + "Action=Delete\n", + "deleting: 994141fc-e7b6-462f-9b64-305379038be1\n", + "Tree deleted: 994141fc-e7b6-462f-9b64-305379038be1\n", + "Cleaning id=f3949958-ced8-4ed8-9de5-2c50e4296c1e\n", + "Action=Delete\n", + "deleting: f3949958-ced8-4ed8-9de5-2c50e4296c1e\n", + "Branch deleted: f3949958-ced8-4ed8-9de5-2c50e4296c1e (1 messages)\n", + "Cleaning id=bf12f013-a961-49eb-99a4-9f520f52bd52\n", + "Action=Delete\n", + "deleting: bf12f013-a961-49eb-99a4-9f520f52bd52\n", + "Branch deleted: bf12f013-a961-49eb-99a4-9f520f52bd52 (1 messages)\n", + "Cleaning id=59f41793-7f3a-480f-819f-b581faae1e7a\n", + "Action=Edit\n", + "edit: 59f41793-7f3a-480f-819f-b581faae1e7a\n", + "Cleaning id=a1f43d47-f0f2-42d3-804c-605714cc56f0\n", + "Action=Delete\n", + "deleting: a1f43d47-f0f2-42d3-804c-605714cc56f0\n", + "Branch deleted: a1f43d47-f0f2-42d3-804c-605714cc56f0 (1 messages)\n", + "Cleaning id=2c0b821c-2525-4c69-8480-44aaea186d14\n", + "Action=Delete\n", + "deleting: 2c0b821c-2525-4c69-8480-44aaea186d14\n", + "Tree deleted: 2c0b821c-2525-4c69-8480-44aaea186d14\n", + "Cleaning id=7695f9be-4196-4c74-918f-984ae75f0f94\n", + "Action=Delete\n", + "deleting: 7695f9be-4196-4c74-918f-984ae75f0f94\n", + "Tree deleted: 7695f9be-4196-4c74-918f-984ae75f0f94\n", + "Cleaning id=f70cd3b7-6df9-40fe-a73b-13528f3cf49b\n", + "Action=Delete\n", + "deleting: f70cd3b7-6df9-40fe-a73b-13528f3cf49b\n", + "Tree deleted: f70cd3b7-6df9-40fe-a73b-13528f3cf49b\n", + "Cleaning id=ed80741a-1e3b-4a86-879e-51369e18e796\n", + "Action=Delete\n", + "deleting: ed80741a-1e3b-4a86-879e-51369e18e796\n", + "Branch deleted: ed80741a-1e3b-4a86-879e-51369e18e796 (1 messages)\n", + "Cleaning id=1b36018f-7f42-49fe-8e83-b0af4b29a04e\n", + "Not found: 1b36018f-7f42-49fe-8e83-b0af4b29a04e\n", + "Skipping instructions for : 1b36018f-7f42-49fe-8e83-b0af4b29a04e\n", + "Cleaning id=52747b24-82dc-42f0-b764-f689e885b50b\n", + "Action=Delete\n", + "deleting: 52747b24-82dc-42f0-b764-f689e885b50b\n", + "Tree deleted: 52747b24-82dc-42f0-b764-f689e885b50b\n", + "Cleaning id=419354d2-2f8e-4d14-bd87-1c27b9253fea\n", + "Action=Delete\n", + "deleting: 419354d2-2f8e-4d14-bd87-1c27b9253fea\n", + "Branch deleted: 419354d2-2f8e-4d14-bd87-1c27b9253fea (1 messages)\n", + "Cleaning id=22dfa8a2-2776-47b0-972d-259b63597865\n", + "Action=Delete\n", + "deleting: 22dfa8a2-2776-47b0-972d-259b63597865\n", + "Tree deleted: 22dfa8a2-2776-47b0-972d-259b63597865\n", + "Cleaning id=2840a94b-ecfe-4281-8210-a866e63ee14b\n", + "Action=Delete\n", + "deleting: 2840a94b-ecfe-4281-8210-a866e63ee14b\n", + "Tree deleted: 2840a94b-ecfe-4281-8210-a866e63ee14b\n", + "Cleaning id=1c9fe3c7-19a6-4e7a-ba31-a7cacd8db4d0\n", + "Action=Delete\n", + "deleting: 1c9fe3c7-19a6-4e7a-ba31-a7cacd8db4d0\n", + "Branch deleted: 1c9fe3c7-19a6-4e7a-ba31-a7cacd8db4d0 (1 messages)\n", + "Cleaning id=36b09cbd-d369-47a0-813a-c5f124e18e38\n", + "Action=Delete\n", + "deleting: 36b09cbd-d369-47a0-813a-c5f124e18e38\n", + "Branch deleted: 36b09cbd-d369-47a0-813a-c5f124e18e38 (1 messages)\n", + "Cleaning id=cb551444-a1ab-4eb7-b0b2-d68c15da6d4d\n", + "Action=Delete\n", + "deleting: cb551444-a1ab-4eb7-b0b2-d68c15da6d4d\n", + "Branch deleted: cb551444-a1ab-4eb7-b0b2-d68c15da6d4d (1 messages)\n", + "Cleaning id=eb1d86db-4f14-4157-90d1-3d2f2e9940df\n", + "Action=Delete\n", + "deleting: eb1d86db-4f14-4157-90d1-3d2f2e9940df\n", + "Branch deleted: eb1d86db-4f14-4157-90d1-3d2f2e9940df (1 messages)\n", + "Cleaning id=b0f0b044-ecec-4222-ac17-d5c9cd954e11\n", + "Action=Delete\n", + "deleting: b0f0b044-ecec-4222-ac17-d5c9cd954e11\n", + "Branch deleted: b0f0b044-ecec-4222-ac17-d5c9cd954e11 (2 messages)\n", + "Cleaning id=29458fc2-5359-48c9-b542-cce9cb92da93\n", + "Action=Delete\n", + "deleting: 29458fc2-5359-48c9-b542-cce9cb92da93\n", + "Branch deleted: 29458fc2-5359-48c9-b542-cce9cb92da93 (1 messages)\n", + "Cleaning id=39a7a2e4-ca84-4c41-b229-891117eb54fb\n", + "Action=Delete\n", + "deleting: 39a7a2e4-ca84-4c41-b229-891117eb54fb\n", + "Branch deleted: 39a7a2e4-ca84-4c41-b229-891117eb54fb (1 messages)\n", + "Cleaning id=d757fdd4-5748-4a02-8924-6dfb5583596d\n", + "Not found: d757fdd4-5748-4a02-8924-6dfb5583596d\n", + "Skipping instructions for : d757fdd4-5748-4a02-8924-6dfb5583596d\n", + "Cleaning id=66ddd46c-299f-4394-b0d5-51ab66ca06bd\n", + "Not found: 66ddd46c-299f-4394-b0d5-51ab66ca06bd\n", + "Skipping instructions for : 66ddd46c-299f-4394-b0d5-51ab66ca06bd\n", + "Cleaning id=94da7786-44b5-4ca9-9414-fd4f85ae6ca5\n", + "Action=Delete\n", + "deleting: 94da7786-44b5-4ca9-9414-fd4f85ae6ca5\n", + "Branch deleted: 94da7786-44b5-4ca9-9414-fd4f85ae6ca5 (2 messages)\n", + "Cleaning id=f52c527e-b8d9-4948-87c7-12f40e5f4c18\n", + "Action=Delete\n", + "deleting: f52c527e-b8d9-4948-87c7-12f40e5f4c18\n", + "Branch deleted: f52c527e-b8d9-4948-87c7-12f40e5f4c18 (2 messages)\n", + "Cleaning id=3bd4b565-e359-49ea-ae35-9ef4918a3454\n", + "Action=Delete\n", + "deleting: 3bd4b565-e359-49ea-ae35-9ef4918a3454\n", + "Branch deleted: 3bd4b565-e359-49ea-ae35-9ef4918a3454 (1 messages)\n", + "Cleaning id=f01e097d-774f-43ab-b9cf-ad1c817cbb71\n", + "Action=Delete\n", + "deleting: f01e097d-774f-43ab-b9cf-ad1c817cbb71\n", + "Branch deleted: f01e097d-774f-43ab-b9cf-ad1c817cbb71 (1 messages)\n", + "Cleaning id=ccf8373b-a8cf-49be-b612-7336abf1394c\n", + "Action=Delete\n", + "deleting: ccf8373b-a8cf-49be-b612-7336abf1394c\n", + "Branch deleted: ccf8373b-a8cf-49be-b612-7336abf1394c (1 messages)\n", + "Cleaning id=3c419703-2097-4946-8901-c66d4e2c0ef7\n", + "Action=Delete\n", + "deleting: 3c419703-2097-4946-8901-c66d4e2c0ef7\n", + "Branch deleted: 3c419703-2097-4946-8901-c66d4e2c0ef7 (3 messages)\n", + "Cleaning id=31a5583c-874e-402b-970c-ba2d9a4dcaae\n", + "Action=Delete\n", + "deleting: 31a5583c-874e-402b-970c-ba2d9a4dcaae\n", + "Branch deleted: 31a5583c-874e-402b-970c-ba2d9a4dcaae (2 messages)\n", + "Cleaning id=d2fd731c-23cd-4b2f-b509-cf669f221756\n", + "Action=Delete\n", + "deleting: d2fd731c-23cd-4b2f-b509-cf669f221756\n", + "Branch deleted: d2fd731c-23cd-4b2f-b509-cf669f221756 (1 messages)\n", + "Cleaning id=c812f202-fff3-4c91-8d69-11c8b2658c0f\n", + "Not found: c812f202-fff3-4c91-8d69-11c8b2658c0f\n", + "Skipping instructions for : c812f202-fff3-4c91-8d69-11c8b2658c0f\n", + "Cleaning id=6c49c918-9a73-41ca-9dfb-74c0685e861a\n", + "Action=Delete\n", + "deleting: 6c49c918-9a73-41ca-9dfb-74c0685e861a\n", + "Tree deleted: 6c49c918-9a73-41ca-9dfb-74c0685e861a\n", + "Cleaning id=cf659a23-c5aa-494c-bb72-c08ff3cd9f93\n", + "Action=Delete\n", + "deleting: cf659a23-c5aa-494c-bb72-c08ff3cd9f93\n", + "Branch deleted: cf659a23-c5aa-494c-bb72-c08ff3cd9f93 (1 messages)\n", + "Cleaning id=753de123-8213-4f98-90ce-f6137f083db2\n", + "Action=Delete\n", + "deleting: 753de123-8213-4f98-90ce-f6137f083db2\n", + "Tree deleted: 753de123-8213-4f98-90ce-f6137f083db2\n", + "Cleaning id=3311a64f-3d64-40a0-8647-edb12616225e\n", + "Action=Delete\n", + "deleting: 3311a64f-3d64-40a0-8647-edb12616225e\n", + "Branch deleted: 3311a64f-3d64-40a0-8647-edb12616225e (1 messages)\n", + "Cleaning id=05c3893d-9c74-4618-b690-360317677d3f\n", + "Action=Delete\n", + "deleting: 05c3893d-9c74-4618-b690-360317677d3f\n", + "Branch deleted: 05c3893d-9c74-4618-b690-360317677d3f (1 messages)\n", + "Cleaning id=15f4aeab-de6c-43a0-9530-33c99e8386ff\n", + "Action=Delete\n", + "deleting: 15f4aeab-de6c-43a0-9530-33c99e8386ff\n", + "Branch deleted: 15f4aeab-de6c-43a0-9530-33c99e8386ff (1 messages)\n", + "Cleaning id=29e4ae6c-bbb6-4e07-8fc9-c400f61cbc12\n", + "Action=Delete\n", + "deleting: 29e4ae6c-bbb6-4e07-8fc9-c400f61cbc12\n", + "Branch deleted: 29e4ae6c-bbb6-4e07-8fc9-c400f61cbc12 (1 messages)\n", + "Cleaning id=ca34580a-e5df-4064-aac2-9eac4099e7ce\n", + "Action=Delete\n", + "deleting: ca34580a-e5df-4064-aac2-9eac4099e7ce\n", + "Branch deleted: ca34580a-e5df-4064-aac2-9eac4099e7ce (1 messages)\n", + "Cleaning id=4fba85a6-17b9-4f7a-b18c-f6e52f772700\n", + "Action=Delete\n", + "deleting: 4fba85a6-17b9-4f7a-b18c-f6e52f772700\n", + "Branch deleted: 4fba85a6-17b9-4f7a-b18c-f6e52f772700 (1 messages)\n", + "Cleaning id=2428fc50-d942-41e9-ac43-5252f7519485\n", + "Action=Delete\n", + "deleting: 2428fc50-d942-41e9-ac43-5252f7519485\n", + "Branch deleted: 2428fc50-d942-41e9-ac43-5252f7519485 (1 messages)\n", + "Cleaning id=f5482168-3fbc-4a15-a6e0-8660cea70f37\n", + "Action=Edit\n", + "edit: f5482168-3fbc-4a15-a6e0-8660cea70f37\n", + "substring not found\n", + "Cleaning id=9b48bb3f-94d4-4fa8-91ed-64683e63206c\n", + "Action=Edit\n", + "edit: 9b48bb3f-94d4-4fa8-91ed-64683e63206c\n", + "substring not found\n", + "Cleaning id=68c80cb8-9998-4ca3-a4f1-3e244a7aac2a\n", + "Action=Edit\n", + "edit: 68c80cb8-9998-4ca3-a4f1-3e244a7aac2a\n", + "Cleaning id=833b1b12-574a-4662-b1c6-33d0202a4a00\n", + "Action=Edit\n", + "edit: 833b1b12-574a-4662-b1c6-33d0202a4a00\n", + "Cleaning id=02702885-6c2e-4e49-8f92-f766c2a3b940\n", + "Not found: 02702885-6c2e-4e49-8f92-f766c2a3b940\n", + "Skipping instructions for : 02702885-6c2e-4e49-8f92-f766c2a3b940\n", + "Cleaning id=6a0823fe-4302-44e0-8a19-8bdc91a8ac7e\n", + "Not found: 6a0823fe-4302-44e0-8a19-8bdc91a8ac7e\n", + "Skipping instructions for : 6a0823fe-4302-44e0-8a19-8bdc91a8ac7e\n", + "Cleaning id=f4dd77d4-24dd-4ab8-a568-9d0edca5fe17\n", + "Not found: f4dd77d4-24dd-4ab8-a568-9d0edca5fe17\n", + "Skipping instructions for : f4dd77d4-24dd-4ab8-a568-9d0edca5fe17\n", + "Cleaning id=8ad4358b-9dd4-4f35-b3e2-17af38abc0d4\n", + "Action=Edit\n", + "edit: 8ad4358b-9dd4-4f35-b3e2-17af38abc0d4\n", + "Cleaning id=68a72d7a-e42e-4e5f-b22f-9964f0463cc4\n", + "Action=Edit\n", + "edit: 68a72d7a-e42e-4e5f-b22f-9964f0463cc4\n", + "substring not found\n", + "Cleaning id=ec357197-0ee8-4acd-8072-29aa8e76292e\n", + "Action=Edit\n", + "edit: ec357197-0ee8-4acd-8072-29aa8e76292e\n", + "substring not found\n", + "Cleaning id=edbecc63-47e4-4815-8744-5ad69f93bb33\n", + "Action=Edit\n", + "edit: edbecc63-47e4-4815-8744-5ad69f93bb33\n", + "substring not found\n", + "Cleaning id=5fb75daf-1926-47a2-85e8-f80c00ba7d03\n", + "Action=Edit\n", + "edit: 5fb75daf-1926-47a2-85e8-f80c00ba7d03\n", + "substring not found\n", + "Cleaning id=776f420b-6020-4f77-8b72-d48b580a0755\n", + "Action=Edit\n", + "edit: 776f420b-6020-4f77-8b72-d48b580a0755\n", + "substring not found\n", + "Cleaning id=4e97e7f6-cd85-4d8d-b73c-438faa23dd95\n", + "Action=Edit\n", + "edit: 4e97e7f6-cd85-4d8d-b73c-438faa23dd95\n", + "Cleaning id=fc010f62-4ebf-46cd-893f-180cde59f0f5\n", + "Action=Edit\n", + "edit: fc010f62-4ebf-46cd-893f-180cde59f0f5\n", + "Cleaning id=118f9a1c-f976-4120-8ff9-934b22545b0d\n", + "Action=Edit\n", + "edit: 118f9a1c-f976-4120-8ff9-934b22545b0d\n", + "substring not found\n", + "Cleaning id=8f70327e-1239-4564-a938-b9649465f14a\n", + "Action=Edit\n", + "edit: 8f70327e-1239-4564-a938-b9649465f14a\n", + "Cleaning id=b0ad34ae-4080-44bf-939f-7ece554fe9bb\n", + "Action=Edit\n", + "edit: b0ad34ae-4080-44bf-939f-7ece554fe9bb\n", + "Cleaning id=7800454d-b340-49d8-8d52-bd26f7c550e6\n", + "Action=Edit\n", + "edit: 7800454d-b340-49d8-8d52-bd26f7c550e6\n", + "Cleaning id=56de418b-c125-4d26-9d77-7db05d548faa\n", + "Action=Edit\n", + "edit: 56de418b-c125-4d26-9d77-7db05d548faa\n", + "Cleaning id=8b7c85bf-da3c-40e1-b3f8-b7fcd10424b6\n", + "Action=Edit\n", + "edit: 8b7c85bf-da3c-40e1-b3f8-b7fcd10424b6\n", + "Cleaning id=8d0d0f03-2fd0-4236-a872-3826cc9d36d6\n", + "Action=Edit\n", + "edit: 8d0d0f03-2fd0-4236-a872-3826cc9d36d6\n", + "Cleaning id=976893a1-416c-448d-84e4-7ee0311b6809\n", + "Action=Edit\n", + "edit: 976893a1-416c-448d-84e4-7ee0311b6809\n", + "substring not found\n", + "Cleaning id=34e3d882-9be9-44a7-859e-78a828efa0f8\n", + "Action=Edit\n", + "edit: 34e3d882-9be9-44a7-859e-78a828efa0f8\n", + "substring not found\n", + "Cleaning id=f0d3b169-1e2a-43d7-83ce-a792304e1de0\n", + "Action=Edit\n", + "edit: f0d3b169-1e2a-43d7-83ce-a792304e1de0\n", + "Cleaning id=2528f101-fbe9-4907-a44a-783e74e47aa6\n", + "Action=Edit\n", + "edit: 2528f101-fbe9-4907-a44a-783e74e47aa6\n", + "Cleaning id=84418163-5721-4653-9730-c483dd7b563e\n", + "Action=Edit\n", + "edit: 84418163-5721-4653-9730-c483dd7b563e\n", + "Cleaning id=7953dff8-09ec-4372-a1dc-f7c8a2ae6053\n", + "Action=Edit\n", + "edit: 7953dff8-09ec-4372-a1dc-f7c8a2ae6053\n", + "Cleaning id=cfa6b4b7-406a-4990-81b0-a2169cbed8d3\n", + "Action=Edit\n", + "edit: cfa6b4b7-406a-4990-81b0-a2169cbed8d3\n", + "Cleaning id=bfbf6135-3818-4d28-ac8e-cd9946bc72f3\n", + "Action=Edit\n", + "edit: bfbf6135-3818-4d28-ac8e-cd9946bc72f3\n", + "Cleaning id=ce81ca69-5a48-43eb-8df7-580a3d68578d\n", + "Action=Edit\n", + "edit: ce81ca69-5a48-43eb-8df7-580a3d68578d\n", + "Cleaning id=647add48-fbc9-4ac6-9930-1901da34520a\n", + "Action=Edit\n", + "edit: 647add48-fbc9-4ac6-9930-1901da34520a\n", + "Cleaning id=24545bf7-949d-446b-9b68-2553b2392357\n", + "Action=Edit\n", + "edit: 24545bf7-949d-446b-9b68-2553b2392357\n", + "Cleaning id=b15d9839-8e4a-4d7b-8b91-634eb1b37376\n", + "Action=Edit\n", + "edit: b15d9839-8e4a-4d7b-8b91-634eb1b37376\n", + "Cleaning id=f4e9c4d0-8686-451a-bc90-8dd7a5a51fbe\n", + "Action=Edit\n", + "edit: f4e9c4d0-8686-451a-bc90-8dd7a5a51fbe\n", + "Cleaning id=bea22011-334a-4e34-8c11-b1c1566de59c\n", + "Action=Edit\n", + "edit: bea22011-334a-4e34-8c11-b1c1566de59c\n", + "Cleaning id=756cdbe4-e04e-46bf-a049-e375e5100653\n", + "Action=Edit\n", + "edit: 756cdbe4-e04e-46bf-a049-e375e5100653\n", + "Cleaning id=b1aee184-1ec4-45dc-9a9c-515de51f636f\n", + "Action=Edit\n", + "edit: b1aee184-1ec4-45dc-9a9c-515de51f636f\n", + "Cleaning id=4f5227cc-1d36-4e19-b78f-d08a60dc2141\n", + "Action=Edit\n", + "edit: 4f5227cc-1d36-4e19-b78f-d08a60dc2141\n", + "Cleaning id=14d1d5e8-61a2-4e0b-898c-f4b2c38236c2\n", + "Action=Edit\n", + "edit: 14d1d5e8-61a2-4e0b-898c-f4b2c38236c2\n", + "substring not found\n", + "Cleaning id=a275451b-6674-467b-b7cf-539cfec31f64\n", + "Action=Edit\n", + "edit: a275451b-6674-467b-b7cf-539cfec31f64\n", + "Cleaning id=58adf822-cd49-4c18-b72e-526d9473bf6b\n", + "Action=Edit\n", + "edit: 58adf822-cd49-4c18-b72e-526d9473bf6b\n", + "Cleaning id=ae9d3f55-9f6f-41ea-a0bc-0f0eea162cb9\n", + "Action=Edit\n", + "edit: ae9d3f55-9f6f-41ea-a0bc-0f0eea162cb9\n", + "Cleaning id=1aed98d8-de97-449d-8f10-5ee506bd3ae4\n", + "Action=Edit\n", + "edit: 1aed98d8-de97-449d-8f10-5ee506bd3ae4\n", + "Cleaning id=d71ad38d-3bdd-4eed-8d19-526156f545a2\n", + "Action=Edit\n", + "edit: d71ad38d-3bdd-4eed-8d19-526156f545a2\n", + "Cleaning id=ae59b5aa-39cb-4ba0-8f05-8ae516ec9b3f\n", + "Action=Edit\n", + "edit: ae59b5aa-39cb-4ba0-8f05-8ae516ec9b3f\n", + "Cleaning id=7b0b2599-294f-4d4b-be67-724f6e17280b\n", + "Action=Edit\n", + "edit: 7b0b2599-294f-4d4b-be67-724f6e17280b\n", + "Cleaning id=6fc61c3c-af7a-4c7f-9556-3e2317795421\n", + "Action=Edit\n", + "edit: 6fc61c3c-af7a-4c7f-9556-3e2317795421\n", + "Cleaning id=307d9761-8dd0-4e7f-99fe-88fe9156a989\n", + "Action=Edit\n", + "edit: 307d9761-8dd0-4e7f-99fe-88fe9156a989\n", + "Cleaning id=4b509f26-ebb4-4de3-b665-2f506d9019ac\n", + "Action=Edit\n", + "edit: 4b509f26-ebb4-4de3-b665-2f506d9019ac\n", + "Cleaning id=7cbd1761-35d7-482d-8c6e-2cfde4677681\n", + "Action=Edit\n", + "edit: 7cbd1761-35d7-482d-8c6e-2cfde4677681\n", + "Cleaning id=2302958f-8b4b-467f-8955-8a991ddf7836\n", + "Action=Edit\n", + "edit: 2302958f-8b4b-467f-8955-8a991ddf7836\n", + "Cleaning id=377b1d72-cb10-4959-a20a-0a3846d34fbe\n", + "Action=Edit\n", + "edit: 377b1d72-cb10-4959-a20a-0a3846d34fbe\n", + "Cleaning id=6f272372-540c-47be-ac90-1f0b0f24b944\n", + "Action=Edit\n", + "edit: 6f272372-540c-47be-ac90-1f0b0f24b944\n", + "Cleaning id=ce4c4015-b5fd-464a-ab1b-df8d994e55ea\n", + "Action=Edit\n", + "edit: ce4c4015-b5fd-464a-ab1b-df8d994e55ea\n", + "substring not found\n", + "Cleaning id=c51c25d5-f632-436a-840c-e46ff07e3e79\n", + "Not found: c51c25d5-f632-436a-840c-e46ff07e3e79\n", + "Skipping instructions for : c51c25d5-f632-436a-840c-e46ff07e3e79\n", + "Cleaning id=598cc071-e847-48a6-a064-b1f6447654fb\n", + "Not found: 598cc071-e847-48a6-a064-b1f6447654fb\n", + "Skipping instructions for : 598cc071-e847-48a6-a064-b1f6447654fb\n", + "Cleaning id=743d0067-999c-4987-996b-8cf746a84195\n", + "Action=Edit\n", + "edit: 743d0067-999c-4987-996b-8cf746a84195\n", + "substring not found\n", + "Cleaning id=f372c08d-6054-491b-b503-b45f5996b854\n", + "Action=Edit\n", + "edit: f372c08d-6054-491b-b503-b45f5996b854\n", + "substring not found\n", + "Cleaning id=6df241aa-f5f8-4649-8313-1f8128f9bdcd\n", + "Action=Edit\n", + "edit: 6df241aa-f5f8-4649-8313-1f8128f9bdcd\n", + "Cleaning id=f817015f-0524-4ea4-a691-c8b6137858b4\n", + "Action=Edit\n", + "edit: f817015f-0524-4ea4-a691-c8b6137858b4\n", + "substring not found\n", + "Cleaning id=3eb60a82-2a3e-4a2b-b49f-7ea21002bd3d\n", + "Action=Edit\n", + "edit: 3eb60a82-2a3e-4a2b-b49f-7ea21002bd3d\n", + "substring not found\n", + "Cleaning id=b2763984-9f09-41b4-a1b3-18ecebf8eaaf\n", + "Action=Edit\n", + "edit: b2763984-9f09-41b4-a1b3-18ecebf8eaaf\n", + "substring not found\n", + "Cleaning id=01216bb7-2999-411d-9224-d9ad12aeb7ae\n", + "Action=Edit\n", + "edit: 01216bb7-2999-411d-9224-d9ad12aeb7ae\n", + "substring not found\n", + "Cleaning id=4e722d24-f373-48b9-b8df-afb5f564fd18\n", + "Action=Edit\n", + "edit: 4e722d24-f373-48b9-b8df-afb5f564fd18\n", + "Cleaning id=28df10e4-62a6-4b4e-84da-a9806f743b40\n", + "Action=Edit\n", + "edit: 28df10e4-62a6-4b4e-84da-a9806f743b40\n", + "substring not found\n", + "Cleaning id=2134fb02-ac5b-4c80-bfc9-0b4f7811ac22\n", + "Action=Edit\n", + "edit: 2134fb02-ac5b-4c80-bfc9-0b4f7811ac22\n", + "substring not found\n", + "Cleaning id=c67f523f-27a9-4648-a6ca-47856067b878\n", + "Action=Edit\n", + "edit: c67f523f-27a9-4648-a6ca-47856067b878\n", + "Cleaning id=7a875918-1bca-4cbe-9098-0cfdcf5c6a06\n", + "Action=Edit\n", + "edit: 7a875918-1bca-4cbe-9098-0cfdcf5c6a06\n", + "Cleaning id=4cf9f4c3-ab6d-4610-9fa4-6e2f6d7187c6\n", + "Action=Edit\n", + "edit: 4cf9f4c3-ab6d-4610-9fa4-6e2f6d7187c6\n", + "Cleaning id=5d41aea3-e16f-4453-8623-4d1e1fa46189\n", + "Action=Edit\n", + "edit: 5d41aea3-e16f-4453-8623-4d1e1fa46189\n", + "Cleaning id=8ab59449-3c58-4282-b491-9256056a0b06\n", + "Action=Edit\n", + "edit: 8ab59449-3c58-4282-b491-9256056a0b06\n", + "Cleaning id=287eba5e-29c9-4e67-b2ef-ebb3083b3003\n", + "Action=Edit\n", + "edit: 287eba5e-29c9-4e67-b2ef-ebb3083b3003\n", + "Cleaning id=ee378b30-e1db-4356-a3f1-57e6356fced4\n", + "Action=Edit\n", + "edit: ee378b30-e1db-4356-a3f1-57e6356fced4\n", + "Cleaning id=ac7a787a-b73c-46b4-87cd-5b5674b72898\n", + "Action=Edit\n", + "edit: ac7a787a-b73c-46b4-87cd-5b5674b72898\n", + "Cleaning id=aa96b30b-f1fe-4887-812a-b207240838be\n", + "Action=Edit\n", + "edit: aa96b30b-f1fe-4887-812a-b207240838be\n", + "Cleaning id=4b1f967e-15a5-4286-838f-f74a7542e365\n", + "Action=Edit\n", + "edit: 4b1f967e-15a5-4286-838f-f74a7542e365\n", + "Cleaning id=bc1503ac-bf8d-417a-9200-92fb1e18089c\n", + "Action=Edit\n", + "edit: bc1503ac-bf8d-417a-9200-92fb1e18089c\n", + "Cleaning id=5ca77f89-9460-4939-a971-940959fe8dff\n", + "Action=Edit\n", + "edit: 5ca77f89-9460-4939-a971-940959fe8dff\n", + "Cleaning id=a55cf83d-3980-4529-af25-7c16d81825f7\n", + "Action=Edit\n", + "edit: a55cf83d-3980-4529-af25-7c16d81825f7\n", + "Cleaning id=c752db2d-6bf9-4ac7-b46b-defed71c0252\n", + "Action=Edit\n", + "edit: c752db2d-6bf9-4ac7-b46b-defed71c0252\n", + "Cleaning id=adf3edbf-d1c4-40db-9ec6-be096f7d7353\n", + "Action=Edit\n", + "edit: adf3edbf-d1c4-40db-9ec6-be096f7d7353\n", + "Cleaning id=192a2529-525e-47f2-841b-b82c4b2feacf\n", + "Action=Edit\n", + "edit: 192a2529-525e-47f2-841b-b82c4b2feacf\n", + "Cleaning id=f59c1667-1fd2-4120-a924-e02a1c69ac73\n", + "Action=Edit\n", + "edit: f59c1667-1fd2-4120-a924-e02a1c69ac73\n", + "Cleaning id=754296b6-dc97-4340-8e21-e42f22ec538b\n", + "Action=Edit\n", + "edit: 754296b6-dc97-4340-8e21-e42f22ec538b\n", + "substring not found\n", + "Cleaning id=0d4e9ee5-54f0-4cd0-b026-5952700a5bb4\n", + "Action=Edit\n", + "edit: 0d4e9ee5-54f0-4cd0-b026-5952700a5bb4\n", + "Cleaning id=4a7094e2-25f4-4589-9547-c52827002cf3\n", + "Action=Edit\n", + "edit: 4a7094e2-25f4-4589-9547-c52827002cf3\n", + "Cleaning id=59973454-8b7f-44dc-b0e9-f6b9d2331e13\n", + "Action=Edit\n", + "edit: 59973454-8b7f-44dc-b0e9-f6b9d2331e13\n", + "Cleaning id=973ad122-16af-4e3f-92eb-1228a4ab04ec\n", + "Action=Edit\n", + "edit: 973ad122-16af-4e3f-92eb-1228a4ab04ec\n", + "Cleaning id=cfccf4f0-346f-4690-94b0-2676d697aeab\n", + "Action=Edit\n", + "edit: cfccf4f0-346f-4690-94b0-2676d697aeab\n", + "Cleaning id=f567161f-09c6-42de-b2b1-5e577eda5b46\n", + "Action=Edit\n", + "edit: f567161f-09c6-42de-b2b1-5e577eda5b46\n", + "Cleaning id=1ae6629a-c6cf-4df2-9bde-d5d437a7c412\n", + "Action=Edit\n", + "edit: 1ae6629a-c6cf-4df2-9bde-d5d437a7c412\n", + "Cleaning id=a96bdb41-f57a-46bf-9806-fc692d4a485a\n", + "Action=Edit\n", + "edit: a96bdb41-f57a-46bf-9806-fc692d4a485a\n", + "Cleaning id=c4d2323f-4cff-4816-8d6a-6e0fb39b4685\n", + "Action=Edit\n", + "edit: c4d2323f-4cff-4816-8d6a-6e0fb39b4685\n", + "Cleaning id=95d50a89-368d-46e9-872d-685b31a0ec6a\n", + "Action=Edit\n", + "edit: 95d50a89-368d-46e9-872d-685b31a0ec6a\n", + "Cleaning id=d469a34a-f234-405c-bb3a-d5d0328c2c3d\n", + "Action=Edit\n", + "edit: d469a34a-f234-405c-bb3a-d5d0328c2c3d\n", + "substring not found\n", + "Cleaning id=d3ae4235-90a4-4fad-9106-3075ee9c43ba\n", + "Action=Edit\n", + "edit: d3ae4235-90a4-4fad-9106-3075ee9c43ba\n", + "Cleaning id=5b56e959-7ce3-4d52-82dc-c0dcbb439a91\n", + "Action=Edit\n", + "edit: 5b56e959-7ce3-4d52-82dc-c0dcbb439a91\n", + "Done\n", + "Writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl\n" + ] + } + ], + "source": [ + "# use instructions file to clean the raw dataset\n", + "!python ./examples/clean_dataset.py \\\n", + " \"{raw_input_data_path}\" \\\n", + " \"{data_out_dir}/{trees_filename}\" \\\n", + " --instructions \"{instructions_path}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl\n", + "70642 trees with 208598 total messages read.\n", + "writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.jsonl\n", + "208598 messages written.\n" + ] + } + ], + "source": [ + "# convert cleaned dataset from tree to messages\n", + "!python ./examples/tree_to_messages.py \\\n", + " \"{data_out_dir}/{trees_filename}\" \\\n", + " \"{data_out_dir}/{messages_filename}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "gzip: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl.gz already exists;\tnot overwritten\n", + "gzip: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.jsonl.gz already exists;\tnot overwritten\n" + ] + } + ], + "source": [ + "# make .gz files, keeping the original files\n", + "!gzip -c \"{data_out_dir}/{trees_filename}\" > \"{data_out_dir}/{trees_filename}.gz\"\n", + "!gzip -c \"{data_out_dir}/{messages_filename}\" > \"{data_out_dir}/{messages_filename}.gz\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TODO 1: add detoxify scores\n", + "# TODO 2: add & do manual bad keyword filtering\n", + "# TODO 3: remove bad messages based on above steps\n", + "# TODO 4: generate huggingface parquet format files (?)\n", + "# TODO 5: create readme for oasst2 dataset" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From dc7054d705be4c7a9440625532cf6ea378af2519 Mon Sep 17 00:00:00 2001 From: Andrew Maguire Date: Mon, 4 Dec 2023 15:46:25 +0000 Subject: [PATCH 3/6] move to oasst2 folder and rerun with additional instructions --- oasst-data/examples/clean_dataset.py | 7 +- oasst-data/examples/keyword_flagging.py | 121 +++++++++++++ oasst-data/{ => oasst2}/generate_oasst2.ipynb | 164 +++++++++++++++--- 3 files changed, 267 insertions(+), 25 deletions(-) create mode 100644 oasst-data/examples/keyword_flagging.py rename oasst-data/{ => oasst2}/generate_oasst2.ipynb (83%) diff --git a/oasst-data/examples/clean_dataset.py b/oasst-data/examples/clean_dataset.py index 28492d7db7..d7c055213e 100644 --- a/oasst-data/examples/clean_dataset.py +++ b/oasst-data/examples/clean_dataset.py @@ -67,8 +67,11 @@ def delete_message(msg: ExportMessageNode): print(f"Tree deleted: {msg.message_id}") else: parent_msg = message_by_id[msg.parent_id] - parent_msg.replies.remove(msg) - print(f"Branch deleted: {msg.message_id} ({count_descendants(msg)} messages)") + try: + parent_msg.replies.remove(msg) + print(f"Branch deleted: {msg.message_id} ({count_descendants(msg)} messages)") + except ValueError: + print(f"Message not found: {msg.message_id}") # cleaning print("Cleaning...") diff --git a/oasst-data/examples/keyword_flagging.py b/oasst-data/examples/keyword_flagging.py new file mode 100644 index 0000000000..cae7355efc --- /dev/null +++ b/oasst-data/examples/keyword_flagging.py @@ -0,0 +1,121 @@ +import argparse +import csv +import glob +import json +import re + +banned_words = { + "卐", + "mein führer", + "sieg heil", + "heil hitler" "child porn", + "childporn", + "loli", + "hentai", + "pedophile", + "nigger", + "nigga", + "faggot", + "tranny", + "faggy", + "пидор", + "хуесос", + "хуйло", + "хохол", + "хохлы", + "русня", +} + + +def parse_args(): + parser = argparse.ArgumentParser(description="filter_dataset") + parser.add_argument( + "input_file_name", + type=str, + help="path to input .jsonl or .jsonl.gz input file", + ) + parser.add_argument( + "output_dir", + type=str, + help="dir to output", + ) + args = parser.parse_args() + return args + + +def contains_banned_word(text): + pattern = r"\b(?:" + "|".join(re.escape(word) for word in banned_words) + r")\b" + regex = re.compile(pattern, re.IGNORECASE) + return bool(regex.search(text)) + + +def process_message(msg, writers): + text = msg.get("text", "") + if contains_banned_word(text): + writers["hate_speech_ban_words"].writerow([msg["message_id"], text]) + if "labels" in msg: + for label in ["hate_speech", "toxicity", "pii", "not_appropriate", "violence"]: + if label in msg["labels"] and msg["labels"][label]["value"] > 0.85: + writers[label].writerow([msg["message_id"], text]) + if len(text) < 10: + writers["junk_by_len"].writerow([msg["message_id"], text]) + if "replies" in msg: + for reply in msg["replies"]: + process_message(reply, writers) + + +def process_jsonl_file(file, writers): + print(f"Processing file: {file}") + with open(file, "r", encoding="utf-8") as f: + for line in f: + data = json.loads(line.strip()) + if "prompt" in data: + process_message(data["prompt"], writers) + + +if __name__ == "__main__": + args = parse_args() + files = glob.glob(args.input_file_name) + if not files: + print("No files found") + for file in files: + with open( + f"{args.output_dir}/hate_speech_labelled.csv", + "w", + newline="", + encoding="utf-8", + ) as file1, open( + f"{args.output_dir}/hate_speech_ban_words.csv", + "w", + newline="", + encoding="utf-8", + ) as file2, open( + f"{args.output_dir}/junk_len.csv", "w", newline="", encoding="utf-8" + ) as file3, open( + f"{args.output_dir}/toxicity_labelled.csv", + "w", + newline="", + encoding="utf-8", + ) as file4, open( + f"{args.output_dir}/pii_labelled.csv", "w", newline="", encoding="utf-8" + ) as file5, open( + f"{args.output_dir}/not_appropriate_labelled.csv", + "w", + newline="", + encoding="utf-8", + ) as file6, open( + f"{args.output_dir}/violence_labelled.csv", + "w", + newline="", + encoding="utf-8", + ) as file7: + writers = { + "hate_speech": csv.writer(file1), + "hate_speech_ban_words": csv.writer(file2), + "junk_by_len": csv.writer(file3), + "toxicity": csv.writer(file4), + "pii": csv.writer(file5), + "not_appropriate": csv.writer(file6), + "violence": csv.writer(file7), + } + process_jsonl_file(file, writers) diff --git a/oasst-data/generate_oasst2.ipynb b/oasst-data/oasst2/generate_oasst2.ipynb similarity index 83% rename from oasst-data/generate_oasst2.ipynb rename to oasst-data/oasst2/generate_oasst2.ipynb index be7bdc27a3..368b214adb 100644 --- a/oasst-data/generate_oasst2.ipynb +++ b/oasst-data/oasst2/generate_oasst2.ipynb @@ -6,7 +6,8 @@ "metadata": {}, "outputs": [], "source": [ - "import os" + "import os\n", + "import hashlib" ] }, { @@ -21,18 +22,39 @@ "raw_input_data_path = f\"{data_dir}/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl\"\n", "instructions_path = f\"{data_dir}/instructions.xlsx\"\n", "trees_filename = f\"2023-11-05_oasst_all.trees.jsonl\"\n", - "messages_filename = f\"2023-11-05_oasst_all.messages.jsonl\"" + "messages_filename = f\"2023-11-05_oasst_all.messages.jsonl\"\n", + "\n", + "# make data_out_dir if it doesn't exist\n", + "if not os.path.exists(data_out_dir):\n", + " os.makedirs(data_out_dir)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hash of input data: C:/Users/andre/Downloads/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl\n", + "8223a0083f70749ecf430b4057c50dc4\n", + "Hash of Instructions: C:/Users/andre/Downloads/instructions.xlsx\n", + "99e7a311f473b08781fad2b1855243dc\n" + ] + } + ], "source": [ - "# make data_out_dir if it doesn't exist\n", - "if not os.path.exists(data_out_dir):\n", - " os.makedirs(data_out_dir)" + "# print hashes of input files\n", + "\n", + "print(f\"Hash of input data: {raw_input_data_path}\")\n", + "with open(raw_input_data_path, \"rb\") as f:\n", + " print(hashlib.md5(f.read()).hexdigest())\n", + "\n", + "print(f\"Hash of Instructions: {instructions_path}\")\n", + "with open(instructions_path, \"rb\") as f:\n", + " print(hashlib.md5(f.read()).hexdigest())" ] }, { @@ -689,6 +711,91 @@ "Cleaning id=5b56e959-7ce3-4d52-82dc-c0dcbb439a91\n", "Action=Edit\n", "edit: 5b56e959-7ce3-4d52-82dc-c0dcbb439a91\n", + "Cleaning id=5a25c3dc-b6f5-44ec-9a8a-9d0b51f4fcf1 \n", + "Not found: 5a25c3dc-b6f5-44ec-9a8a-9d0b51f4fcf1 \n", + "Skipping instructions for : 5a25c3dc-b6f5-44ec-9a8a-9d0b51f4fcf1 \n", + "Cleaning id=c91afb6c-1585-40a0-a529-8b9e0e0220d4 \n", + "Not found: c91afb6c-1585-40a0-a529-8b9e0e0220d4 \n", + "Skipping instructions for : c91afb6c-1585-40a0-a529-8b9e0e0220d4 \n", + "Cleaning id=36c6d5e6-d19e-435a-9a10-5b536ea52666\n", + "Action=Delete\n", + "deleting: 36c6d5e6-d19e-435a-9a10-5b536ea52666\n", + "Branch deleted: 36c6d5e6-d19e-435a-9a10-5b536ea52666 (1 messages)\n", + "Cleaning id=de142443-6c47-47f7-b849-9c1f3abacfeb \n", + "Not found: de142443-6c47-47f7-b849-9c1f3abacfeb \n", + "Skipping instructions for : de142443-6c47-47f7-b849-9c1f3abacfeb \n", + "Cleaning id=a7d5b481-d1dc-405d-83be-94189afd7050 \n", + "Not found: a7d5b481-d1dc-405d-83be-94189afd7050 \n", + "Skipping instructions for : a7d5b481-d1dc-405d-83be-94189afd7050 \n", + "Cleaning id=381503b9-7867-4c48-ad7a-287e889bc12a \n", + "Not found: 381503b9-7867-4c48-ad7a-287e889bc12a \n", + "Skipping instructions for : 381503b9-7867-4c48-ad7a-287e889bc12a \n", + "Cleaning id=8b814e22-5ea6-4092-8908-a76ca50e988c \n", + "Not found: 8b814e22-5ea6-4092-8908-a76ca50e988c \n", + "Skipping instructions for : 8b814e22-5ea6-4092-8908-a76ca50e988c \n", + "Cleaning id=e953b4f4-8476-48f2-8b1c-2dbf7a65a5da \n", + "Not found: e953b4f4-8476-48f2-8b1c-2dbf7a65a5da \n", + "Skipping instructions for : e953b4f4-8476-48f2-8b1c-2dbf7a65a5da \n", + "Cleaning id=ab8e193f-fd9f-45e7-90c5-f5d38cf84aab \n", + "Not found: ab8e193f-fd9f-45e7-90c5-f5d38cf84aab \n", + "Skipping instructions for : ab8e193f-fd9f-45e7-90c5-f5d38cf84aab \n", + "Cleaning id=c91afb6c-1585-40a0-a529-8b9e0e0220d4 \n", + "Not found: c91afb6c-1585-40a0-a529-8b9e0e0220d4 \n", + "Skipping instructions for : c91afb6c-1585-40a0-a529-8b9e0e0220d4 \n", + "Cleaning id=6f3e265c-b45e-44d7-9278-4d73e42811d4 \n", + "Not found: 6f3e265c-b45e-44d7-9278-4d73e42811d4 \n", + "Skipping instructions for : 6f3e265c-b45e-44d7-9278-4d73e42811d4 \n", + "Cleaning id=4dbe3bcf-489c-4a25-84f9-300b68f951c6 \n", + "Not found: 4dbe3bcf-489c-4a25-84f9-300b68f951c6 \n", + "Skipping instructions for : 4dbe3bcf-489c-4a25-84f9-300b68f951c6 \n", + "Cleaning id=a328a64e-20a7-46ce-b56a-622d694341d4\n", + "Action=Delete\n", + "deleting: a328a64e-20a7-46ce-b56a-622d694341d4\n", + "Branch deleted: a328a64e-20a7-46ce-b56a-622d694341d4 (1 messages)\n", + "Cleaning id=ca789bea-f12f-4814-a59b-b70455eb5e7c\n", + "Action=Delete\n", + "deleting: ca789bea-f12f-4814-a59b-b70455eb5e7c\n", + "Branch deleted: ca789bea-f12f-4814-a59b-b70455eb5e7c (1 messages)\n", + "Cleaning id=7531f5c3-0df0-4f9b-a11e-fe52ed3f809e\n", + "Action=Delete\n", + "deleting: 7531f5c3-0df0-4f9b-a11e-fe52ed3f809e\n", + "Branch deleted: 7531f5c3-0df0-4f9b-a11e-fe52ed3f809e (1 messages)\n", + "Cleaning id=36c6d5e6-d19e-435a-9a10-5b536ea52666\n", + "Action=Delete\n", + "deleting: 36c6d5e6-d19e-435a-9a10-5b536ea52666\n", + "Message not found: 36c6d5e6-d19e-435a-9a10-5b536ea52666\n", + "Cleaning id=970d2aa9-0089-428f-96ef-a94345231a58\n", + "Action=Delete\n", + "deleting: 970d2aa9-0089-428f-96ef-a94345231a58\n", + "Branch deleted: 970d2aa9-0089-428f-96ef-a94345231a58 (1 messages)\n", + "Cleaning id=e71da408-c9a8-4d28-9741-500ec1b02f0f\n", + "Action=Delete\n", + "deleting: e71da408-c9a8-4d28-9741-500ec1b02f0f\n", + "Branch deleted: e71da408-c9a8-4d28-9741-500ec1b02f0f (2 messages)\n", + "Cleaning id=b09ef779-8e2a-4ccb-aa1e-c0f108c1ea90\n", + "Action=Delete\n", + "deleting: b09ef779-8e2a-4ccb-aa1e-c0f108c1ea90\n", + "Branch deleted: b09ef779-8e2a-4ccb-aa1e-c0f108c1ea90 (2 messages)\n", + "Cleaning id=de142443-6c47-47f7-b849-9c1f3abacfeb\n", + "Action=Delete\n", + "deleting: de142443-6c47-47f7-b849-9c1f3abacfeb\n", + "Branch deleted: de142443-6c47-47f7-b849-9c1f3abacfeb (1 messages)\n", + "Cleaning id=fad47867-8c5d-4654-a8a4-97fda14fac1d\n", + "Action=Delete\n", + "deleting: fad47867-8c5d-4654-a8a4-97fda14fac1d\n", + "Branch deleted: fad47867-8c5d-4654-a8a4-97fda14fac1d (1 messages)\n", + "Cleaning id=8b814e22-5ea6-4092-8908-a76ca50e988c\n", + "Action=Delete\n", + "deleting: 8b814e22-5ea6-4092-8908-a76ca50e988c\n", + "Branch deleted: 8b814e22-5ea6-4092-8908-a76ca50e988c (1 messages)\n", + "Cleaning id=c91afb6c-1585-40a0-a529-8b9e0e0220d4\n", + "Action=Delete\n", + "deleting: c91afb6c-1585-40a0-a529-8b9e0e0220d4\n", + "Branch deleted: c91afb6c-1585-40a0-a529-8b9e0e0220d4 (1 messages)\n", + "Cleaning id=3f2913ba-49a6-4641-9267-2e7fc7f7fbd4\n", + "Action=Delete\n", + "deleting: 3f2913ba-49a6-4641-9267-2e7fc7f7fbd4\n", + "Branch deleted: 3f2913ba-49a6-4641-9267-2e7fc7f7fbd4 (1 messages)\n", "Done\n", "Writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl\n" ] @@ -696,7 +803,7 @@ ], "source": [ "# use instructions file to clean the raw dataset\n", - "!python ./examples/clean_dataset.py \\\n", + "!python ../examples/clean_dataset.py \\\n", " \"{raw_input_data_path}\" \\\n", " \"{data_out_dir}/{trees_filename}\" \\\n", " --instructions \"{instructions_path}\"" @@ -711,18 +818,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl\n", - "70642 trees with 208598 total messages read.\n", - "writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.jsonl\n", - "208598 messages written.\n" + "Processing file: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl\n" ] } ], "source": [ - "# convert cleaned dataset from tree to messages\n", - "!python ./examples/tree_to_messages.py \\\n", + "# run keyword flagging\n", + "!python ../examples/keyword_flagging.py \\\n", " \"{data_out_dir}/{trees_filename}\" \\\n", - " \"{data_out_dir}/{messages_filename}\"" + " \"{data_out_dir}\"\n", + " \n", + "# outputs have been manually reviewed and appended to instructions file and notebook has been rerun" ] }, { @@ -731,14 +837,28 @@ "metadata": {}, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "gzip: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl.gz already exists;\tnot overwritten\n", - "gzip: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.jsonl.gz already exists;\tnot overwritten\n" + "reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl\n", + "70642 trees with 208584 total messages read.\n", + "writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.jsonl\n", + "208584 messages written.\n" ] } ], + "source": [ + "# convert cleaned dataset from tree to messages\n", + "!python ../examples/tree_to_messages.py \\\n", + " \"{data_out_dir}/{trees_filename}\" \\\n", + " \"{data_out_dir}/{messages_filename}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], "source": [ "# make .gz files, keeping the original files\n", "!gzip -c \"{data_out_dir}/{trees_filename}\" > \"{data_out_dir}/{trees_filename}.gz\"\n", @@ -747,15 +867,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ - "# TODO 1: add detoxify scores\n", - "# TODO 2: add & do manual bad keyword filtering\n", - "# TODO 3: remove bad messages based on above steps\n", - "# TODO 4: generate huggingface parquet format files (?)\n", - "# TODO 5: create readme for oasst2 dataset" + "# TODO: add detoxify scores\n", + "# TODO: generate huggingface parquet format files\n", + "# TODO: create readme for oasst2 dataset" ] } ], From 9e446eddb17fb3fbdf9e2dd8639734291c052cc0 Mon Sep 17 00:00:00 2001 From: Andrew Maguire Date: Mon, 4 Dec 2023 15:51:04 +0000 Subject: [PATCH 4/6] Remove keyword flagging script and update notebook --- oasst-data/oasst2/generate_oasst2.ipynb | 4 ++-- oasst-data/{examples => oasst2}/keyword_flagging.py | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename oasst-data/{examples => oasst2}/keyword_flagging.py (100%) diff --git a/oasst-data/oasst2/generate_oasst2.ipynb b/oasst-data/oasst2/generate_oasst2.ipynb index 368b214adb..48a706bfef 100644 --- a/oasst-data/oasst2/generate_oasst2.ipynb +++ b/oasst-data/oasst2/generate_oasst2.ipynb @@ -811,7 +811,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -824,7 +824,7 @@ ], "source": [ "# run keyword flagging\n", - "!python ../examples/keyword_flagging.py \\\n", + "!python ./keyword_flagging.py \\\n", " \"{data_out_dir}/{trees_filename}\" \\\n", " \"{data_out_dir}\"\n", " \n", diff --git a/oasst-data/examples/keyword_flagging.py b/oasst-data/oasst2/keyword_flagging.py similarity index 100% rename from oasst-data/examples/keyword_flagging.py rename to oasst-data/oasst2/keyword_flagging.py From a095cb6285e6cc4e84a01dc2c04591e6429116ea Mon Sep 17 00:00:00 2001 From: Andrew Maguire Date: Mon, 4 Dec 2023 16:09:43 +0000 Subject: [PATCH 5/6] Add split_dataset.py and generate_oasst2.ipynb files --- oasst-data/examples/split_dataset.py | 9 +++++++ oasst-data/oasst2/generate_oasst2.ipynb | 34 ++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/oasst-data/examples/split_dataset.py b/oasst-data/examples/split_dataset.py index 0a47a7ca0c..618185bb24 100644 --- a/oasst-data/examples/split_dataset.py +++ b/oasst-data/examples/split_dataset.py @@ -1,3 +1,12 @@ +""" +Example usage: + + python split_dataset.py / + "2023-11-05_oasst_all.messages.jsonl" / + --val_output "2023-11-05_oasst_all.messages.validation.jsonl" / + --train_output "2023-11-05_oasst_all.messages.train.jsonl" +""" + import argparse import random diff --git a/oasst-data/oasst2/generate_oasst2.ipynb b/oasst-data/oasst2/generate_oasst2.ipynb index 48a706bfef..7ee968bc10 100644 --- a/oasst-data/oasst2/generate_oasst2.ipynb +++ b/oasst-data/oasst2/generate_oasst2.ipynb @@ -23,6 +23,8 @@ "instructions_path = f\"{data_dir}/instructions.xlsx\"\n", "trees_filename = f\"2023-11-05_oasst_all.trees.jsonl\"\n", "messages_filename = f\"2023-11-05_oasst_all.messages.jsonl\"\n", + "messages_train_filename = f\"2023-11-05_oasst_all.messages.train.jsonl\"\n", + "messages_validation_filename = f\"2023-11-05_oasst_all.messages.validation.jsonl\"\n", "\n", "# make data_out_dir if it doesn't exist\n", "if not os.path.exists(data_out_dir):\n", @@ -811,7 +813,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -858,16 +860,42 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.jsonl\n", + "Found 208584 matching messages.\n", + "Writing train 198293 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.train.jsonl\n", + "Writing valid 10291 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.validation.jsonl\n" + ] + } + ], + "source": [ + "# split messages into train and validation\n", + "!python ../examples/split_dataset.py \\\n", + " \"{data_out_dir}/{messages_filename}\" \\\n", + " --train_output \"{data_out_dir}/{messages_train_filename}\" \\\n", + " --val_output \"{data_out_dir}/{messages_validation_filename}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, "outputs": [], "source": [ "# make .gz files, keeping the original files\n", "!gzip -c \"{data_out_dir}/{trees_filename}\" > \"{data_out_dir}/{trees_filename}.gz\"\n", - "!gzip -c \"{data_out_dir}/{messages_filename}\" > \"{data_out_dir}/{messages_filename}.gz\"" + "!gzip -c \"{data_out_dir}/{messages_filename}\" > \"{data_out_dir}/{messages_filename}.gz\"\n", + "!gzip -c \"{data_out_dir}/{messages_train_filename}\" > \"{data_out_dir}/{messages_train_filename}.gz\"\n", + "!gzip -c \"{data_out_dir}/{messages_validation_filename}\" > \"{data_out_dir}/{messages_validation_filename}.gz\"" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ From 99adb6f377ba181277d5ab8a249345c809776e4a Mon Sep 17 00:00:00 2001 From: Andrew Maguire Date: Mon, 4 Dec 2023 16:23:04 +0000 Subject: [PATCH 6/6] add _ready_ versions of files too --- oasst-data/examples/filter_messages.py | 7 +- oasst-data/examples/filter_trees.py | 9 +++ oasst-data/oasst2/generate_oasst2.ipynb | 86 ++++++++++++++++++++++++- 3 files changed, 99 insertions(+), 3 deletions(-) diff --git a/oasst-data/examples/filter_messages.py b/oasst-data/examples/filter_messages.py index b8005b569c..9ff0a9c656 100644 --- a/oasst-data/examples/filter_messages.py +++ b/oasst-data/examples/filter_messages.py @@ -126,7 +126,12 @@ def approve_message(msg: ExportMessageNode) -> bool: ): return False - if exclude_normal is True and not msg.deleted and not msg.synthetic and msg.review_result: + if ( + exclude_normal is True + and not msg.deleted + and not msg.synthetic + and msg.review_result + ): return False if spam is not None and spam != (not msg.review_result): diff --git a/oasst-data/examples/filter_trees.py b/oasst-data/examples/filter_trees.py index 76c753aefb..ce757bc452 100644 --- a/oasst-data/examples/filter_trees.py +++ b/oasst-data/examples/filter_trees.py @@ -1,3 +1,12 @@ +""" +Example usage: + + python filter_trees.py / + "2023-11-05_oasst_all.jsonl" / + "2023-11-05_oasst_all.clean.jsonl" / + --states "ready_for_export" +""" + import argparse from oasst_data import read_message_trees, write_message_trees diff --git a/oasst-data/oasst2/generate_oasst2.ipynb b/oasst-data/oasst2/generate_oasst2.ipynb index 7ee968bc10..3e47fd68e3 100644 --- a/oasst-data/oasst2/generate_oasst2.ipynb +++ b/oasst-data/oasst2/generate_oasst2.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -22,9 +22,17 @@ "raw_input_data_path = f\"{data_dir}/2023-11-05_oasst_all/2023-11-05_oasst_all.jsonl\"\n", "instructions_path = f\"{data_dir}/instructions.xlsx\"\n", "trees_filename = f\"2023-11-05_oasst_all.trees.jsonl\"\n", + "trees_ready_filename = f\"2023-11-05_oasst_all.trees.ready_for_export.jsonl\"\n", "messages_filename = f\"2023-11-05_oasst_all.messages.jsonl\"\n", + "messages_ready_filename = f\"2023-11-05_oasst_all.messages.ready_for_export.jsonl\"\n", "messages_train_filename = f\"2023-11-05_oasst_all.messages.train.jsonl\"\n", + "messages_ready_train_filename = (\n", + " f\"2023-11-05_oasst_all.messages.ready_for_export.train.jsonl\"\n", + ")\n", "messages_validation_filename = f\"2023-11-05_oasst_all.messages.validation.jsonl\"\n", + "messages_ready_validation_filename = (\n", + " f\"2023-11-05_oasst_all.messages.ready_for_export.validation.jsonl\"\n", + ")\n", "\n", "# make data_out_dir if it doesn't exist\n", "if not os.path.exists(data_out_dir):\n", @@ -833,6 +841,29 @@ "# outputs have been manually reviewed and appended to instructions file and notebook has been rerun" ] }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.jsonl\n", + "Found 13854 matching trees.\n", + "Writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.ready_for_export.jsonl\n" + ] + } + ], + "source": [ + "# filter trees to make a version with status ready for export\n", + "!python ../examples/filter_trees.py \\\n", + " \"{data_out_dir}/{trees_filename}\" \\\n", + " \"{data_out_dir}/{trees_ready_filename}\" \\\n", + " --states \"ready_for_export\"" + ] + }, { "cell_type": "code", "execution_count": 6, @@ -856,6 +887,29 @@ " \"{data_out_dir}/{messages_filename}\"" ] }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.trees.ready_for_export.jsonl\n", + "13854 trees with 135174 total messages read.\n", + "writing: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.jsonl\n", + "135174 messages written.\n" + ] + } + ], + "source": [ + "# convert cleaned state=ready_for_export dataset from tree to messages\n", + "!python ../examples/tree_to_messages.py \\\n", + " \"{data_out_dir}/{trees_ready_filename}\" \\\n", + " \"{data_out_dir}/{messages_ready_filename}\"" + ] + }, { "cell_type": "code", "execution_count": 7, @@ -880,6 +934,30 @@ " --val_output \"{data_out_dir}/{messages_validation_filename}\"" ] }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.jsonl\n", + "Found 135174 matching messages.\n", + "Writing train 128412 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.train.jsonl\n", + "Writing valid 6762 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.ready_for_export.validation.jsonl\n" + ] + } + ], + "source": [ + "# split ready messages into train and validation\n", + "!python ../examples/split_dataset.py \\\n", + " \"{data_out_dir}/{messages_ready_filename}\" \\\n", + " --train_output \"{data_out_dir}/{messages_ready_train_filename}\" \\\n", + " --val_output \"{data_out_dir}/{messages_ready_validation_filename}\"" + ] + }, { "cell_type": "code", "execution_count": 8, @@ -888,9 +966,13 @@ "source": [ "# make .gz files, keeping the original files\n", "!gzip -c \"{data_out_dir}/{trees_filename}\" > \"{data_out_dir}/{trees_filename}.gz\"\n", + "!gzip -c \"{data_out_dir}/{trees_ready_filename}\" > \"{data_out_dir}/{trees_ready_filename}.gz\"\n", "!gzip -c \"{data_out_dir}/{messages_filename}\" > \"{data_out_dir}/{messages_filename}.gz\"\n", + "!gzip -c \"{data_out_dir}/{messages_ready_filename}\" > \"{data_out_dir}/{messages_ready_filename}.gz\"\n", "!gzip -c \"{data_out_dir}/{messages_train_filename}\" > \"{data_out_dir}/{messages_train_filename}.gz\"\n", - "!gzip -c \"{data_out_dir}/{messages_validation_filename}\" > \"{data_out_dir}/{messages_validation_filename}.gz\"" + "!gzip -c \"{data_out_dir}/{messages_ready_train_filename}\" > \"{data_out_dir}/{messages_ready_train_filename}.gz\"\n", + "!gzip -c \"{data_out_dir}/{messages_validation_filename}\" > \"{data_out_dir}/{messages_validation_filename}.gz\"\n", + "!gzip -c \"{data_out_dir}/{messages_ready_validation_filename}\" > \"{data_out_dir}/{messages_ready_validation_filename}.gz\"" ] }, {