From 9bfbc149d3f156e3e7f349d6b2763e3e2b5c3d15 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 19 Apr 2022 16:27:02 +0000 Subject: [PATCH 1/9] [Librispeech] Add 'all' config --- datasets/librispeech_asr/librispeech_asr.py | 135 ++++++++++++++------ 1 file changed, 95 insertions(+), 40 deletions(-) diff --git a/datasets/librispeech_asr/librispeech_asr.py b/datasets/librispeech_asr/librispeech_asr.py index 2e4d6bcd2c1..46450ce3246 100644 --- a/datasets/librispeech_asr/librispeech_asr.py +++ b/datasets/librispeech_asr/librispeech_asr.py @@ -57,6 +57,11 @@ def map_to_array(batch): _URL = "http://www.openslr.org/12" _DL_URL = "http://www.openslr.org/resources/12/" + +def retrieve_subset(name): + return "-".join(name.split(".")[0].split("-")) + + _DL_URLS = { "clean": { "dev": _DL_URL + "dev-clean.tar.gz", @@ -69,6 +74,14 @@ def map_to_array(batch): "dev": _DL_URL + "dev-other.tar.gz", "train.500": _DL_URL + "train-other-500.tar.gz", }, + "all": { + "dev": {retrieve_subset(u): _DL_URL + u for u in ["dev-clean.tar.gz", "dev-other.tar.gz"]}, + "test": {retrieve_subset(u): _DL_URL + u for u in ["test-clean.tar.gz", "test-other.tar.gz"]}, + "train": { + retrieve_subset(u): _DL_URL + u + for u in ["train-clean-100.tar.gz", "train-clean-360.tar.gz", "train-other-500.tar.gz"] + }, + }, } @@ -94,6 +107,7 @@ class LibrispeechASR(datasets.GeneratorBasedBuilder): BUILDER_CONFIGS = [ LibrispeechASRConfig(name="clean", description="'Clean' speech."), LibrispeechASRConfig(name="other", description="'Other', more challenging, speech."), + LibrispeechASRConfig(name="all", description="Combined clean and other dataset."), ] def _info(self): @@ -107,6 +121,7 @@ def _info(self): "speaker_id": datasets.Value("int64"), "chapter_id": datasets.Value("int64"), "id": datasets.Value("string"), + "subset": datasets.Value("string"), } ), supervised_keys=("file", "text"), @@ -117,61 +132,101 @@ def _info(self): def _split_generators(self, dl_manager): archive_path = dl_manager.download(_DL_URLS[self.config.name]) - if self.config.name == "clean": train_splits = [ datasets.SplitGenerator( - name="train.100", gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.100"])} + name="train.100", + gen_kwargs={"files": {"clean-100": dl_manager.iter_archive(archive_path["train.100"])}}, ), datasets.SplitGenerator( - name="train.360", gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.360"])} + name="train.360", + gen_kwargs={"files": {"clean-360": dl_manager.iter_archive(archive_path["train.360"])}}, ), ] + dev_splits = [ + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"files": {"clean": dl_manager.iter_archive(archive_path["dev"])}}, + ) + ] + test_splits = [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"files": {"clean": dl_manager.iter_archive(archive_path["test"])}}, + ) + ] elif self.config.name == "other": train_splits = [ datasets.SplitGenerator( - name="train.500", gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.500"])} - ), + name="train.500", + gen_kwargs={"files": {"other-500": dl_manager.iter_archive(archive_path["train.500"])}}, + ) + ] + dev_splits = [ + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"files": {"other": dl_manager.iter_archive(archive_path["dev"])}}, + ) + ] + test_splits = [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"files": {"other": dl_manager.iter_archive(archive_path["test"])}}, + ) + ] + elif self.config.name == "all": + train_splits = [ + 
datasets.SplitGenerator( + name="train", + gen_kwargs={"files": {k: dl_manager.iter_archive(v) for k, v in archive_path["train"].items()}}, + ) + ] + dev_splits = [ + datasets.SplitGenerator( + name=datasets.Split.VALIDATION, + gen_kwargs={"files": {k: dl_manager.iter_archive(v) for k, v in archive_path["dev"].items()}}, + ) + ] + test_splits = [ + datasets.SplitGenerator( + name=datasets.Split.TEST, + gen_kwargs={"files": {k: dl_manager.iter_archive(v) for k, v in archive_path["test"].items()}}, + ) ] - return train_splits + [ - datasets.SplitGenerator( - name=datasets.Split.VALIDATION, gen_kwargs={"files": dl_manager.iter_archive(archive_path["dev"])} - ), - datasets.SplitGenerator( - name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive_path["test"])} - ), - ] + return train_splits + dev_splits + test_splits def _generate_examples(self, files): """Generate examples from a LibriSpeech archive_path.""" key = 0 audio_data = {} transcripts = [] - for path, f in files: - if path.endswith(".flac"): - id_ = path.split("/")[-1][: -len(".flac")] - audio_data[id_] = f.read() - elif path.endswith(".trans.txt"): - for line in f: - if line: - line = line.decode("utf-8").strip() - id_, transcript = line.split(" ", 1) - audio_file = f"{id_}.flac" - speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]] - transcripts.append( - { - "id": id_, - "speaker_id": speaker_id, - "chapter_id": chapter_id, - "file": audio_file, - "text": transcript, - } - ) - if audio_data and len(audio_data) == len(transcripts): - for transcript in transcripts: - audio = {"path": transcript["file"], "bytes": audio_data[transcript["id"]]} - yield key, {"audio": audio, **transcript} - key += 1 - audio_data = {} - transcripts = [] + for subset, iterator in files.items(): + for path, f in iterator: + if path.endswith(".flac"): + id_ = path.split("/")[-1][: -len(".flac")] + audio_data[id_] = f.read() + elif path.endswith(".trans.txt"): + for line in f: + if line: + line = line.decode("utf-8").strip() + id_, transcript = line.split(" ", 1) + audio_file = f"{id_}.flac" + speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]] + transcripts.append( + { + "id": id_, + "speaker_id": speaker_id, + "chapter_id": chapter_id, + "file": audio_file, + "text": transcript, + "subset": subset, + } + ) + if audio_data and len(audio_data) == len(transcripts): + for transcript in transcripts: + audio = {"path": transcript["file"], "bytes": audio_data[transcript["id"]]} + yield key, {"audio": audio, **transcript} + key += 1 + audio_data = {} + transcripts = [] From fe68645fbd14edff070b09150e1908e4d028d954 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 19 Apr 2022 18:30:45 +0200 Subject: [PATCH 2/9] Update datasets/librispeech_asr/librispeech_asr.py --- datasets/librispeech_asr/librispeech_asr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datasets/librispeech_asr/librispeech_asr.py b/datasets/librispeech_asr/librispeech_asr.py index 46450ce3246..79e43bd6208 100644 --- a/datasets/librispeech_asr/librispeech_asr.py +++ b/datasets/librispeech_asr/librispeech_asr.py @@ -59,7 +59,8 @@ def map_to_array(batch): def retrieve_subset(name): - return "-".join(name.split(".")[0].split("-")) + # "train-other-500.tar.gz" -> "other-500" + return "-".join(name.split(".")[0].split("-")[1:]) _DL_URLS = { From 189ee52397d63bd858115a3d4effbb855ad5a512 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Apr 2022 11:25:18 +0000 Subject: [PATCH 3/9] apply suggestions --- 
datasets/librispeech_asr/librispeech_asr.py | 121 +++++++++++--------- 1 file changed, 65 insertions(+), 56 deletions(-) diff --git a/datasets/librispeech_asr/librispeech_asr.py b/datasets/librispeech_asr/librispeech_asr.py index 46450ce3246..89b31db85a7 100644 --- a/datasets/librispeech_asr/librispeech_asr.py +++ b/datasets/librispeech_asr/librispeech_asr.py @@ -58,10 +58,6 @@ def map_to_array(batch): _DL_URL = "http://www.openslr.org/resources/12/" -def retrieve_subset(name): - return "-".join(name.split(".")[0].split("-")) - - _DL_URLS = { "clean": { "dev": _DL_URL + "dev-clean.tar.gz", @@ -75,12 +71,13 @@ def retrieve_subset(name): "train.500": _DL_URL + "train-other-500.tar.gz", }, "all": { - "dev": {retrieve_subset(u): _DL_URL + u for u in ["dev-clean.tar.gz", "dev-other.tar.gz"]}, - "test": {retrieve_subset(u): _DL_URL + u for u in ["test-clean.tar.gz", "test-other.tar.gz"]}, - "train": { - retrieve_subset(u): _DL_URL + u - for u in ["train-clean-100.tar.gz", "train-clean-360.tar.gz", "train-other-500.tar.gz"] - }, + "dev.clean": _DL_URL + "dev-clean.tar.gz", + "dev.other": _DL_URL + "dev-other.tar.gz", + "test.clean": _DL_URL + "test-clean.tar.gz", + "test.other": _DL_URL + "test-other.tar.gz", + "train.clean.100": _DL_URL + "train-clean-100.tar.gz", + "train.clean.360": _DL_URL + "train-clean-360.tar.gz", + "train.other.500": _DL_URL + "train-other-500.tar.gz", }, } @@ -104,6 +101,7 @@ class LibrispeechASR(datasets.GeneratorBasedBuilder): """Librispeech dataset.""" DEFAULT_WRITER_BATCH_SIZE = 256 + DEFAULT_CONFIG_NAME = "all" BUILDER_CONFIGS = [ LibrispeechASRConfig(name="clean", description="'Clean' speech."), LibrispeechASRConfig(name="other", description="'Other', more challenging, speech."), @@ -121,7 +119,6 @@ def _info(self): "speaker_id": datasets.Value("int64"), "chapter_id": datasets.Value("int64"), "id": datasets.Value("string"), - "subset": datasets.Value("string"), } ), supervised_keys=("file", "text"), @@ -136,62 +133,76 @@ def _split_generators(self, dl_manager): train_splits = [ datasets.SplitGenerator( name="train.100", - gen_kwargs={"files": {"clean-100": dl_manager.iter_archive(archive_path["train.100"])}}, + gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.100"])}, ), datasets.SplitGenerator( name="train.360", - gen_kwargs={"files": {"clean-360": dl_manager.iter_archive(archive_path["train.360"])}}, + gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.360"])}, ), ] dev_splits = [ datasets.SplitGenerator( name=datasets.Split.VALIDATION, - gen_kwargs={"files": {"clean": dl_manager.iter_archive(archive_path["dev"])}}, + gen_kwargs={"files": dl_manager.iter_archive(archive_path["dev"])}, ) ] test_splits = [ datasets.SplitGenerator( name=datasets.Split.TEST, - gen_kwargs={"files": {"clean": dl_manager.iter_archive(archive_path["test"])}}, + gen_kwargs={"files": dl_manager.iter_archive(archive_path["test"])}, ) ] elif self.config.name == "other": train_splits = [ datasets.SplitGenerator( name="train.500", - gen_kwargs={"files": {"other-500": dl_manager.iter_archive(archive_path["train.500"])}}, + gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.500"])}, ) ] dev_splits = [ datasets.SplitGenerator( name=datasets.Split.VALIDATION, - gen_kwargs={"files": {"other": dl_manager.iter_archive(archive_path["dev"])}}, + gen_kwargs={"files": dl_manager.iter_archive(archive_path["dev"])}, ) ] test_splits = [ datasets.SplitGenerator( name=datasets.Split.TEST, - gen_kwargs={"files": {"other": 
dl_manager.iter_archive(archive_path["test"])}}, + gen_kwargs={"files": dl_manager.iter_archive(archive_path["test"])}, ) ] elif self.config.name == "all": train_splits = [ datasets.SplitGenerator( - name="train", - gen_kwargs={"files": {k: dl_manager.iter_archive(v) for k, v in archive_path["train"].items()}}, - ) + name="train.clean.100", + gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.clean.100"])}, + ), + datasets.SplitGenerator( + name="train.clean.360", + gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.clean.360"])}, + ), + datasets.SplitGenerator( + name="train.other.500", + gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.other.500"])}, + ), ] dev_splits = [ datasets.SplitGenerator( - name=datasets.Split.VALIDATION, - gen_kwargs={"files": {k: dl_manager.iter_archive(v) for k, v in archive_path["dev"].items()}}, - ) + name="validation.clean", gen_kwargs={"files": dl_manager.iter_archive(archive_path["dev.clean"])} + ), + datasets.SplitGenerator( + name="validation.other", + gen_kwargs={"files": dl_manager.iter_archive(archive_path["dev.other"])}, + ), ] test_splits = [ datasets.SplitGenerator( - name=datasets.Split.TEST, - gen_kwargs={"files": {k: dl_manager.iter_archive(v) for k, v in archive_path["test"].items()}}, - ) + name="test.clean", gen_kwargs={"files": dl_manager.iter_archive(archive_path["test.clean"])} + ), + datasets.SplitGenerator( + name="test.other", + gen_kwargs={"files": dl_manager.iter_archive(archive_path["test.other"])}, + ), ] return train_splits + dev_splits + test_splits @@ -201,32 +212,30 @@ def _generate_examples(self, files): key = 0 audio_data = {} transcripts = [] - for subset, iterator in files.items(): - for path, f in iterator: - if path.endswith(".flac"): - id_ = path.split("/")[-1][: -len(".flac")] - audio_data[id_] = f.read() - elif path.endswith(".trans.txt"): - for line in f: - if line: - line = line.decode("utf-8").strip() - id_, transcript = line.split(" ", 1) - audio_file = f"{id_}.flac" - speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]] - transcripts.append( - { - "id": id_, - "speaker_id": speaker_id, - "chapter_id": chapter_id, - "file": audio_file, - "text": transcript, - "subset": subset, - } - ) - if audio_data and len(audio_data) == len(transcripts): - for transcript in transcripts: - audio = {"path": transcript["file"], "bytes": audio_data[transcript["id"]]} - yield key, {"audio": audio, **transcript} - key += 1 - audio_data = {} - transcripts = [] + for path, f in files: + if path.endswith(".flac"): + id_ = path.split("/")[-1][: -len(".flac")] + audio_data[id_] = f.read() + elif path.endswith(".trans.txt"): + for line in f: + if line: + line = line.decode("utf-8").strip() + id_, transcript = line.split(" ", 1) + audio_file = f"{id_}.flac" + speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]] + transcripts.append( + { + "id": id_, + "speaker_id": speaker_id, + "chapter_id": chapter_id, + "file": audio_file, + "text": transcript, + } + ) + if audio_data and len(audio_data) == len(transcripts): + for transcript in transcripts: + audio = {"path": transcript["file"], "bytes": audio_data[transcript["id"]]} + yield key, {"audio": audio, **transcript} + key += 1 + audio_data = {} + transcripts = [] From 627c99f9c033f409d8d8f54f5e9659946649a7d5 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Apr 2022 12:30:12 +0000 Subject: [PATCH 4/9] correct paths --- datasets/librispeech_asr/librispeech_asr.py | 81 +++++++++++++++++---- 1 file changed, 66 
insertions(+), 15 deletions(-) diff --git a/datasets/librispeech_asr/librispeech_asr.py b/datasets/librispeech_asr/librispeech_asr.py index 89b31db85a7..6fdef1d96d0 100644 --- a/datasets/librispeech_asr/librispeech_asr.py +++ b/datasets/librispeech_asr/librispeech_asr.py @@ -18,6 +18,7 @@ import datasets +import os from datasets.tasks import AutomaticSpeechRecognition @@ -129,85 +130,132 @@ def _info(self): def _split_generators(self, dl_manager): archive_path = dl_manager.download(_DL_URLS[self.config.name]) + # (Optional) In non-streaming mode, we can extract the archive locally to have actual local audio files: + local_extracted_archive = dl_manager.extract(archive_path) if not dl_manager.is_streaming else {} + if self.config.name == "clean": train_splits = [ datasets.SplitGenerator( name="train.100", - gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.100"])}, + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("train.100"), + "files": dl_manager.iter_archive(archive_path["train.100"]), + }, ), datasets.SplitGenerator( name="train.360", - gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.360"])}, + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("train.360"), + "files": dl_manager.iter_archive(archive_path["train.360"]), + }, ), ] dev_splits = [ datasets.SplitGenerator( name=datasets.Split.VALIDATION, - gen_kwargs={"files": dl_manager.iter_archive(archive_path["dev"])}, + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("dev"), + "files": dl_manager.iter_archive(archive_path["dev"]), + }, ) ] test_splits = [ datasets.SplitGenerator( name=datasets.Split.TEST, - gen_kwargs={"files": dl_manager.iter_archive(archive_path["test"])}, + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("test"), + "files": dl_manager.iter_archive(archive_path["test"]), + }, ) ] elif self.config.name == "other": train_splits = [ datasets.SplitGenerator( name="train.500", - gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.500"])}, + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("train.500"), + "files": dl_manager.iter_archive(archive_path["train.500"]), + }, ) ] dev_splits = [ datasets.SplitGenerator( name=datasets.Split.VALIDATION, - gen_kwargs={"files": dl_manager.iter_archive(archive_path["dev"])}, + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("dev"), + "files": dl_manager.iter_archive(archive_path["dev"]), + }, ) ] test_splits = [ datasets.SplitGenerator( name=datasets.Split.TEST, - gen_kwargs={"files": dl_manager.iter_archive(archive_path["test"])}, + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("test"), + "files": dl_manager.iter_archive(archive_path["test"]), + }, ) ] elif self.config.name == "all": train_splits = [ datasets.SplitGenerator( name="train.clean.100", - gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.clean.100"])}, + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("train.clean.100"), + "files": dl_manager.iter_archive(archive_path["train.clean.100"]), + }, ), datasets.SplitGenerator( name="train.clean.360", - gen_kwargs={"files": dl_manager.iter_archive(archive_path["train.clean.360"])}, + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("train.clean.360"), + "files": dl_manager.iter_archive(archive_path["train.clean.360"]), + }, ), datasets.SplitGenerator( name="train.other.500", - gen_kwargs={"files": 
dl_manager.iter_archive(archive_path["train.other.500"])}, + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("train.other.500"), + "files": dl_manager.iter_archive(archive_path["train.other.500"]), + }, ), ] dev_splits = [ datasets.SplitGenerator( - name="validation.clean", gen_kwargs={"files": dl_manager.iter_archive(archive_path["dev.clean"])} + name="validation.clean", + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("validation.clean"), + "files": dl_manager.iter_archive(archive_path["dev.clean"]), + }, ), datasets.SplitGenerator( name="validation.other", - gen_kwargs={"files": dl_manager.iter_archive(archive_path["dev.other"])}, + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("validation.other"), + "files": dl_manager.iter_archive(archive_path["dev.other"]), + }, ), ] test_splits = [ datasets.SplitGenerator( - name="test.clean", gen_kwargs={"files": dl_manager.iter_archive(archive_path["test.clean"])} + name="test.clean", + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("test.clean"), + "files": dl_manager.iter_archive(archive_path["test.clean"]), + }, ), datasets.SplitGenerator( name="test.other", - gen_kwargs={"files": dl_manager.iter_archive(archive_path["test.other"])}, + gen_kwargs={ + "local_extracted_archive": local_extracted_archive.get("test.other"), + "files": dl_manager.iter_archive(archive_path["test.other"]), + }, ), ] return train_splits + dev_splits + test_splits - def _generate_examples(self, files): + def _generate_examples(self, files, local_extracted_archive): """Generate examples from a LibriSpeech archive_path.""" key = 0 audio_data = {} @@ -223,6 +271,9 @@ def _generate_examples(self, files): id_, transcript = line.split(" ", 1) audio_file = f"{id_}.flac" speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]] + audio_file = ( + os.path.join(local_extracted_archive, audio_file) if local_extracted_archive else None + ) transcripts.append( { "id": id_, From 31e67cb2edd922a323c3cc4c397f0b67e045b50b Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Apr 2022 12:49:01 +0000 Subject: [PATCH 5/9] up --- datasets/librispeech_asr/librispeech_asr.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/datasets/librispeech_asr/librispeech_asr.py b/datasets/librispeech_asr/librispeech_asr.py index 6fdef1d96d0..cc5f50efb87 100644 --- a/datasets/librispeech_asr/librispeech_asr.py +++ b/datasets/librispeech_asr/librispeech_asr.py @@ -37,22 +37,6 @@ LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz, prepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read audiobooks from the LibriVox project, and has been carefully segmented and aligned.87 - -Note that in order to limit the required storage for preparing this dataset, the audio -is stored in the .flac format and is not converted to a float32 array. 
To convert, the audio -file to a float32 array, please make use of the `.map()` function as follows: - - -```python -import soundfile as sf - -def map_to_array(batch): - speech_array, _ = sf.read(batch["file"]) - batch["speech"] = speech_array - return batch - -dataset = dataset.map(map_to_array, remove_columns=["file"]) -``` """ _URL = "http://www.openslr.org/12" From 3944a3b277a5aa66dfbd953f1731273789230daa Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Apr 2022 13:28:33 +0000 Subject: [PATCH 6/9] up --- datasets/librispeech_asr/librispeech_asr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datasets/librispeech_asr/librispeech_asr.py b/datasets/librispeech_asr/librispeech_asr.py index cc5f50efb87..b6112e066cc 100644 --- a/datasets/librispeech_asr/librispeech_asr.py +++ b/datasets/librispeech_asr/librispeech_asr.py @@ -17,8 +17,9 @@ """Librispeech automatic speech recognition dataset.""" -import datasets import os + +import datasets from datasets.tasks import AutomaticSpeechRecognition From 5f699fa91e71bf6082bf1792dbf6c605ca34ffff Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 21 Apr 2022 16:56:27 +0000 Subject: [PATCH 7/9] up --- datasets/librispeech_asr/dataset_infos.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/librispeech_asr/dataset_infos.json b/datasets/librispeech_asr/dataset_infos.json index 1652d9ceab4..63c9ef18282 100644 --- a/datasets/librispeech_asr/dataset_infos.json +++ b/datasets/librispeech_asr/dataset_infos.json @@ -1 +1 @@ -{"clean": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. 
To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.100": {"name": "train.100", "num_bytes": 6619683041, "num_examples": 28539, "dataset_name": "librispeech_asr"}, "train.360": {"name": "train.360", "num_bytes": 23898214592, "num_examples": 104014, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 359572231, "num_examples": 2703, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 367705423, "num_examples": 2620, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}, "http://www.openslr.org/resources/12/train-clean-360.tar.gz": {"num_bytes": 23049477885, "checksum": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"}}, "download_size": 30121377654, "post_processing_size": null, "dataset_size": 31245175287, "size_in_bytes": 61366552941}, "other": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. 
To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "_storage_dtype": "struct", "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "other", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.500": {"name": "train.500", "num_bytes": 31810256902, "num_examples": 148688, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 337283304, "num_examples": 2864, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 352396474, "num_examples": 2939, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/test-other.tar.gz": {"num_bytes": 328757843, "checksum": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"}, "http://www.openslr.org/resources/12/dev-other.tar.gz": {"num_bytes": 314305928, "checksum": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"}, "http://www.openslr.org/resources/12/train-other-500.tar.gz": {"num_bytes": 30593501606, "checksum": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"}}, "download_size": 31236565377, "post_processing_size": null, "dataset_size": 32499936680, "size_in_bytes": 63736502057}} \ No newline at end of file +{"clean": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. 
To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "decode": true, "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "clean", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.100": {"name": "train.100", "num_bytes": 6619683041, "num_examples": 28539, "dataset_name": "librispeech_asr"}, "train.360": {"name": "train.360", "num_bytes": 23898214592, "num_examples": 104014, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 359572231, "num_examples": 2703, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 367705423, "num_examples": 2620, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}, "http://www.openslr.org/resources/12/train-clean-360.tar.gz": {"num_bytes": 23049477885, "checksum": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"}}, "download_size": 30121377654, "post_processing_size": null, "dataset_size": 31245175287, "size_in_bytes": 61366552941}, "other": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n\nNote that in order to limit the required storage for preparing this dataset, the audio\nis stored in the .flac format and is not converted to a float32 array. 
To convert, the audio\nfile to a float32 array, please make use of the `.map()` function as follows:\n\n\n```python\nimport soundfile as sf\n\ndef map_to_array(batch):\n speech_array, _ = sf.read(batch[\"file\"])\n batch[\"speech\"] = speech_array\n return batch\n\ndataset = dataset.map(map_to_array, remove_columns=[\"file\"])\n```\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "decode": true, "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "other", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.500": {"name": "train.500", "num_bytes": 31810256902, "num_examples": 148688, "dataset_name": "librispeech_asr"}, "validation": {"name": "validation", "num_bytes": 337283304, "num_examples": 2864, "dataset_name": "librispeech_asr"}, "test": {"name": "test", "num_bytes": 352396474, "num_examples": 2939, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/test-other.tar.gz": {"num_bytes": 328757843, "checksum": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"}, "http://www.openslr.org/resources/12/dev-other.tar.gz": {"num_bytes": 314305928, "checksum": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"}, "http://www.openslr.org/resources/12/train-other-500.tar.gz": {"num_bytes": 30593501606, "checksum": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"}}, "download_size": 31236565377, "post_processing_size": null, "dataset_size": 32499936680, "size_in_bytes": 63736502057}, "all": {"description": "LibriSpeech is a corpus of approximately 1000 hours of read English speech with sampling rate of 16 kHz,\nprepared by Vassil Panayotov with the assistance of Daniel Povey. 
The data is derived from read\naudiobooks from the LibriVox project, and has been carefully segmented and aligned.87\n", "citation": "@inproceedings{panayotov2015librispeech,\n title={Librispeech: an ASR corpus based on public domain audio books},\n author={Panayotov, Vassil and Chen, Guoguo and Povey, Daniel and Khudanpur, Sanjeev},\n booktitle={Acoustics, Speech and Signal Processing (ICASSP), 2015 IEEE International Conference on},\n pages={5206--5210},\n year={2015},\n organization={IEEE}\n}\n", "homepage": "http://www.openslr.org/12", "license": "", "features": {"file": {"dtype": "string", "id": null, "_type": "Value"}, "audio": {"sampling_rate": 16000, "mono": true, "decode": true, "id": null, "_type": "Audio"}, "text": {"dtype": "string", "id": null, "_type": "Value"}, "speaker_id": {"dtype": "int64", "id": null, "_type": "Value"}, "chapter_id": {"dtype": "int64", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": {"input": "file", "output": "text"}, "task_templates": [{"task": "automatic-speech-recognition", "audio_column": "audio", "transcription_column": "text"}], "builder_name": "librispeech_asr", "config_name": "all", "version": {"version_str": "2.1.0", "description": "", "major": 2, "minor": 1, "patch": 0}, "splits": {"train.clean.100": {"name": "train.clean.100", "num_bytes": 6627791685, "num_examples": 28539, "dataset_name": "librispeech_asr"}, "train.clean.360": {"name": "train.clean.360", "num_bytes": 23927767570, "num_examples": 104014, "dataset_name": "librispeech_asr"}, "train.other.500": {"name": "train.other.500", "num_bytes": 31852502880, "num_examples": 148688, "dataset_name": "librispeech_asr"}, "validation.clean": {"name": "validation.clean", "num_bytes": 359505691, "num_examples": 2703, "dataset_name": "librispeech_asr"}, "validation.other": {"name": "validation.other", "num_bytes": 337213112, "num_examples": 2864, "dataset_name": "librispeech_asr"}, "test.clean": {"name": "test.clean", "num_bytes": 368449831, "num_examples": 2620, "dataset_name": "librispeech_asr"}, "test.other": {"name": "test.other", "num_bytes": 353231518, "num_examples": 2939, "dataset_name": "librispeech_asr"}}, "download_checksums": {"http://www.openslr.org/resources/12/dev-clean.tar.gz": {"num_bytes": 337926286, "checksum": "76f87d090650617fca0cac8f88b9416e0ebf80350acb97b343a85fa903728ab3"}, "http://www.openslr.org/resources/12/dev-other.tar.gz": {"num_bytes": 314305928, "checksum": "12661c48e8c3fe1de2c1caa4c3e135193bfb1811584f11f569dd12645aa84365"}, "http://www.openslr.org/resources/12/test-clean.tar.gz": {"num_bytes": 346663984, "checksum": "39fde525e59672dc6d1551919b1478f724438a95aa55f874b576be21967e6c23"}, "http://www.openslr.org/resources/12/test-other.tar.gz": {"num_bytes": 328757843, "checksum": "d09c181bba5cf717b3dee7d4d592af11a3ee3a09e08ae025c5506f6ebe961c29"}, "http://www.openslr.org/resources/12/train-clean-100.tar.gz": {"num_bytes": 6387309499, "checksum": "d4ddd1d5a6ab303066f14971d768ee43278a5f2a0aa43dc716b0e64ecbbbf6e2"}, "http://www.openslr.org/resources/12/train-clean-360.tar.gz": {"num_bytes": 23049477885, "checksum": "146a56496217e96c14334a160df97fffedd6e0a04e66b9c5af0d40be3c792ecf"}, "http://www.openslr.org/resources/12/train-other-500.tar.gz": {"num_bytes": 30593501606, "checksum": "ddb22f27f96ec163645d53215559df6aa36515f26e01dd70798188350adcb6d2"}}, "download_size": 61357943031, "post_processing_size": null, "dataset_size": 63826462287, "size_in_bytes": 125184405318}} \ No newline at end 
of file From 2adb2d710c67a6802ebfbab08499b1443ca9d403 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 21 Apr 2022 17:09:35 +0000 Subject: [PATCH 8/9] up --- datasets/librispeech_asr/librispeech_asr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datasets/librispeech_asr/librispeech_asr.py b/datasets/librispeech_asr/librispeech_asr.py index b6112e066cc..f2405350dc8 100644 --- a/datasets/librispeech_asr/librispeech_asr.py +++ b/datasets/librispeech_asr/librispeech_asr.py @@ -257,7 +257,9 @@ def _generate_examples(self, files, local_extracted_archive): audio_file = f"{id_}.flac" speaker_id, chapter_id = [int(el) for el in id_.split("-")[:2]] audio_file = ( - os.path.join(local_extracted_archive, audio_file) if local_extracted_archive else None + os.path.join(local_extracted_archive, audio_file) + if local_extracted_archive + else audio_file ) transcripts.append( { From bce822175480793bce3b75b788aaf60248168286 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Thu, 21 Apr 2022 17:26:22 +0000 Subject: [PATCH 9/9] up --- datasets/librispeech_asr/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasets/librispeech_asr/README.md b/datasets/librispeech_asr/README.md index 8806cab4258..a3d6acf8b54 100644 --- a/datasets/librispeech_asr/README.md +++ b/datasets/librispeech_asr/README.md @@ -20,7 +20,7 @@ task_categories: - automatic-speech-recognition - audio-classification task_ids: -- audio-speaker-identification +- speaker-identification --- # Dataset Card for librispeech_asr
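
Taken together, the patches above make "all" the default config and expose per-subset splits (train.clean.100, train.clean.360, train.other.500, validation.clean/other, test.clean/other). A minimal usage sketch follows, assuming the `datasets` library and this updated loading script; split and column names are taken from the patches above and are not verified against a released version:

```python
# Hedged usage sketch for the new "all" config added in this patch series.
from datasets import load_dataset

# Non-streaming: archives are downloaded and (per PATCH 4/8) also extracted locally,
# so the "file" column is joined with the local extraction directory.
train = load_dataset("librispeech_asr", "all", split="train.clean.100")
print(train[0]["id"], train[0]["speaker_id"], train[0]["text"])

# Streaming: archives are iterated with dl_manager.iter_archive, so audio bytes
# are read from the tar on the fly instead of from extracted files.
val_stream = load_dataset("librispeech_asr", "all", split="validation.clean", streaming=True)
print(next(iter(val_stream))["text"])
```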