Skip to content

Commit

Permalink
initial version of whisper normalizer
Browse files Browse the repository at this point in the history
  • Loading branch information
kurianbenoy committed Mar 21, 2023
1 parent c47942d commit f65fe64
Show file tree
Hide file tree
Showing 4 changed files with 107 additions and 53 deletions.
43 changes: 17 additions & 26 deletions nbs/00_basic.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@
"text/markdown": [
"---\n",
"\n",
"[source](https://github.com/kurianbenoy/whisper_normalizer/blob/main/whisper_normalizer/base.py#L62){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"[source](https://github.com/kurianbenoy/whisper_normalizer/blob/main/whisper_normalizer/basic.py#L62){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"### BasicTextNormalizer\n",
"\n",
Expand All @@ -143,7 +143,7 @@
"text/plain": [
"---\n",
"\n",
"[source](https://github.com/kurianbenoy/whisper_normalizer/blob/main/whisper_normalizer/base.py#L62){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"[source](https://github.com/kurianbenoy/whisper_normalizer/blob/main/whisper_normalizer/basic.py#L62){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"### BasicTextNormalizer\n",
"\n",
Expand Down Expand Up @@ -174,37 +174,28 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"'എന റ കമ പ യ ട ടറ ന എന റ ഭ ഷ'"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"normalizer = BasicTextNormalizer()\n"
"normalizer = BasicTextNormalizer()\n",
"normalizer(\"എന്റെ കമ്പ്യൂട്ടറിനു് എന്റെ ഭാഷ\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "JSONDecodeError",
"evalue": "Expecting value: line 1 column 1 (char 0)",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m#| hide\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnbdev\u001b[39;00m; nbdev\u001b[39m.\u001b[39;49mnbdev_export()\n",
"File \u001b[0;32m~/mambaforge/lib/python3.10/site-packages/fastcore/script.py:110\u001b[0m, in \u001b[0;36mcall_parse.<locals>._f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[39m@wraps\u001b[39m(func)\n\u001b[1;32m 108\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_f\u001b[39m(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m 109\u001b[0m mod \u001b[39m=\u001b[39m inspect\u001b[39m.\u001b[39mgetmodule(inspect\u001b[39m.\u001b[39mcurrentframe()\u001b[39m.\u001b[39mf_back)\n\u001b[0;32m--> 110\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m mod: \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 111\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m SCRIPT_INFO\u001b[39m.\u001b[39mfunc \u001b[39mand\u001b[39;00m mod\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m==\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m__main__\u001b[39m\u001b[39m\"\u001b[39m: SCRIPT_INFO\u001b[39m.\u001b[39mfunc \u001b[39m=\u001b[39m func\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(sys\u001b[39m.\u001b[39margv)\u001b[39m>\u001b[39m\u001b[39m1\u001b[39m \u001b[39mand\u001b[39;00m sys\u001b[39m.\u001b[39margv[\u001b[39m1\u001b[39m]\u001b[39m==\u001b[39m\u001b[39m'\u001b[39m\u001b[39m'\u001b[39m: sys\u001b[39m.\u001b[39margv\u001b[39m.\u001b[39mpop(\u001b[39m1\u001b[39m)\n",
"File \u001b[0;32m~/mambaforge/lib/python3.10/site-packages/nbdev/doclinks.py:138\u001b[0m, in \u001b[0;36mnbdev_export\u001b[0;34m(path, **kwargs)\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[39mif\u001b[39;00m os\u001b[39m.\u001b[39menviron\u001b[39m.\u001b[39mget(\u001b[39m'\u001b[39m\u001b[39mIN_TEST\u001b[39m\u001b[39m'\u001b[39m,\u001b[39m0\u001b[39m): \u001b[39mreturn\u001b[39;00m\n\u001b[1;32m 137\u001b[0m files \u001b[39m=\u001b[39m nbglob(path\u001b[39m=\u001b[39mpath, as_path\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\u001b[39m.\u001b[39msorted(\u001b[39m'\u001b[39m\u001b[39mname\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m--> 138\u001b[0m \u001b[39mfor\u001b[39;00m f \u001b[39min\u001b[39;00m files: nb_export(f)\n\u001b[1;32m 139\u001b[0m add_init(get_config()\u001b[39m.\u001b[39mlib_path)\n\u001b[1;32m 140\u001b[0m _build_modidx()\n",
"File \u001b[0;32m~/mambaforge/lib/python3.10/site-packages/nbdev/export.py:48\u001b[0m, in \u001b[0;36mnb_export\u001b[0;34m(nbname, lib_path, procs, debug, mod_maker, name)\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[39mif\u001b[39;00m lib_path \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m: lib_path \u001b[39m=\u001b[39m get_config()\u001b[39m.\u001b[39mlib_path\n\u001b[1;32m 47\u001b[0m exp \u001b[39m=\u001b[39m ExportModuleProc()\n\u001b[0;32m---> 48\u001b[0m nb \u001b[39m=\u001b[39m NBProcessor(nbname, [exp]\u001b[39m+\u001b[39;49mL(procs), debug\u001b[39m=\u001b[39;49mdebug)\n\u001b[1;32m 49\u001b[0m nb\u001b[39m.\u001b[39mprocess()\n\u001b[1;32m 50\u001b[0m \u001b[39mfor\u001b[39;00m mod,cells \u001b[39min\u001b[39;00m exp\u001b[39m.\u001b[39mmodules\u001b[39m.\u001b[39mitems():\n",
"File \u001b[0;32m~/mambaforge/lib/python3.10/site-packages/nbdev/process.py:92\u001b[0m, in \u001b[0;36mNBProcessor.__init__\u001b[0;34m(self, path, procs, nb, debug, rm_directives, process)\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\u001b[39mself\u001b[39m, path\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, procs\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, nb\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, debug\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, rm_directives\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, process\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m):\n\u001b[0;32m---> 92\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnb \u001b[39m=\u001b[39m read_nb(path) \u001b[39mif\u001b[39;00m nb \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m nb\n\u001b[1;32m 93\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlang \u001b[39m=\u001b[39m nb_lang(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnb)\n\u001b[1;32m 94\u001b[0m \u001b[39mfor\u001b[39;00m cell \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnb\u001b[39m.\u001b[39mcells: cell\u001b[39m.\u001b[39mdirectives_ \u001b[39m=\u001b[39m extract_directives(cell, remove\u001b[39m=\u001b[39mrm_directives, lang\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlang)\n",
"File \u001b[0;32m~/mambaforge/lib/python3.10/site-packages/execnb/nbio.py:57\u001b[0m, in \u001b[0;36mread_nb\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mread_nb\u001b[39m(path):\n\u001b[1;32m 56\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mReturn notebook at `path`\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m---> 57\u001b[0m res \u001b[39m=\u001b[39m dict2nb(_read_json(path, encoding\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mutf-8\u001b[39;49m\u001b[39m'\u001b[39;49m))\n\u001b[1;32m 58\u001b[0m res[\u001b[39m'\u001b[39m\u001b[39mpath_\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(path)\n\u001b[1;32m 59\u001b[0m \u001b[39mreturn\u001b[39;00m res\n",
"File \u001b[0;32m~/mambaforge/lib/python3.10/site-packages/execnb/nbio.py:16\u001b[0m, in \u001b[0;36m_read_json\u001b[0;34m(self, encoding, errors)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_read_json\u001b[39m(\u001b[39mself\u001b[39m, encoding\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, errors\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m):\n\u001b[0;32m---> 16\u001b[0m \u001b[39mreturn\u001b[39;00m loads(Path(\u001b[39mself\u001b[39;49m)\u001b[39m.\u001b[39;49mread_text(encoding\u001b[39m=\u001b[39;49mencoding, errors\u001b[39m=\u001b[39;49merrors))\n",
"File \u001b[0;32m~/mambaforge/lib/python3.10/json/__init__.py:346\u001b[0m, in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 341\u001b[0m s \u001b[39m=\u001b[39m s\u001b[39m.\u001b[39mdecode(detect_encoding(s), \u001b[39m'\u001b[39m\u001b[39msurrogatepass\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m 343\u001b[0m \u001b[39mif\u001b[39;00m (\u001b[39mcls\u001b[39m \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m object_hook \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m\n\u001b[1;32m 344\u001b[0m parse_int \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m parse_float \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m\n\u001b[1;32m 345\u001b[0m parse_constant \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m object_pairs_hook \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m kw):\n\u001b[0;32m--> 346\u001b[0m \u001b[39mreturn\u001b[39;00m _default_decoder\u001b[39m.\u001b[39;49mdecode(s)\n\u001b[1;32m 347\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mcls\u001b[39m \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 348\u001b[0m \u001b[39mcls\u001b[39m \u001b[39m=\u001b[39m JSONDecoder\n",
"File \u001b[0;32m~/mambaforge/lib/python3.10/json/decoder.py:337\u001b[0m, in \u001b[0;36mJSONDecoder.decode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdecode\u001b[39m(\u001b[39mself\u001b[39m, s, _w\u001b[39m=\u001b[39mWHITESPACE\u001b[39m.\u001b[39mmatch):\n\u001b[1;32m 333\u001b[0m \u001b[39m\"\"\"Return the Python representation of ``s`` (a ``str`` instance\u001b[39;00m\n\u001b[1;32m 334\u001b[0m \u001b[39m containing a JSON document).\u001b[39;00m\n\u001b[1;32m 335\u001b[0m \n\u001b[1;32m 336\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 337\u001b[0m obj, end \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mraw_decode(s, idx\u001b[39m=\u001b[39;49m_w(s, \u001b[39m0\u001b[39;49m)\u001b[39m.\u001b[39;49mend())\n\u001b[1;32m 338\u001b[0m end \u001b[39m=\u001b[39m _w(s, end)\u001b[39m.\u001b[39mend()\n\u001b[1;32m 339\u001b[0m \u001b[39mif\u001b[39;00m end \u001b[39m!=\u001b[39m \u001b[39mlen\u001b[39m(s):\n",
"File \u001b[0;32m~/mambaforge/lib/python3.10/json/decoder.py:355\u001b[0m, in \u001b[0;36mJSONDecoder.raw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 353\u001b[0m obj, end \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mscan_once(s, idx)\n\u001b[1;32m 354\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mStopIteration\u001b[39;00m \u001b[39mas\u001b[39;00m err:\n\u001b[0;32m--> 355\u001b[0m \u001b[39mraise\u001b[39;00m JSONDecodeError(\u001b[39m\"\u001b[39m\u001b[39mExpecting value\u001b[39m\u001b[39m\"\u001b[39m, s, err\u001b[39m.\u001b[39mvalue) \u001b[39mfrom\u001b[39;00m \u001b[39mNone\u001b[39m\n\u001b[1;32m 356\u001b[0m \u001b[39mreturn\u001b[39;00m obj, end\n",
"\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)"
]
}
],
"outputs": [],
"source": [
"#| hide\n",
"import nbdev; nbdev.nbdev_export()"
Expand Down
101 changes: 82 additions & 19 deletions nbs/01_english.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,26 +32,15 @@
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'whisper_normalizer'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[3], line 10\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtyping\u001b[39;00m \u001b[39mimport\u001b[39;00m Iterator, List, Match, Optional, Union\n\u001b[1;32m 8\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mmore_itertools\u001b[39;00m \u001b[39mimport\u001b[39;00m windowed\n\u001b[0;32m---> 10\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mwhisper_normalizer\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbasic\u001b[39;00m \u001b[39mimport\u001b[39;00m remove_symbols_and_diacritics\n\u001b[1;32m 13\u001b[0m \u001b[39mclass\u001b[39;00m \u001b[39mEnglishNumberNormalizer\u001b[39;00m:\n\u001b[1;32m 14\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[39m Convert any spelled-out numbers into arabic numbers, while handling:\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[39m - interpret successive single-digit numbers as nominal: `one oh one` -> `101`\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'whisper_normalizer'"
]
}
],
"outputs": [],
"source": [
"# | export\n",
"import json\n",
"import os\n",
"import re\n",
"from fractions import Fraction\n",
"from typing import Iterator, List, Match, Optional, Union\n",
"import urllib\n",
"\n",
"from more_itertools import windowed\n",
"\n",
Expand Down Expand Up @@ -493,9 +482,16 @@
" s = \" \".join(word for word in self.process_words(s.split()) if word is not None)\n",
" s = self.postprocess(s)\n",
"\n",
" return s\n",
"\n",
"\n",
" return s\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#| export\n",
"class EnglishSpellingNormalizer:\n",
" \"\"\"\n",
" Applies British-American spelling mappings as listed in [1].\n",
Expand All @@ -504,14 +500,35 @@
" \"\"\"\n",
"\n",
" def __init__(self):\n",
" mapping_path = os.path.join(os.path.dirname(__file__), \"english.json\")\n",
" self.mapping = json.load(open(mapping_path))\n",
" response = urllib.request.urlopen(\"https://gist.githubusercontent.com/kurianbenoy/715c4528be9859ff64338f69416795c7/raw/936c565f059b81d007e2c52beb733b3a01937d90/openai_whisper.json\")\n",
" self.mapping = json.loads(response.read())\n",
"\n",
" def __call__(self, s: str):\n",
" return \" \".join(self.mapping.get(word, word) for word in s.split())\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'accessorize'"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"n = EnglishSpellingNormalizer()\n",
"n(\"accessorise\")"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -604,8 +621,54 @@
"\n",
" s = re.sub(r\"\\s+\", \" \", s) # replace any successive whitespaces with a space\n",
"\n",
" return s\n"
" return s"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/markdown": [
"---\n",
"\n",
"[source](https://github.com/kurianbenoy/whisper_normalizer/blob/main/whisper_normalizer/english.py#L473){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"### EnglishTextNormalizer\n",
"\n",
"> EnglishTextNormalizer ()\n",
"\n",
"Initialize self. See help(type(self)) for accurate signature."
],
"text/plain": [
"---\n",
"\n",
"[source](https://github.com/kurianbenoy/whisper_normalizer/blob/main/whisper_normalizer/english.py#L473){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n",
"\n",
"### EnglishTextNormalizer\n",
"\n",
"> EnglishTextNormalizer ()\n",
"\n",
"Initialize self. See help(type(self)) for accurate signature."
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"show_doc(EnglishTextNormalizer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down
2 changes: 1 addition & 1 deletion whisper_normalizer/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
"Ł": "L",
}


def remove_symbols_and_diacritics(s: str, keep=""):
"""
Replace any other markers, symbols, and punctuations with a space,
Expand Down Expand Up @@ -57,7 +58,6 @@ def remove_symbols(s: str):
for c in unicodedata.normalize("NFKC", s)
)


# %% ../nbs/00_basic.ipynb 4
class BasicTextNormalizer:
def __init__(self, remove_diacritics: bool = False, split_letters: bool = False):
Expand Down
14 changes: 7 additions & 7 deletions whisper_normalizer/english.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import re
from fractions import Fraction
from typing import Iterator, List, Match, Optional, Union
import urllib

from more_itertools import windowed

Expand Down Expand Up @@ -452,7 +453,7 @@ def __call__(self, s: str):

return s


# %% ../nbs/01_english.ipynb 4
class EnglishSpellingNormalizer:
"""
Applies British-American spelling mappings as listed in [1].
Expand All @@ -461,15 +462,15 @@ class EnglishSpellingNormalizer:
"""

def __init__(self):
    """Load the British-American spelling table into ``self.mapping``.

    The table is fetched as JSON from a fixed gist URL every time an
    instance is constructed, so construction requires network access.

    Raises:
        urllib.error.URLError: if the mapping cannot be downloaded.
        ValueError: if the downloaded payload is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    # A bare ``import urllib`` does not load the ``request`` submodule, so
    # ``urllib.request`` may be missing at call time; import it explicitly.
    import urllib.request

    mapping_url = (
        "https://gist.githubusercontent.com/kurianbenoy/715c4528be9859ff64338f69416795c7/raw/936c565f059b81d007e2c52beb733b3a01937d90/openai_whisper.json"
    )
    # Use the response as a context manager so the connection is closed
    # even if reading or decoding fails.
    with urllib.request.urlopen(mapping_url) as response:
        self.mapping = json.loads(response.read())

def __call__(self, s: str):
return " ".join(self.mapping.get(word, word) for word in s.split())



# %% ../nbs/01_english.ipynb 4
# %% ../nbs/01_english.ipynb 6
class EnglishTextNormalizer:
def __init__(self):
self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
Expand Down Expand Up @@ -556,4 +557,3 @@ def __call__(self, s: str):
s = re.sub(r"\s+", " ", s) # replace any successive whitespaces with a space

return s

0 comments on commit f65fe64

Please sign in to comment.