diff --git a/nbs/00_basic.ipynb b/nbs/00_basic.ipynb index b402b2d..b581aac 100644 --- a/nbs/00_basic.ipynb +++ b/nbs/00_basic.ipynb @@ -131,7 +131,7 @@ "text/markdown": [ "---\n", "\n", - "[source](https://github.com/kurianbenoy/whisper_normalizer/blob/main/whisper_normalizer/base.py#L62){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/kurianbenoy/whisper_normalizer/blob/main/whisper_normalizer/basic.py#L62){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### BasicTextNormalizer\n", "\n", @@ -143,7 +143,7 @@ "text/plain": [ "---\n", "\n", - "[source](https://github.com/kurianbenoy/whisper_normalizer/blob/main/whisper_normalizer/base.py#L62){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "[source](https://github.com/kurianbenoy/whisper_normalizer/blob/main/whisper_normalizer/basic.py#L62){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", "\n", "### BasicTextNormalizer\n", "\n", @@ -174,37 +174,28 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'എന റ കമ പ യ ട ടറ ന എന റ ഭ ഷ'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "normalizer = BasicTextNormalizer()\n" + "normalizer = BasicTextNormalizer()\n", + "normalizer(\"എന്റെ കമ്പ്യൂട്ടറിനു് എന്റെ ഭാഷ\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "JSONDecodeError", - "evalue": "Expecting value: line 1 column 1 (char 0)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39m#| hide\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mnbdev\u001b[39;00m; nbdev\u001b[39m.\u001b[39;49mnbdev_export()\n", - "File \u001b[0;32m~/mambaforge/lib/python3.10/site-packages/fastcore/script.py:110\u001b[0m, in \u001b[0;36mcall_parse.._f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[39m@wraps\u001b[39m(func)\n\u001b[1;32m 108\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_f\u001b[39m(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs):\n\u001b[1;32m 109\u001b[0m mod \u001b[39m=\u001b[39m inspect\u001b[39m.\u001b[39mgetmodule(inspect\u001b[39m.\u001b[39mcurrentframe()\u001b[39m.\u001b[39mf_back)\n\u001b[0;32m--> 110\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m mod: \u001b[39mreturn\u001b[39;00m func(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[1;32m 111\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m SCRIPT_INFO\u001b[39m.\u001b[39mfunc \u001b[39mand\u001b[39;00m mod\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\u001b[39m==\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m__main__\u001b[39m\u001b[39m\"\u001b[39m: SCRIPT_INFO\u001b[39m.\u001b[39mfunc \u001b[39m=\u001b[39m func\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m\n\u001b[1;32m 112\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(sys\u001b[39m.\u001b[39margv)\u001b[39m>\u001b[39m\u001b[39m1\u001b[39m \u001b[39mand\u001b[39;00m sys\u001b[39m.\u001b[39margv[\u001b[39m1\u001b[39m]\u001b[39m==\u001b[39m\u001b[39m'\u001b[39m\u001b[39m'\u001b[39m: sys\u001b[39m.\u001b[39margv\u001b[39m.\u001b[39mpop(\u001b[39m1\u001b[39m)\n", - "File \u001b[0;32m~/mambaforge/lib/python3.10/site-packages/nbdev/doclinks.py:138\u001b[0m, in \u001b[0;36mnbdev_export\u001b[0;34m(path, **kwargs)\u001b[0m\n\u001b[1;32m 136\u001b[0m \u001b[39mif\u001b[39;00m os\u001b[39m.\u001b[39menviron\u001b[39m.\u001b[39mget(\u001b[39m'\u001b[39m\u001b[39mIN_TEST\u001b[39m\u001b[39m'\u001b[39m,\u001b[39m0\u001b[39m): \u001b[39mreturn\u001b[39;00m\n\u001b[1;32m 137\u001b[0m files \u001b[39m=\u001b[39m nbglob(path\u001b[39m=\u001b[39mpath, as_path\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\u001b[39m.\u001b[39msorted(\u001b[39m'\u001b[39m\u001b[39mname\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[0;32m--> 138\u001b[0m \u001b[39mfor\u001b[39;00m f \u001b[39min\u001b[39;00m files: nb_export(f)\n\u001b[1;32m 139\u001b[0m add_init(get_config()\u001b[39m.\u001b[39mlib_path)\n\u001b[1;32m 140\u001b[0m _build_modidx()\n", - "File \u001b[0;32m~/mambaforge/lib/python3.10/site-packages/nbdev/export.py:48\u001b[0m, in \u001b[0;36mnb_export\u001b[0;34m(nbname, lib_path, procs, debug, mod_maker, name)\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[39mif\u001b[39;00m lib_path \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m: lib_path \u001b[39m=\u001b[39m get_config()\u001b[39m.\u001b[39mlib_path\n\u001b[1;32m 47\u001b[0m exp \u001b[39m=\u001b[39m ExportModuleProc()\n\u001b[0;32m---> 48\u001b[0m nb \u001b[39m=\u001b[39m NBProcessor(nbname, [exp]\u001b[39m+\u001b[39;49mL(procs), debug\u001b[39m=\u001b[39;49mdebug)\n\u001b[1;32m 49\u001b[0m nb\u001b[39m.\u001b[39mprocess()\n\u001b[1;32m 50\u001b[0m \u001b[39mfor\u001b[39;00m mod,cells \u001b[39min\u001b[39;00m exp\u001b[39m.\u001b[39mmodules\u001b[39m.\u001b[39mitems():\n", - "File \u001b[0;32m~/mambaforge/lib/python3.10/site-packages/nbdev/process.py:92\u001b[0m, in \u001b[0;36mNBProcessor.__init__\u001b[0;34m(self, path, procs, nb, debug, rm_directives, process)\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m__init__\u001b[39m(\u001b[39mself\u001b[39m, path\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, procs\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, nb\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, debug\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m, rm_directives\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m, process\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m):\n\u001b[0;32m---> 92\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnb \u001b[39m=\u001b[39m read_nb(path) \u001b[39mif\u001b[39;00m nb \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39melse\u001b[39;00m nb\n\u001b[1;32m 93\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlang \u001b[39m=\u001b[39m nb_lang(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnb)\n\u001b[1;32m 94\u001b[0m \u001b[39mfor\u001b[39;00m cell \u001b[39min\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mnb\u001b[39m.\u001b[39mcells: cell\u001b[39m.\u001b[39mdirectives_ \u001b[39m=\u001b[39m extract_directives(cell, remove\u001b[39m=\u001b[39mrm_directives, lang\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mlang)\n", - "File \u001b[0;32m~/mambaforge/lib/python3.10/site-packages/execnb/nbio.py:57\u001b[0m, in \u001b[0;36mread_nb\u001b[0;34m(path)\u001b[0m\n\u001b[1;32m 55\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mread_nb\u001b[39m(path):\n\u001b[1;32m 56\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mReturn notebook at `path`\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m---> 57\u001b[0m res \u001b[39m=\u001b[39m dict2nb(_read_json(path, encoding\u001b[39m=\u001b[39;49m\u001b[39m'\u001b[39;49m\u001b[39mutf-8\u001b[39;49m\u001b[39m'\u001b[39;49m))\n\u001b[1;32m 58\u001b[0m res[\u001b[39m'\u001b[39m\u001b[39mpath_\u001b[39m\u001b[39m'\u001b[39m] \u001b[39m=\u001b[39m \u001b[39mstr\u001b[39m(path)\n\u001b[1;32m 59\u001b[0m \u001b[39mreturn\u001b[39;00m res\n", - "File \u001b[0;32m~/mambaforge/lib/python3.10/site-packages/execnb/nbio.py:16\u001b[0m, in \u001b[0;36m_read_json\u001b[0;34m(self, encoding, errors)\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_read_json\u001b[39m(\u001b[39mself\u001b[39m, encoding\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, errors\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m):\n\u001b[0;32m---> 16\u001b[0m \u001b[39mreturn\u001b[39;00m loads(Path(\u001b[39mself\u001b[39;49m)\u001b[39m.\u001b[39;49mread_text(encoding\u001b[39m=\u001b[39;49mencoding, errors\u001b[39m=\u001b[39;49merrors))\n", - "File \u001b[0;32m~/mambaforge/lib/python3.10/json/__init__.py:346\u001b[0m, in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 341\u001b[0m s \u001b[39m=\u001b[39m s\u001b[39m.\u001b[39mdecode(detect_encoding(s), \u001b[39m'\u001b[39m\u001b[39msurrogatepass\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m 343\u001b[0m \u001b[39mif\u001b[39;00m (\u001b[39mcls\u001b[39m \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m object_hook \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m\n\u001b[1;32m 344\u001b[0m parse_int \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m parse_float \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m\n\u001b[1;32m 345\u001b[0m parse_constant \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m object_pairs_hook \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m kw):\n\u001b[0;32m--> 346\u001b[0m \u001b[39mreturn\u001b[39;00m _default_decoder\u001b[39m.\u001b[39;49mdecode(s)\n\u001b[1;32m 347\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mcls\u001b[39m \u001b[39mis\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 348\u001b[0m \u001b[39mcls\u001b[39m \u001b[39m=\u001b[39m JSONDecoder\n", - "File \u001b[0;32m~/mambaforge/lib/python3.10/json/decoder.py:337\u001b[0m, in \u001b[0;36mJSONDecoder.decode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 332\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mdecode\u001b[39m(\u001b[39mself\u001b[39m, s, _w\u001b[39m=\u001b[39mWHITESPACE\u001b[39m.\u001b[39mmatch):\n\u001b[1;32m 333\u001b[0m \u001b[39m\"\"\"Return the Python representation of ``s`` (a ``str`` instance\u001b[39;00m\n\u001b[1;32m 334\u001b[0m \u001b[39m containing a JSON document).\u001b[39;00m\n\u001b[1;32m 335\u001b[0m \n\u001b[1;32m 336\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 337\u001b[0m obj, end \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mraw_decode(s, idx\u001b[39m=\u001b[39;49m_w(s, \u001b[39m0\u001b[39;49m)\u001b[39m.\u001b[39;49mend())\n\u001b[1;32m 338\u001b[0m end \u001b[39m=\u001b[39m _w(s, end)\u001b[39m.\u001b[39mend()\n\u001b[1;32m 339\u001b[0m \u001b[39mif\u001b[39;00m end \u001b[39m!=\u001b[39m \u001b[39mlen\u001b[39m(s):\n", - "File \u001b[0;32m~/mambaforge/lib/python3.10/json/decoder.py:355\u001b[0m, in \u001b[0;36mJSONDecoder.raw_decode\u001b[0;34m(self, s, idx)\u001b[0m\n\u001b[1;32m 353\u001b[0m obj, end \u001b[39m=\u001b[39m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mscan_once(s, idx)\n\u001b[1;32m 354\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mStopIteration\u001b[39;00m \u001b[39mas\u001b[39;00m err:\n\u001b[0;32m--> 355\u001b[0m \u001b[39mraise\u001b[39;00m JSONDecodeError(\u001b[39m\"\u001b[39m\u001b[39mExpecting value\u001b[39m\u001b[39m\"\u001b[39m, s, err\u001b[39m.\u001b[39mvalue) \u001b[39mfrom\u001b[39;00m \u001b[39mNone\u001b[39m\n\u001b[1;32m 356\u001b[0m \u001b[39mreturn\u001b[39;00m obj, end\n", - "\u001b[0;31mJSONDecodeError\u001b[0m: Expecting value: line 1 column 1 (char 0)" - ] - } - ], + "outputs": [], "source": [ "#| hide\n", "import nbdev; nbdev.nbdev_export()" diff --git a/nbs/01_english.ipynb b/nbs/01_english.ipynb index b99408d..470c75a 100644 --- a/nbs/01_english.ipynb +++ b/nbs/01_english.ipynb @@ -32,19 +32,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'whisper_normalizer'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[3], line 10\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtyping\u001b[39;00m \u001b[39mimport\u001b[39;00m Iterator, List, Match, Optional, Union\n\u001b[1;32m 8\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mmore_itertools\u001b[39;00m \u001b[39mimport\u001b[39;00m windowed\n\u001b[0;32m---> 10\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mwhisper_normalizer\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mbasic\u001b[39;00m \u001b[39mimport\u001b[39;00m remove_symbols_and_diacritics\n\u001b[1;32m 13\u001b[0m \u001b[39mclass\u001b[39;00m \u001b[39mEnglishNumberNormalizer\u001b[39;00m:\n\u001b[1;32m 14\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 15\u001b[0m \u001b[39m Convert any spelled-out numbers into arabic numbers, while handling:\u001b[39;00m\n\u001b[1;32m 16\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[39m - interpret successive single-digit numbers as nominal: `one oh one` -> `101`\u001b[39;00m\n\u001b[1;32m 22\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'whisper_normalizer'" - ] - } - ], + "outputs": [], "source": [ "# | export\n", "import json\n", @@ -52,6 +40,7 @@ "import re\n", "from fractions import Fraction\n", "from typing import Iterator, List, Match, Optional, Union\n", + "import urllib\n", "\n", "from more_itertools import windowed\n", "\n", @@ -493,9 +482,16 @@ " s = \" \".join(word for word in self.process_words(s.split()) if word is not None)\n", " s = self.postprocess(s)\n", "\n", - " return s\n", - "\n", - "\n", + " return s\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| export\n", "class EnglishSpellingNormalizer:\n", " \"\"\"\n", " Applies British-American spelling mappings as listed in [1].\n", @@ -504,14 +500,35 @@ " \"\"\"\n", "\n", " def __init__(self):\n", - " mapping_path = os.path.join(os.path.dirname(__file__), \"english.json\")\n", - " self.mapping = json.load(open(mapping_path))\n", + " response = urllib.request.urlopen(\"https://gist.githubusercontent.com/kurianbenoy/715c4528be9859ff64338f69416795c7/raw/936c565f059b81d007e2c52beb733b3a01937d90/openai_whisper.json\")\n", + " self.mapping = json.loads(response.read())\n", "\n", " def __call__(self, s: str):\n", " return \" \".join(self.mapping.get(word, word) for word in s.split())\n", "\n" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'accessorize'" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n = EnglishSpellingNormalizer()\n", + "n(\"accessorise\")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -604,8 +621,54 @@ "\n", " s = re.sub(r\"\\s+\", \" \", s) # replace any successive whitespaces with a space\n", "\n", - " return s\n" + " return s" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "---\n", + "\n", + "[source](https://github.com/kurianbenoy/whisper_normalizer/blob/main/whisper_normalizer/english.py#L473){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "\n", + "### EnglishTextNormalizer\n", + "\n", + "> EnglishTextNormalizer ()\n", + "\n", + "Initialize self. See help(type(self)) for accurate signature." + ], + "text/plain": [ + "---\n", + "\n", + "[source](https://github.com/kurianbenoy/whisper_normalizer/blob/main/whisper_normalizer/english.py#L473){target=\"_blank\" style=\"float:right; font-size:smaller\"}\n", + "\n", + "### EnglishTextNormalizer\n", + "\n", + "> EnglishTextNormalizer ()\n", + "\n", + "Initialize self. See help(type(self)) for accurate signature." + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "show_doc(EnglishTextNormalizer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/whisper_normalizer/basic.py b/whisper_normalizer/basic.py index 41a9874..cfeea86 100644 --- a/whisper_normalizer/basic.py +++ b/whisper_normalizer/basic.py @@ -29,6 +29,7 @@ "Ł": "L", } + def remove_symbols_and_diacritics(s: str, keep=""): """ Replace any other markers, symbols, and punctuations with a space, @@ -57,7 +58,6 @@ def remove_symbols(s: str): for c in unicodedata.normalize("NFKC", s) ) - # %% ../nbs/00_basic.ipynb 4 class BasicTextNormalizer: def __init__(self, remove_diacritics: bool = False, split_letters: bool = False): diff --git a/whisper_normalizer/english.py b/whisper_normalizer/english.py index 060a0a9..3cfcde4 100644 --- a/whisper_normalizer/english.py +++ b/whisper_normalizer/english.py @@ -9,6 +9,7 @@ import re from fractions import Fraction from typing import Iterator, List, Match, Optional, Union +import urllib from more_itertools import windowed @@ -452,7 +453,7 @@ def __call__(self, s: str): return s - +# %% ../nbs/01_english.ipynb 4 class EnglishSpellingNormalizer: """ Applies British-American spelling mappings as listed in [1]. @@ -461,15 +462,15 @@ class EnglishSpellingNormalizer: """ def __init__(self): - mapping_path = os.path.join(os.path.dirname(__file__), "english.json") - self.mapping = json.load(open(mapping_path)) + response = urllib.request.urlopen( + "https://gist.githubusercontent.com/kurianbenoy/715c4528be9859ff64338f69416795c7/raw/936c565f059b81d007e2c52beb733b3a01937d90/openai_whisper.json" + ) + self.mapping = json.loads(response.read()) def __call__(self, s: str): return " ".join(self.mapping.get(word, word) for word in s.split()) - - -# %% ../nbs/01_english.ipynb 4 +# %% ../nbs/01_english.ipynb 6 class EnglishTextNormalizer: def __init__(self): self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b" @@ -556,4 +557,3 @@ def __call__(self, s: str): s = re.sub(r"\s+", " ", s) # replace any successive whitespaces with a space return s -