3.0.3 (#783)

* Bug fixes
MontrealCorpusTools · Mar 22, 2024 · 91fd82d · 91fd82d
1 parent 587ce0f
commit 91fd82d
Show file tree

Hide file tree

Showing 15 changed files with 48 additions and 49 deletions.
diff --git a/docs/source/changelog/changelog_3.0.rst b/docs/source/changelog/changelog_3.0.rst
@@ -5,6 +5,16 @@
 3.0 Changelog
 *************
 
+3.0.3
+=====
+
+- Fixed a regression where clitic words were merged even when textgrid cleanup is disabled
+- Fixed an issue with copying files when symlinks are not possible on Windows
+- Fixed an issue with using G2P models during training/alignment
+- Changed default feature config to set :code:`use_energy=True` and :code:`dither=0.0001`
+- Updated tokenization when lowercasing to remove the extra dot left over from the dotted capital :code:`İ` in Turkish
+- Fixed an issue where special disambiguation symbols were not always in the phone table
+
 3.0.2
 =====
 

diff --git a/docs/source/installation.rst b/docs/source/installation.rst
@@ -28,6 +28,11 @@ General installation
    2. :code:`conda install -c conda-forge mamba`
    3. :code:`mamba create -n aligner -c conda-forge montreal-forced-aligner`
 
+Updating Montreal Forced Aligner
+--------------------------------
+
+To update to the latest version, please run either :code:`conda update -c conda-forge montreal-forced-aligner --update-deps` or :code:`mamba update -c conda-forge montreal-forced-aligner --update-deps` if you have mamba installed.
+
 Installing SpeechBrain
 ----------------------
 
@@ -52,7 +57,7 @@ If you need to use an older version of MFA, you can install it via:
 
 More stable key versions:
 
-* Stable 3.0 release: :code:`conda install -c conda-forge montreal-forced-aligner=3.0.2`
+* Stable 3.0 release: :code:`conda update -c conda-forge montreal-forced-aligner`
 * Stable 2.2 release: :code:`conda install -c conda-forge montreal-forced-aligner=2.2.17 openfst=1.8.2 kaldi=5.5.1068`
 * Stable 2.1 release: :code:`conda install -c conda-forge montreal-forced-aligner=2.1.7 openfst=1.8.2 kaldi=5.5.1068`
 * Stable 2.0 release: :code:`conda install -c conda-forge montreal-forced-aligner=2.0.6 openfst=1.8.2 kaldi=5.5.1068`

diff --git a/montreal_forced_aligner/alignment/multiprocessing.py b/montreal_forced_aligner/alignment/multiprocessing.py
@@ -621,18 +621,18 @@ def _run(self) -> None:
                         ).symlink_to(likes_path)
                     except OSError:
                         shutil.copyfile(
-                            job.construct_path(self.working_directory, "ali", "ark", dict_id),
                             ali_path,
+                            job.construct_path(self.working_directory, "ali", "ark", dict_id),
                         )
                         shutil.copyfile(
-                            job.construct_path(self.working_directory, "words", "ark", dict_id),
                             words_path,
+                            job.construct_path(self.working_directory, "words", "ark", dict_id),
                         )
                         shutil.copyfile(
+                            likes_path,
                             job.construct_path(
                                 self.working_directory, "likelihoods", "ark", dict_id
                             ),
-                            likes_path,
                         )
 
 

diff --git a/montreal_forced_aligner/corpus/acoustic_corpus.py b/montreal_forced_aligner/corpus/acoustic_corpus.py
@@ -1087,7 +1087,6 @@ def load_corpus(self) -> None:
         all_begin = time.time()
         self.initialize_database()
         if self.dictionary_model is not None and not self.imported:
-            logger.debug(f"Using {self.phone_set_type}")
             self.dictionary_setup()
             logger.debug(f"Loaded dictionary in {time.time() - all_begin:.3f} seconds")
 

diff --git a/montreal_forced_aligner/corpus/base.py b/montreal_forced_aligner/corpus/base.py
@@ -727,6 +727,7 @@ def normalize_text(self) -> None:
                                 oovs.add(w)
                                 if self.language is Language.unknown:
                                     to_g2p.add((w, dict_id))
+                                    word_to_g2p_mapping[dict_id][w].add(w)
                                 else:
                                     to_g2p.add((pronunciation_text[i], dict_id))
                                     word_to_g2p_mapping[dict_id][w].add(pronunciation_text[i])
@@ -793,8 +794,8 @@ def normalize_text(self) -> None:
                     if g2p_model is not None:
                         from montreal_forced_aligner.g2p.generator import PyniniGenerator
 
+                        g2pped = {}
                         if isinstance(g2p_model, dict):
-                            g2pped = {}
                             for dict_name, g2p_model in g2p_model.items():
                                 dict_id = dict_name_to_id[dict_name]
                                 gen = PyniniGenerator(
@@ -811,7 +812,8 @@ def normalize_text(self) -> None:
                                 num_pronunciations=1,
                                 strict_graphemes=True,
                             )
-                            g2pped = gen.generate_pronunciations()
+                            dict_id = list(dictionaries.keys())[0]
+                            g2pped[dict_id] = gen.generate_pronunciations()
                         for dict_id, mapping in word_to_g2p_mapping.items():
                             log_file.write(f"For dictionary {dict_id}:\n")
                             for w, ps in mapping.items():

diff --git a/montreal_forced_aligner/corpus/features.py b/montreal_forced_aligner/corpus/features.py
@@ -593,7 +593,7 @@ class FeatureConfigMixin:
     def __init__(
         self,
         feature_type: str = "mfcc",
-        use_energy: bool = False,
+        use_energy: bool = True,
         frame_shift: int = 10,
         frame_length: int = 25,
         snip_edges: bool = False,
@@ -602,7 +602,7 @@ def __init__(
         sample_frequency: int = 16000,
         allow_downsample: bool = True,
         allow_upsample: bool = True,
-        dither: float = 0.0,
+        dither: float = 0.0001,
         energy_floor: float = 1.0,
         num_coefficients: int = 13,
         num_mel_bins: int = 23,
@@ -800,7 +800,7 @@ def mfcc_options(self) -> MetaDict:
             }
         options.update(
             {
-                "dither": 0.0,
+                "dither": 0.0001,
                 "energy_floor": 1.0,
             }
         )

diff --git a/montreal_forced_aligner/corpus/ivector_corpus.py b/montreal_forced_aligner/corpus/ivector_corpus.py
@@ -423,7 +423,7 @@ def collect_speaker_ivectors(self) -> None:
                 speaker_ids.append(speaker_id)
                 num_utts.append(utts)
                 ivector_normalize_length(ivector)
-                ivectors.append(FloatVector(ivector))
+                ivectors.append(DoubleVector(ivector))
             ivector_subtract_mean(ivectors)
             update_mapping = []
             for i in range(len(speaker_ids)):

diff --git a/montreal_forced_aligner/db.py b/montreal_forced_aligner/db.py
@@ -439,6 +439,9 @@ def phone_table(self):
         if not hasattr(self, "_phone_table"):
             if self.phone_symbol_table_path.exists():
                 self._phone_table = pywrapfst.SymbolTable.read_text(self.phone_symbol_table_path)
+                for k in ["#0", "#1", "#2"]:
+                    if not self._phone_table.member(k):
+                        self._phone_table.add_symbol(k)
             else:
                 self.phones_directory.mkdir(parents=True, exist_ok=True)
                 session = sqlalchemy.orm.Session.object_session(self)

diff --git a/montreal_forced_aligner/diarization/multiprocessing.py b/montreal_forced_aligner/diarization/multiprocessing.py
@@ -23,10 +23,9 @@
 
 import numpy as np
 import sqlalchemy
-from _kalpy.ivector import Plda, ivector_normalize_length, ivector_subtract_mean
-from _kalpy.matrix import FloatVector
+from _kalpy.ivector import Plda
 from kalpy.data import Segment
-from kalpy.ivector.data import IvectorArchive
+from kalpy.ivector.plda import PldaScorer
 from scipy.spatial import distance
 from sklearn import cluster, manifold, metrics, neighbors, preprocessing
 from sqlalchemy.orm import joinedload
@@ -83,7 +82,7 @@
 class PldaClassificationArguments(MfaArguments):
     """Arguments for :class:`~montreal_forced_aligner.diarization.multiprocessing.PldaClassificationFunction`"""
 
-    plda: Plda
+    plda_path: Path
     train_ivector_path: Path
     num_utts_path: Path
     use_xvector: bool
@@ -129,7 +128,7 @@ def visualize_clusters(
         tsne_iterations = 500
         mds_iterations = 150
     if metric_type is DistanceMetric.plda:
-        metric = plda.log_likelihood_distance
+        metric = plda.log_likelihood_distance_vectorized
     if manifold_algorithm is ManifoldAlgorithm.mds:
         if metric_type is DistanceMetric.cosine:
             to_fit = preprocessing.normalize(ivectors, norm="l2")
@@ -291,7 +290,7 @@ def cluster_matrix(
     to_fit = ivectors
     score_metric_params = None
     if score_metric == "plda" and cluster_type is not ClusterType.affinity:
-        score_metric = plda.log_likelihood_distance
+        score_metric = plda.log_likelihood_distance_vectorized
     if cluster_type is ClusterType.affinity:
         affinity = metric
         if metric is DistanceMetric.cosine:
@@ -486,27 +485,15 @@ class PldaClassificationFunction(KaldiFunction):
 
     def __init__(self, args: PldaClassificationArguments):
         super().__init__(args)
-        self.plda = args.plda
+        self.plda_path = args.plda_path
         self.train_ivector_path = args.train_ivector_path
         self.num_utts_path = args.num_utts_path
         self.use_xvector = args.use_xvector
 
     def _run(self):
         """Run the function"""
-
-        ivector_archive = IvectorArchive(
-            self.train_ivector_path, num_utterances_file_name=self.num_utts_path
-        )
-        speaker_ivectors = []
-        speaker_ids = []
-        num_utts = []
-        for speaker_id, ivector, utts in ivector_archive:
-            speaker_ids.append(speaker_id)
-            num_utts.append(utts)
-            ivector_normalize_length(ivector)
-            speaker_ivectors.append(FloatVector(ivector))
-        ivector_subtract_mean(speaker_ivectors)
-        speaker_ivectors = self.plda.transform_ivectors(speaker_ivectors, num_utts)
+        plda_scorer = PldaScorer(self.plda_path)
+        plda_scorer.load_speaker_ivectors(self.train_ivector_path, self.num_utts_path)
         with self.session() as session:
             job: Job = (
                 session.query(Job)
@@ -521,10 +508,7 @@ def _run(self):
                 .order_by(Utterance.kaldi_id)
             )
             for u_id, u_ivector in utterances:
-                ivector = FloatVector()
-                ivector.from_numpy(u_ivector)
-                ind, score = self.plda.classify_utterance(ivector, speaker_ivectors, num_utts)
-                speaker = speaker_ids[ind]
+                speaker, score = plda_scorer.classify_speaker(u_ivector)
                 self.callback((u_id, speaker, score))
 
 

diff --git a/montreal_forced_aligner/diarization/speaker_diarizer.py b/montreal_forced_aligner/diarization/speaker_diarizer.py
@@ -315,7 +315,7 @@ def plda_classification_arguments(self) -> List[PldaClassificationArguments]:
                 j.id,
                 getattr(self, "session" if config.USE_THREADING else "db_string", ""),
                 self.working_log_directory.joinpath(f"plda_classification.{j.id}.log"),
-                self.plda,
+                self.plda_path,
                 self.speaker_ivector_path,
                 self.num_utts_path,
                 self.use_xvector,

diff --git a/montreal_forced_aligner/dictionary/mixins.py b/montreal_forced_aligner/dictionary/mixins.py
@@ -18,7 +18,7 @@
 if TYPE_CHECKING:
     from montreal_forced_aligner.abc import MetaDict
 
-DEFAULT_PUNCTUATION = list(r'、。।，？！!@<>→"”()“„–,.:;—¿?¡：）!\\&%#*~【】，…‥「」『』〝〟″⟨⟩♪・‹›«»～′$+=‘۔')
+DEFAULT_PUNCTUATION = list(r'、。।，？！!@<>→"”()“„–,.:;—¿?¡：）!\\&%#*~【】，…‥「」『』〝〟″⟨⟩♪・‚‘‹›«»～′$+=‘۔')
 
 DEFAULT_WORD_BREAK_MARKERS = list(r'？！!()，,.:;¡¿?“„"”&~%#—…‥、。【】$+=〝〟″‹›«»・⟨⟩「」『』')
 

diff --git a/montreal_forced_aligner/models.py b/montreal_forced_aligner/models.py
@@ -535,7 +535,7 @@ def mfcc_options(self) -> MetaDict:
             "sample_frequency": self._meta["features"].get("sample_frequency", 16000),
             "frame_shift": self._meta["features"].get("frame_shift", 10),
             "frame_length": self._meta["features"].get("frame_length", 25),
-            "dither": self._meta["features"].get("dither", 0.0),
+            "dither": self._meta["features"].get("dither", 0.0001),
             "preemphasis_coefficient": self._meta["features"].get("preemphasis_coefficient", 0.97),
             "snip_edges": self._meta["features"].get("snip_edges", True),
             "num_mel_bins": self._meta["features"].get("num_mel_bins", 23),
@@ -882,7 +882,7 @@ def mfcc_options(self) -> MetaDict:
         """Parameters to use in computing MFCC features."""
         return {
             "use_energy": self._meta["features"].get("use_energy", False),
-            "dither": self._meta["features"].get("dither", 0.0),
+            "dither": self._meta["features"].get("dither", 0.0001),
             "energy_floor": self._meta["features"].get("energy_floor", 1.0),
             "num_coefficients": self._meta["features"].get("num_coefficients", 13),
             "num_mel_bins": self._meta["features"].get("num_mel_bins", 23),

diff --git a/montreal_forced_aligner/textgrid.py b/montreal_forced_aligner/textgrid.py
@@ -329,7 +329,8 @@ def construct_output_tiers(
             if include_original_text:
                 data[speaker_name]["utterances"] = []
         if (
-            data[speaker_name]["words"]
+            cleanup_textgrids
+            and data[speaker_name]["words"]
             and w_begin - data[speaker_name]["words"][-1].end < 0.02
             and clitic_marker
             and (

diff --git a/montreal_forced_aligner/tokenization/simple.py b/montreal_forced_aligner/tokenization/simple.py
@@ -74,6 +74,7 @@ def __call__(self, text) -> typing.Generator[str]:
         """
         if self.ignore_case:
             text = text.lower()
+            text = text.replace("i̇", "i")  # Turkish normalization
         if self.bracket_regex:
             for word_object in self.bracket_regex.finditer(text):
                 word = word_object.group(0)
@@ -458,7 +459,6 @@ def _compile_regexes(self) -> None:
                 self.final_clitic_regex = re.compile(rf"(?<=\w)({'|'.join(final_clitics)})$")
 
     def _dictionary_sanitize(self, text):
-
         words = self.sanitize_function(text)
         normalized_text = []
         normalized_character_text = []

diff --git a/tests/test_commandline_model.py b/tests/test_commandline_model.py
@@ -42,7 +42,6 @@ def test_download_error():
 
 
 def test_download_acoustic():
-
     command = ["model", "download", "acoustic", "german_mfa", "--ignore_cache"]
 
     result = click.testing.CliRunner(mix_stderr=False).invoke(
@@ -57,7 +56,7 @@ def test_download_acoustic():
     path = AcousticModel.get_pretrained_path("german_mfa")
     assert path.exists()
 
-    assert AcousticModel(path).version == "2.0.0rc4.dev19+ged818cb.d20220404"
+    assert AcousticModel(path).version == "3.0.0"
 
     command = ["model", "download", "acoustic", "german_mfa", "--version", "2.0.0"]
 
@@ -73,11 +72,10 @@ def test_download_acoustic():
     path = AcousticModel.get_pretrained_path("german_mfa")
     assert path.exists()
 
-    assert AcousticModel(path).version != "2.0.0rc4.dev19+ged818cb.d20220404"
+    assert AcousticModel(path).version != "3.0.0"
 
 
 def test_download_g2p():
-
     command = [
         "model",
         "download",
@@ -98,7 +96,6 @@ def test_download_g2p():
 
 
 def test_download_dictionary():
-
     command = [
         "model",
         "download",
@@ -120,7 +117,6 @@ def test_download_dictionary():
 
 
 def test_download_list_acoustic():
-
     command = ["model", "download", "acoustic", "--ignore_cache"]
 
     result = click.testing.CliRunner(mix_stderr=False).invoke(
@@ -135,7 +131,6 @@ def test_download_list_acoustic():
 
 
 def test_download_list_dictionary():
-
     command = [
         "model",
         "download",