Sublinear tf idf (#173)
* Initial commit

* Normalize before dampen

* Touch ups

* Log instead of sqrt tf dampening

* Appease Stan

* Update CHANGELOG

* Finishing touches
andrewdalpino authored Apr 18, 2021
1 parent 7ad1840 commit e88bb07
Showing 6 changed files with 171 additions and 26 deletions.
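
The commit messages above pin down the weighting order introduced here: term frequencies are length-normalized first, then dampened with a logarithm rather than a square root, and finally scaled by the inverse document frequencies. The sketch below restates that per-document order using hypothetical fitted statistics in place of what fit() would actually learn; it mirrors the transform() hunk further down rather than quoting it.

```php
<?php

// Hypothetical fitted statistics (placeholders, not values from this commit).
$averageDocumentLength = 120.0;  // average token count per document seen during fitting
$idfs = [0.6, 1.4, 2.3];         // smoothed inverse document frequencies per feature column

$sample = [3, 0, 7]; // raw token counts for one document

$documentLength = array_sum($sample);

// Ratio of the average document length to this document's length.
$delta = $documentLength > 0 ? $averageDocumentLength / $documentLength : 1.0;

foreach ($sample as $column => &$value) {
    if ($value > 0) {
        $value *= $delta;            // 1. normalize by document length
        $value = 1.0 + log($value);  // 2. sublinear (logarithmic) dampening
        $value *= $idfs[$column];    // 3. weight by inverse document frequency
    }
}

unset($value);
```
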
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -1,5 +1,6 @@
- 1.0.0-beta2
- Interval Discretizer now uses variable width histograms
- Added TF-IDF sublinear TF scaling and document length normalization
- Dataset filterByColumn() is now filter()
- Added Lambda Function transformer from Extras
- Rename Dataset column methods to feature
2 changes: 1 addition & 1 deletion benchmarks/Transformers/TfIdfTransformerBench.php
@@ -35,7 +35,7 @@ public function setUp() : void

$this->dataset = Unlabeled::quick($samples);

$this->transformer = new TfIdfTransformer(1.0);
$this->transformer = new TfIdfTransformer();
}

/**
62 changes: 62 additions & 0 deletions benchmarks/Transformers/WordCountVectorizerBench.php
@@ -0,0 +1,62 @@
<?php

namespace Rubix\ML\Benchmarks\Transformers;

use Rubix\ML\Datasets\Unlabeled;
use Rubix\ML\Transformers\WordCountVectorizer;

/**
* @Groups({"Transformers"})
* @BeforeMethods({"setUp"})
*/
class WordCountVectorizerBench
{
protected const DATASET_SIZE = 10000;

protected const SAMPLE_TEXT = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec at nisl posuere, luctus sapien vel, maximus ex. Curabitur tincidunt, libero at commodo tempor, magna neque malesuada diam, vel blandit metus velit quis magna. Vestibulum auctor libero quam, eu ullamcorper nulla dapibus a. Mauris id ultricies sapien. Integer consequat mi eget vehicula vulputate. Mauris cursus nisi non semper dictum. Quisque luctus ex in tortor laoreet tincidunt. Vestibulum imperdiet purus sit amet sapien dignissim elementum. Mauris tincidunt eget ex eu laoreet. Etiam efficitur quam at purus sagittis hendrerit. Mauris tempus, sem in pulvinar imperdiet, lectus ipsum molestie ante, id semper nunc est sit amet sem. Nulla at justo eleifend, gravida neque eu, consequat arcu. Vivamus bibendum eleifend metus, id elementum orci aliquet ac. Praesent pellentesque nisi vitae tincidunt eleifend. Pellentesque quis ex et lorem laoreet hendrerit ut ac lorem. Aliquam non sagittis est.';

/**
* @var \Rubix\ML\Datasets\Dataset
*/
protected $dataset;

/**
* @var \Rubix\ML\Transformers\WordCountVectorizer
*/
protected $transformer;

/**
* @var array[]
*/
protected $aSamples;

/**
* @var array[]
*/
protected $bSamples;

public function setUp() : void
{
$samples = [];

for ($i = 0; $i < self::DATASET_SIZE; ++$i) {
$text = self::SAMPLE_TEXT;

$samples[] = [str_shuffle($text)];
}

$this->dataset = Unlabeled::quick($samples);

$this->transformer = new WordCountVectorizer(1000);
}

/**
* @Subject
* @Iterations(3)
* @OutputTimeUnit("milliseconds", precision=3)
*/
public function apply() : void
{
$this->dataset->apply($this->transformer);
}
}
17 changes: 13 additions & 4 deletions docs/transformers/tf-idf-transformer.md
@@ -1,7 +1,7 @@
<span style="float:right;"><a href="https://github.com/RubixML/ML/blob/master/src/Transformers/TfIdfTransformer.php">[source]</a></span>

# TF-IDF Transformer
*Term Frequency - Inverse Document Frequency* is a measurement of how important a word is to a document. The TF-IDF value increases proportionally (linearly) with the number of times a word appears in a document (*TF*) and is offset by the frequency of the word in the corpus (*IDF*).
*Term Frequency - Inverse Document Frequency* is a measurement of how important a word is to a document. The TF-IDF value increases with the number of times a word appears in a document (*TF*) and is offset by the frequency of the word in the corpus (*IDF*).

!!! note
TF-IDF Transformer assumes that its inputs are token frequency vectors such as those created by [Word Count Vectorizer](word-count-vectorizer.md).
@@ -13,13 +13,15 @@
## Parameters
| # | Name | Default | Type | Description |
|---|---|---|---|---|
| 1 | smoothing | 1.0 | float | The amount of additive Laplace smoothing to add to the inverse document frequencies (IDFs). |
| 1 | smoothing | 1.0 | float | The amount of additive (Laplace) smoothing to add to the IDFs. |
| 2 | dampening | false | bool | Should we apply a sub-linear function to dampen the effect of recurring tokens? |
| 3 | normalize | false | bool | Should we normalize by document length? |

## Example
```php
use Rubix\ML\Transformers\TfIdfTransformer;

$transformer = new TfIdfTransformer(1.0);
$transformer = new TfIdfTransformer(2.0, true, true);
```

## Additional Methods
@@ -28,5 +30,12 @@ Return the document frequencies calculated during fitting:
public dfs() : ?array
```

Return the average length of a document in tokens:
```php
public averageDocumentLength() : ?float
```

## References
[^1]: S. Robertson. (2003). Understanding Inverse Document Frequency: On theoretical arguments for IDF.
[^2]: S. Robertson et al. (2009). The Probabilistic Relevance Framework: BM25 and Beyond.
[^3]: C. D. Manning et al. (2009). An Introduction to Information Retrieval.
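
As a usage note that is not part of the diff itself: the transformer expects token frequency vectors, so it is normally applied after Word Count Vectorizer. A minimal sketch with a small, made-up corpus might look like this:

```php
<?php

use Rubix\ML\Datasets\Unlabeled;
use Rubix\ML\Transformers\WordCountVectorizer;
use Rubix\ML\Transformers\TfIdfTransformer;

// Hypothetical corpus of raw text samples.
$dataset = Unlabeled::quick([
    ['the quick brown fox jumps over the lazy dog'],
    ['the dog barks at the quick brown fox'],
]);

// Convert the text into token frequency vectors first ...
$dataset->apply(new WordCountVectorizer(1000));

// ... then apply smoothed, dampened, length-normalized TF-IDF weighting.
$dataset->apply(new TfIdfTransformer(1.0, true, true));
```

With dampening and normalize left at their defaults of false, the output reduces to the plain TF-IDF weighting that existed before this change.
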
113 changes: 93 additions & 20 deletions src/Transformers/TfIdfTransformer.php
@@ -4,29 +4,34 @@

use Rubix\ML\DataType;
use Rubix\ML\Persistable;
use Rubix\ML\Helpers\Params;
use Rubix\ML\Datasets\Dataset;
use Rubix\ML\Traits\AutotrackRevisions;
use Rubix\ML\Specifications\DatasetIsNotEmpty;
use Rubix\ML\Specifications\SpecificationChain;
use Rubix\ML\Specifications\SamplesAreCompatibleWithTransformer;
use Rubix\ML\Exceptions\InvalidArgumentException;
use Rubix\ML\Exceptions\RuntimeException;

use function array_fill;
use function array_sum;
use function log;

/**
* TF-IDF Transformer
*
* Term Frequency - Inverse Document Frequency is a measure of how important a word is to
* a document. The TF-IDF value increases proportionally (linearly) with the number of
* times a word appears in a document and is offset by the frequency of the word in the
* corpus.
* a document. The TF-IDF value increases with the number of times a word appears in a document
* and is offset by the frequency of the word in the corpus.
*
* > **Note**: TF-IDF Transformer assumes that its input is made up of term frequency
* vectors such as those created by Word Count Vectorizer.
*
* References:
* [1] S. Robertson. (2003). Understanding Inverse Document Frequency: On
* theoretical arguments for IDF.
* [1] S. Robertson. (2003). Understanding Inverse Document Frequency: On theoretical
* arguments for IDF.
* [2] S. Robertson et al. (2009). The Probabilistic Relevance Framework: BM25 and Beyond.
* [3] C. D. Manning et al. (2009). An Introduction to Information Retrieval.
*
* @category Machine Learning
* @package Rubix/ML
@@ -37,45 +42,76 @@ class TfIdfTransformer implements Transformer, Stateful, Elastic, Persistable
use AutotrackRevisions;

/**
* The amount of additive Laplace smoothing to add to the inverse document frequencies (IDFs).
* The amount of additive (Laplace) smoothing to add to the IDFs.
*
* @var float
*/
protected $smoothing;

/**
* The document frequencies of each word i.e. the number of times a word
* appeared in a document given the entire corpus.
* Should we apply a sub-linear function to dampen the effect of recurring tokens?
*
* @var bool
*/
protected $dampening;

/**
* Should we normalize by document length?
*
* @var bool
*/
protected $normalize;

/**
* The document frequencies of each word, i.e. the number of documents in which each word appears.
*
* @var int[]|null
*/
protected $dfs;

/**
* The inverse document frequency values for each feature column.
* The inverse document frequencies for each feature column.
*
* @var float[]|null
*/
protected $idfs;

/**
* The number of tokens fitted so far.
*
* @var int|null
*/
protected $tokenCount;

/**
* The number of documents (samples) that have been fitted so far.
*
* @var int
* @var int|null
*/
protected $n;

/**
* The average token count per document.
*
* @var float|null
*/
protected $n = 0;
protected $averageDocumentLength;

/**
* @param float $smoothing
* @param bool $dampening
* @param bool $normalize
*/
public function __construct(float $smoothing = 1.0)
public function __construct(float $smoothing = 1.0, bool $dampening = false, bool $normalize = false)
{
if ($smoothing <= 0.0) {
throw new InvalidArgumentException('Smoothing must be'
. " greater than 0, $smoothing given.");
}

$this->smoothing = $smoothing;
$this->dampening = $dampening;
$this->normalize = $normalize;
}

/**
@@ -99,7 +135,7 @@ public function compatibility() : array
*/
public function fitted() : bool
{
return isset($this->idfs);
return isset($this->idfs) and isset($this->averageDocumentLength);
}

/**
@@ -112,6 +148,16 @@ public function dfs() : ?array
return $this->dfs;
}

/**
* Return the average length of a document in tokens.
*
* @return float|null
*/
public function averageDocumentLength() : ?float
{
return $this->averageDocumentLength;
}

/**
* Fit the transformer to a dataset.
*
@@ -120,7 +166,7 @@ public function dfs() : ?array
public function fit(Dataset $dataset) : void
{
$this->dfs = array_fill(0, $dataset->numFeatures(), 0);
$this->n = 0;
$this->tokenCount = $this->n = 0;

$this->update($dataset);
}
@@ -133,7 +179,10 @@ public function fit(Dataset $dataset) : void
*/
public function update(Dataset $dataset) : void
{
SamplesAreCompatibleWithTransformer::with($dataset, $this)->check();
SpecificationChain::with([
new DatasetIsNotEmpty($dataset),
new SamplesAreCompatibleWithTransformer($dataset, $this),
])->check();

if ($this->dfs === null) {
$this->fit($dataset);
@@ -145,12 +194,16 @@ public function update(Dataset $dataset) : void
foreach ($sample as $column => $value) {
if ($value > 0) {
++$this->dfs[$column];

$this->tokenCount += $value;
}
}
}

$this->n += $dataset->numSamples();

$this->averageDocumentLength = $this->tokenCount / $this->n;

$nHat = $this->n + $this->smoothing;

$idfs = [];
@@ -170,14 +223,32 @@
*/
public function transform(array &$samples) : void
{
if ($this->idfs === null) {
if ($this->idfs === null or $this->averageDocumentLength === null) {
throw new RuntimeException('Transformer has not been fitted.');
}

foreach ($samples as &$sample) {
foreach ($sample as $column => &$tf) {
if ($tf > 0) {
$tf *= $this->idfs[$column];
if ($this->normalize) {
$documentLength = array_sum($sample);

if ($documentLength == 0) {
continue;
}

$delta = $this->averageDocumentLength / $documentLength;
}

foreach ($sample as $column => &$value) {
if ($value > 0) {
if (isset($delta)) {
$value *= $delta;
}

if ($this->dampening) {
$value = 1.0 + log($value);
}

$value *= $this->idfs[$column];
}
}
}
@@ -190,6 +261,8 @@ public function transform(array &$samples) : void
*/
public function __toString() : string
{
return "TF-IDF Transformer (smoothing: {$this->smoothing})";
return "TF-IDF Transformer (smoothing: {$this->smoothing}, dampening: "
. Params::toString($this->dampening) . ', normalize: '
. Params::toString($this->normalize) . ')';
}
}
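
Reading the new transform() logic above with both options enabled, the weight given to a term with raw count tf > 0 in document d works out to the expression below; the smoothed IDF computation itself sits in an elided hunk, so it is left symbolic here.

```math
w_{t,d} = \left(1 + \ln\!\left(\frac{\bar{L}}{L_d}\,\mathrm{tf}_{t,d}\right)\right) \cdot \mathrm{idf}_t,
\qquad L_d = \sum_{t'} \mathrm{tf}_{t',d},
\qquad \bar{L} = \text{average document length from fitting}
```

With normalize disabled the ratio $\bar{L}/L_d$ drops out, and with dampening disabled the weight is simply the (normalized) count times $\mathrm{idf}_t$.
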
2 changes: 1 addition & 1 deletion tests/Transformers/TfIdfTransformerTest.php
@@ -37,7 +37,7 @@ protected function setUp() : void
[0, 0, 0, 1, 2, 3, 0, 0, 4, 2, 0, 0, 1, 0, 2, 0, 1, 0, 0],
]);

$this->transformer = new TfIdfTransformer(1.0);
$this->transformer = new TfIdfTransformer(1.0, false, false);
}

/**
