Sublinear tf idf (#173)
* Initial commit

* Normalize before dampen

* Touch ups

* Log instead of sqrt tf dampening

* Appease Stan

* Update CHANGELOG

* Finishing touches
andrewdalpino authored Apr 18, 2021
1 parent 7ad1840 commit e88bb07
Showing 6 changed files with 171 additions and 26 deletions.
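
The commit messages above pin down the weighting order introduced here: term frequencies are length-normalized first, then dampened with a logarithm rather than a square root, and finally scaled by the inverse document frequencies. The sketch below restates that per-document order using hypothetical fitted statistics in place of what fit() would actually learn; it mirrors the transform() hunk further down rather than quoting it.

```php
<?php

// Hypothetical fitted statistics (placeholders, not values from this commit).
$averageDocumentLength = 120.0;  // average token count per document seen during fitting
$idfs = [0.6, 1.4, 2.3];         // smoothed inverse document frequencies per feature column

$sample = [3, 0, 7]; // raw token counts for one document

$documentLength = array_sum($sample);

// Ratio of the average document length to this document's length.
$delta = $documentLength > 0 ? $averageDocumentLength / $documentLength : 1.0;

foreach ($sample as $column => &$value) {
    if ($value > 0) {
        $value *= $delta;            // 1. normalize by document length
        $value = 1.0 + log($value);  // 2. sublinear (logarithmic) dampening
        $value *= $idfs[$column];    // 3. weight by inverse document frequency
    }
}

unset($value);
```
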
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -1,5 +1,6 @@
- 1.0.0-beta2
- Interval Discretizer now uses variable width histograms
- Added TF-IDF sublinear TF scaling and document length normalization
- Dataset filterByColumn() is now filter()
- Added Lambda Function transformer from Extras
- Rename Dataset column methods to feature
2 changes: 1 addition & 1 deletion benchmarks/Transformers/TfIdfTransformerBench.php
@@ -35,7 +35,7 @@ public function setUp() : void

$this->dataset = Unlabeled::quick($samples);

$this->transformer = new TfIdfTransformer(1.0);
$this->transformer = new TfIdfTransformer();
}

/**
62 changes: 62 additions & 0 deletions benchmarks/Transformers/WordCountVectorizerBench.php
@@ -0,0 +1,62 @@
<?php

namespace Rubix\ML\Benchmarks\Transformers;

use Rubix\ML\Datasets\Unlabeled;
use Rubix\ML\Transformers\WordCountVectorizer;

/**
* @Groups({"Transformers"})
* @BeforeMethods({"setUp"})
*/
class WordCountVectorizerBench
{
protected const DATASET_SIZE = 10000;

protected const SAMPLE_TEXT = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Donec at nisl posuere, luctus sapien vel, maximus ex. Curabitur tincidunt, libero at commodo tempor, magna neque malesuada diam, vel blandit metus velit quis magna. Vestibulum auctor libero quam, eu ullamcorper nulla dapibus a. Mauris id ultricies sapien. Integer consequat mi eget vehicula vulputate. Mauris cursus nisi non semper dictum. Quisque luctus ex in tortor laoreet tincidunt. Vestibulum imperdiet purus sit amet sapien dignissim elementum. Mauris tincidunt eget ex eu laoreet. Etiam efficitur quam at purus sagittis hendrerit. Mauris tempus, sem in pulvinar imperdiet, lectus ipsum molestie ante, id semper nunc est sit amet sem. Nulla at justo eleifend, gravida neque eu, consequat arcu. Vivamus bibendum eleifend metus, id elementum orci aliquet ac. Praesent pellentesque nisi vitae tincidunt eleifend. Pellentesque quis ex et lorem laoreet hendrerit ut ac lorem. Aliquam non sagittis est.';

/**
* @var \Rubix\ML\Datasets\Dataset
*/
protected $dataset;

/**
* @var \Rubix\ML\Transformers\WordCountVectorizer
*/
protected $transformer;

/**
* @var array[]
*/
protected $aSamples;

/**
* @var array[]
*/
protected $bSamples;

public function setUp() : void
{
$samples = [];

for ($i = 0; $i < self::DATASET_SIZE; ++$i) {
$text = self::SAMPLE_TEXT;

$samples[] = [str_shuffle($text)];
}

$this->dataset = Unlabeled::quick($samples);

$this->transformer = new WordCountVectorizer(1000);
}

/**
* @Subject
* @Iterations(3)
* @OutputTimeUnit("milliseconds", precision=3)
*/
public function apply() : void
{
$this->dataset->apply($this->transformer);
}
}
17 changes: 13 additions & 4 deletions docs/transformers/tf-idf-transformer.md
@@ -1,7 +1,7 @@
<span style="float:right;"><a href="https://github.com/RubixML/ML/blob/master/src/Transformers/TfIdfTransformer.php">[source]</a></span>

# TF-IDF Transformer
*Term Frequency - Inverse Document Frequency* is a measurement of how important a word is to a document. The TF-IDF value increases proportionally (linearly) with the number of times a word appears in a document (*TF*) and is offset by the frequency of the word in the corpus (*IDF*).
*Term Frequency - Inverse Document Frequency* is a measurement of how important a word is to a document. The TF-IDF value increases with the number of times a word appears in a document (*TF*) and is offset by the frequency of the word in the corpus (*IDF*).

!!! note
TF-IDF Transformer assumes that its inputs are token frequency vectors such as those created by [Word Count Vectorizer](word-count-vectorizer.md).
@@ -13,13 +13,15 @@
## Parameters
| # | Name | Default | Type | Description |
|---|---|---|---|---|
| 1 | smoothing | 1.0 | float | The amount of additive Laplace smoothing to add to the inverse document frequencies (IDFs). |
| 1 | smoothing | 1.0 | float | The amount of additive (Laplace) smoothing to add to the IDFs. |
| 2 | dampening | false | bool | Should we apply a sub-linear function to dampen the effect of recurring tokens? |
| 3 | normalize | false | bool | Should we normalize by document length? |

## Example
```php
use Rubix\ML\Transformers\TfIdfTransformer;

$transformer = new TfIdfTransformer(1.0);
$transformer = new TfIdfTransformer(2.0, true, true);
```

## Additional Methods
@@ -28,5 +30,12 @@ Return the document frequencies calculated during fitting:
public dfs() : ?array
```

Return the average length of a document in tokens:
```php
public averageDocumentLength() : ?float
```

## References
[^1]: S. Robertson. (2003). Understanding Inverse Document Frequency: On theoretical arguments for IDF.
[^2]: S. Robertson et al. (2009). The Probabilistic Relevance Framework: BM25 and Beyond.
[^3]: C. D. Manning et al. (2009). An Introduction to Information Retrieval.
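
As a usage note that is not part of the diff itself: the transformer expects token frequency vectors, so it is normally applied after Word Count Vectorizer. A minimal sketch with a small, made-up corpus might look like this:

```php
<?php

use Rubix\ML\Datasets\Unlabeled;
use Rubix\ML\Transformers\WordCountVectorizer;
use Rubix\ML\Transformers\TfIdfTransformer;

// Hypothetical corpus of raw text samples.
$dataset = Unlabeled::quick([
    ['the quick brown fox jumps over the lazy dog'],
    ['the dog barks at the quick brown fox'],
]);

// Convert the text into token frequency vectors first ...
$dataset->apply(new WordCountVectorizer(1000));

// ... then apply smoothed, dampened, length-normalized TF-IDF weighting.
$dataset->apply(new TfIdfTransformer(1.0, true, true));
```

With dampening and normalize left at their defaults of false, the output reduces to the plain TF-IDF weighting that existed before this change.
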
113 changes: 93 additions & 20 deletions src/Transformers/TfIdfTransformer.php
@@ -4,29 +4,34 @@

use Rubix\ML\DataType;
use Rubix\ML\Persistable;
use Rubix\ML\Helpers\Params;
use Rubix\ML\Datasets\Dataset;
use Rubix\ML\Traits\AutotrackRevisions;
use Rubix\ML\Specifications\DatasetIsNotEmpty;
use Rubix\ML\Specifications\SpecificationChain;
use Rubix\ML\Specifications\SamplesAreCompatibleWithTransformer;
use Rubix\ML\Exceptions\InvalidArgumentException;
use Rubix\ML\Exceptions\RuntimeException;

use function array_fill;
use function array_sum;
use function log;

/**
* TF-IDF Transformer
*
* Term Frequency - Inverse Document Frequency is a measure of how important a word is to
* a document. The TF-IDF value increases proportionally (linearly) with the number of
* times a word appears in a document and is offset by the frequency of the word in the
* corpus.
* a document. The TF-IDF value increases with the number of times a word appears in a document
* and is offset by the frequency of the word in the corpus.
*
* > **Note**: TF-IDF Transformer assumes that its input is made up of term frequency
* vectors such as those created by Word Count Vectorizer.
*
* References:
* [1] S. Robertson. (2003). Understanding Inverse Document Frequency: On
* theoretical arguments for IDF.
* [1] S. Robertson. (2003). Understanding Inverse Document Frequency: On theoretical
* arguments for IDF.
* [2] S. Robertson et al. (2009). The Probabilistic Relevance Framework: BM25 and Beyond.
* [3] C. D. Manning et al. (2009). An Introduction to Information Retrieval.
*
* @category Machine Learning
* @package Rubix/ML
@@ -37,45 +42,76 @@ class TfIdfTransformer implements Transformer, Stateful, Elastic, Persistable
use AutotrackRevisions;

/**
* The amount of additive Laplace smoothing to add to the inverse document frequencies (IDFs).
* The amount of additive (Laplace) smoothing to add to the IDFs.
*
* @var float
*/
protected $smoothing;

/**
* The document frequencies of each word i.e. the number of times a word
* appeared in a document given the entire corpus.
* Should we apply a sub-linear function to dampen the effect of recurring tokens?
*
* @var bool
*/
protected $dampening;

/**
* Should we normalize by document length?
*
* @var bool
*/
protected $normalize;

/**
* The document frequencies of each word, i.e. the number of documents in which each word appears.
*
* @var int[]|null
*/
protected $dfs;

/**
* The inverse document frequency values for each feature column.
* The inverse document frequencies for each feature column.
*
* @var float[]|null
*/
protected $idfs;

/**
* The number of tokens fitted so far.
*
* @var int|null
*/
protected $tokenCount;

/**
* The number of documents (samples) that have been fitted so far.
*
* @var int
* @var int|null
*/
protected $n;

/**
* The average token count per document.
*
* @var float|null
*/
protected $n = 0;
protected $averageDocumentLength;

/**
* @param float $smoothing
* @param bool $dampening
* @param bool $normalize
*/
public function __construct(float $smoothing = 1.0)
public function __construct(float $smoothing = 1.0, bool $dampening = false, bool $normalize = false)
{
if ($smoothing <= 0.0) {
throw new InvalidArgumentException('Smoothing must be'
. " greater than 0, $smoothing given.");
}

$this->smoothing = $smoothing;
$this->dampening = $dampening;
$this->normalize = $normalize;
}

/**
@@ -99,7 +135,7 @@ public function compatibility() : array
*/
public function fitted() : bool
{
return isset($this->idfs);
return isset($this->idfs) and isset($this->averageDocumentLength);
}

/**
@@ -112,6 +148,16 @@ public function dfs() : ?array
return $this->dfs;
}

/**
* Return the average length of a document in tokens.
*
* @return float|null
*/
public function averageDocumentLength() : ?float
{
return $this->averageDocumentLength;
}

/**
* Fit the transformer to a dataset.
*
@@ -120,7 +166,7 @@ public function dfs() : ?array
public function fit(Dataset $dataset) : void
{
$this->dfs = array_fill(0, $dataset->numFeatures(), 0);
$this->n = 0;
$this->tokenCount = $this->n = 0;

$this->update($dataset);
}
@@ -133,7 +179,10 @@ public function fit(Dataset $dataset) : void
*/
public function update(Dataset $dataset) : void
{
SamplesAreCompatibleWithTransformer::with($dataset, $this)->check();
SpecificationChain::with([
new DatasetIsNotEmpty($dataset),
new SamplesAreCompatibleWithTransformer($dataset, $this),
])->check();

if ($this->dfs === null) {
$this->fit($dataset);
@@ -145,12 +194,16 @@ public function update(Dataset $dataset) : void
foreach ($sample as $column => $value) {
if ($value > 0) {
++$this->dfs[$column];

$this->tokenCount += $value;
}
}
}

$this->n += $dataset->numSamples();

$this->averageDocumentLength = $this->tokenCount / $this->n;

$nHat = $this->n + $this->smoothing;

$idfs = [];
@@ -170,14 +223,32 @@
*/
public function transform(array &$samples) : void
{
if ($this->idfs === null) {
if ($this->idfs === null or $this->averageDocumentLength === null) {
throw new RuntimeException('Transformer has not been fitted.');
}

foreach ($samples as &$sample) {
foreach ($sample as $column => &$tf) {
if ($tf > 0) {
$tf *= $this->idfs[$column];
if ($this->normalize) {
$documentLength = array_sum($sample);

if ($documentLength == 0) {
continue;
}

$delta = $this->averageDocumentLength / $documentLength;
}

foreach ($sample as $column => &$value) {
if ($value > 0) {
if (isset($delta)) {
$value *= $delta;
}

if ($this->dampening) {
$value = 1.0 + log($value);
}

$value *= $this->idfs[$column];
}
}
}
@@ -190,6 +261,8 @@ public function transform(array &$samples) : void
*/
public function __toString() : string
{
return "TF-IDF Transformer (smoothing: {$this->smoothing})";
return "TF-IDF Transformer (smoothing: {$this->smoothing}, dampening: "
. Params::toString($this->dampening) . ', normalize: '
. Params::toString($this->normalize) . ')';
}
}
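
Reading the new transform() logic above with both options enabled, the weight given to a term with raw count tf > 0 in document d works out to the expression below; the smoothed IDF computation itself sits in an elided hunk, so it is left symbolic here.

```math
w_{t,d} = \left(1 + \ln\!\left(\frac{\bar{L}}{L_d}\,\mathrm{tf}_{t,d}\right)\right) \cdot \mathrm{idf}_t,
\qquad L_d = \sum_{t'} \mathrm{tf}_{t',d},
\qquad \bar{L} = \text{average document length from fitting}
```

With normalize disabled the ratio $\bar{L}/L_d$ drops out, and with dampening disabled the weight is simply the (normalized) count times $\mathrm{idf}_t$.
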
2 changes: 1 addition & 1 deletion tests/Transformers/TfIdfTransformerTest.php
@@ -37,7 +37,7 @@ protected function setUp() : void
[0, 0, 0, 1, 2, 3, 0, 0, 4, 2, 0, 0, 1, 0, 2, 0, 1, 0, 0],
]);

$this->transformer = new TfIdfTransformer(1.0);
$this->transformer = new TfIdfTransformer(1.0, false, false);
}

/**
