Skip to content

Commit

Permalink
Merge pull request #198 from RubixML/1.3
Browse files Browse the repository at this point in the history
1.3
  • Loading branch information
andrewdalpino authored Dec 4, 2021
2 parents 08ef9b0 + 74195c1 commit ffcae74
Show file tree
Hide file tree
Showing 27 changed files with 1,436 additions and 334 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
- 1.3.0
- Switch back to original fork of Tensor
- Added `maxBins` hyper-parameter to CART-based learners
- Added streaming Deduplicator extractor
- Added the SiLU activation function
- Added Swish activation layer

- 1.2.4
- Refactor neural network parameter updates
- Allow set null logger
Expand Down
57 changes: 57 additions & 0 deletions benchmarks/NeuralNet/ActivationFunctions/SiLUBench.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
<?php

namespace Rubix\ML\Benchmarks\NeuralNet\ActivationFunctions;

use Tensor\Matrix;
use Rubix\ML\NeuralNet\ActivationFunctions\SiLU;

/**
* @Groups({"ActivationFunctions"})
* @BeforeMethods({"setUp"})
*/
class SiLUBench
{
    /**
     * The pre-activation input matrix fed to the benchmarked methods.
     *
     * @var \Tensor\Matrix
     */
    protected $z;

    /**
     * A matrix standing in for previously computed activations, consumed by
     * the differentiation benchmark.
     *
     * @var \Tensor\Matrix
     */
    protected $computed;

    /**
     * The activation function instance under benchmark.
     *
     * @var \Rubix\ML\NeuralNet\ActivationFunctions\SiLU
     */
    protected $activationFn;

    public function setUp() : void
    {
        $this->activationFn = new SiLU();

        $this->z = Matrix::uniform(500, 500);

        $this->computed = Matrix::uniform(500, 500);
    }

    /**
     * @Subject
     * @Iterations(3)
     * @OutputTimeUnit("milliseconds", precision=3)
     */
    public function compute() : void
    {
        $activations = $this->activationFn->activate($this->z);
    }

    /**
     * @Subject
     * @Iterations(3)
     * @OutputTimeUnit("milliseconds", precision=3)
     */
    public function differentiate() : void
    {
        $gradient = $this->activationFn->differentiate($this->z, $this->computed);
    }
}
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
"ext-json": "*",
"amphp/parallel": "^1.3",
"psr/log": "^1.1",
"scienide/tensor": "^3.0",
"rubix/tensor": "^3.0",
"scienide/okbloomer": "^1.0@beta",
"symfony/polyfill-mbstring": "^1.0",
"symfony/polyfill-php80": "^1.17",
"wamania/php-stemmer": "^2.0"
Expand Down
3 changes: 2 additions & 1 deletion docs/classifiers/classification-tree.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@ A binary tree-based learner that greedily constructs a decision map for classifi
| 2 | maxLeafSize | 3 | int | The max number of samples that a leaf node can contain. |
| 3 | minPurityIncrease | 1e-7 | float | The minimum increase in purity necessary to continue splitting a subtree. |
| 4 | maxFeatures | Auto | int | The max number of feature columns to consider when determining a best split. |
| 5 | maxBins | Auto | int | The maximum number of bins to consider when determining a split with a continuous feature as the split point. |

## Example
```php
use Rubix\ML\Classifiers\ClassificationTree;

$estimator = new ClassificationTree(10, 5, 0.001, null);
$estimator = new ClassificationTree(10, 5, 0.001, null, null);
```

## Additional Methods
Expand Down
31 changes: 31 additions & 0 deletions docs/extractors/deduplicator.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
<span style="float:right;"><a href="https://github.com/RubixML/ML/blob/master/src/Extractors/Deduplicator.php">[source]</a></span>

# Deduplicator
Removes duplicate records from a dataset while the records are in flight. Deduplicator uses a Bloom filter under the hood to probabilistically identify records that have already been seen before.

!!! note
Due to its probabilistic nature, Deduplicator may mistakenly drop unique records at a bounded rate.

**Interfaces:** [Extractor](api.md)

## Parameters
| # | Name | Default | Type | Description |
|---|---|---|---|---|
| 1 | iterator | | Traversable | The base iterator. |
| 2 | maxFalsePositiveRate | 0.001 | float | The false positive rate to remain below. |
| 3 | numHashes | 4 | int | The number of hash functions used, i.e. the number of slices per layer. Set to null for auto. |
| 4 | layerSize | 32000000 | int | The size of each layer of the filter in bits. |

## Example
```php
use Rubix\ML\Extractors\Deduplicator;
use Rubix\ML\Extractors\CSV;

$extractor = new Deduplicator(new CSV('example.csv', true), 0.01, 3, 32000000);
```

## Additional Methods
Return the number of records that have been dropped so far.
```php
public dropped() : int
```
17 changes: 17 additions & 0 deletions docs/neural-network/activation-functions/silu.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<span style="float:right;"><a href="https://github.com/RubixML/ML/blob/master/src/NeuralNet/ActivationFunctions/SiLU.php">[source]</a></span>

# SiLU
Sigmoid Linear Units are smooth and non-monotonic rectified activation functions. Their inputs are weighted by the [Sigmoid](sigmoid.md) activation function acting as a self-gating mechanism.

## Parameters
This activation function does not have any parameters.

## Example
```php
use Rubix\ML\NeuralNet\ActivationFunctions\SiLU;

$activationFunction = new SiLU();
```

## References
[^1]: S. Elfwing et al. (2017). Sigmoid-Weighted Linear Units for Neural Network Function Approximation in Reinforcement Learning.
21 changes: 21 additions & 0 deletions docs/neural-network/hidden-layers/swish.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<span style="float:right;"><a href="https://github.com/RubixML/ML/blob/master/src/NeuralNet/Layers/Swish.php">[source]</a></span>

# Swish
Swish is a parametric activation layer that utilizes smooth rectified activation functions. The trainable *beta* parameter allows each activation function in the layer to tailor its output to the training set by interpolating between the linear function and ReLU.

## Parameters
| # | Name | Default | Type | Description |
|---|---|---|---|---|
| 1 | initializer | Constant | Initializer | The initializer of the beta parameter. |

## Example
```php
use Rubix\ML\NeuralNet\Layers\Swish;
use Rubix\ML\NeuralNet\Initializers\Constant;

$layer = new Swish(new Constant(1.0));
```

## References
[^1]: P. Ramachandran et al. (2017). Swish: A Self-gated Activation Function.
[^2]: P. Ramachandran et al. (2017). Searching for Activation Functions.
3 changes: 2 additions & 1 deletion docs/regressors/regression-tree.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,13 @@ A decision tree based on the CART (*Classification and Regression Tree*) learnin
| 2 | maxLeafSize | 3 | int | The max number of samples that a leaf node can contain. |
| 3 | minPurityIncrease | 1e-7 | float | The minimum increase in purity necessary to continue splitting a subtree. |
| 4 | maxFeatures | Auto | int | The max number of feature columns to consider when determining a best split. |
| 5 | maxBins | Auto | int | The maximum number of bins to consider when determining a split with a continuous feature as the split point. |

## Example
```php
use Rubix\ML\Regressors\RegressionTree;

$estimator = new RegressionTree(20, 2, 1e-3, 10);
$estimator = new RegressionTree(20, 2, 1e-3, 10, null);
```

## Additional Methods
Expand Down
3 changes: 3 additions & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ nav:
- Column Picker: extractors/column-picker.md
- Concatenator: extractors/concatenator.md
- CSV: extractors/csv.md
- Deduplicator: extractors/deduplicator.md
- NDJSON: extractors/ndjson.md
- SQL Table: extractors/sql-table.md
- Dataset Objects:
Expand Down Expand Up @@ -158,6 +159,7 @@ nav:
- Dropout: neural-network/hidden-layers/dropout.md
- Noise: neural-network/hidden-layers/noise.md
- PReLU: neural-network/hidden-layers/prelu.md
- Swish: neural-network/hidden-layers/swish.md
- Activation Functions:
- ELU: neural-network/activation-functions/elu.md
- Hyperbolic Tangent: neural-network/activation-functions/hyperbolic-tangent.md
Expand All @@ -168,6 +170,7 @@ nav:
- Softmax: neural-network/activation-functions/softmax.md
- Soft Plus: neural-network/activation-functions/soft-plus.md
- Soft Sign: neural-network/activation-functions/softsign.md
- SiLU: neural-network/activation-functions/silu.md
- Thresholded ReLU: neural-network/activation-functions/thresholded-relu.md
- Cost Functions:
- Cross Entropy: neural-network/cost-functions/cross-entropy.md
Expand Down
7 changes: 5 additions & 2 deletions src/Classifiers/ClassificationTree.php
Original file line number Diff line number Diff line change
Expand Up @@ -65,14 +65,16 @@ class ClassificationTree extends CART implements Estimator, Learner, Probabilist
* @param int $maxLeafSize
* @param float $minPurityIncrease
* @param int|null $maxFeatures
* @param int|null $maxBins
*/
public function __construct(
    int $maxHeight = PHP_INT_MAX,
    int $maxLeafSize = 3,
    float $minPurityIncrease = 1e-7,
    ?int $maxFeatures = null,
    ?int $maxBins = null
) {
    // Delegate hyper-parameter handling to the CART base class constructor;
    // null maxFeatures/maxBins presumably select automatic defaults upstream — confirm in CART.
    parent::__construct($maxHeight, $maxLeafSize, $minPurityIncrease, $maxFeatures, $maxBins);
}

/**
Expand Down Expand Up @@ -116,6 +118,7 @@ public function params() : array
'max leaf size' => $this->maxLeafSize,
'min purity increase' => $this->minPurityIncrease,
'max features' => $this->maxFeatures,
'max bins' => $this->maxBins,
];
}

Expand Down
88 changes: 88 additions & 0 deletions src/Extractors/Deduplicator.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
<?php

namespace Rubix\ML\Extractors;

use OkBloomer\BloomFilter;
use Generator;

use function serialize;

/**
 * Deduplicator
 *
 * Drops duplicate records from a dataset as they stream through the extractor.
 * A memory-efficient Bloom filter is used to probabilistically recognize
 * records that have been seen before.
 *
 * @category Machine Learning
 * @package Rubix/ML
 * @author Andrew DalPino
 */
class Deduplicator implements Extractor
{
    /**
     * The iterator supplying the raw records.
     *
     * @var iterable<array>
     */
    protected iterable $iterator;

    /**
     * The Bloom filter used to detect previously-seen records.
     *
     * @var \OkBloomer\BloomFilter
     */
    protected BloomFilter $filter;

    /**
     * A running count of the records dropped as duplicates.
     *
     * @var int
     */
    protected int $dropped = 0;

    /**
     * @param iterable<mixed[]> $iterator
     * @param float $maxFalsePositiveRate
     * @param int|null $numHashes
     * @param int $layerSize
     */
    public function __construct(
        iterable $iterator,
        float $maxFalsePositiveRate = 0.001,
        ?int $numHashes = 4,
        int $layerSize = 32000000
    ) {
        $this->filter = new BloomFilter($maxFalsePositiveRate, $numHashes, $layerSize);
        $this->iterator = $iterator;
    }

    /**
     * Return the number of records that have been dropped so far.
     *
     * @return int
     */
    public function dropped() : int
    {
        return $this->dropped;
    }

    /**
     * Return an iterator for the records in the data table.
     *
     * @return \Generator<mixed[]>
     */
    public function getIterator() : Generator
    {
        foreach ($this->iterator as $record) {
            // Serialize the whole record so it can be fingerprinted by the filter.
            $signature = serialize($record);

            if (!$this->filter->existsOrInsert($signature)) {
                yield $record;
            } else {
                ++$this->dropped;
            }
        }
    }
}
2 changes: 1 addition & 1 deletion src/Graph/Nodes/Decision.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
interface Decision extends BinaryNode
{
/**
* Return the impurity of the labels as a result of the decision.
* Return the impurity of the labels within the node.
*
* @return float
*/
Expand Down
Loading

0 comments on commit ffcae74

Please sign in to comment.