diff --git a/.gitignore b/.gitignore index 272c02a..98065dd 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,7 @@ __pycache__/ # Distribution / packaging .Python -build/ +#build/ develop-eggs/ dist/ downloads/ @@ -135,4 +135,10 @@ tase.toml /tase/unknown_errors.txt .idea /.idea/ -/certs/ \ No newline at end of file +/certs/ +/docs/source/ +/docs/build/doctrees/ +*.bats +Makefile +source/ +/docs/source/* diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..906a0af --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,29 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.9" + # You can also specify other tool versions: + # nodejs: "19" + # rust: "1.64" + # golang: "1.19" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + +# If using Sphinx, optionally build your docs in additional formats such as PDF +# formats: +# - pdf + +# Optionally declare the Python requirements required to build your docs +python: + install: + - requirements: docs/requirements.txt \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..747ffb7 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/Makefile b/docs/source/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/source/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". 
+help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..7533eda --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,79 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html +import os +import sys + +# sys.path.insert(0, os.path.abspath("../../transformerx/")) +from datetime import datetime +from pygments.styles import get_style_by_name + +PYTHONPATH = "../../transformerx/" +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "TransformerX" +copyright = "2023, TensorOps" +author = "TensorOps" +release = "v1.0.0-rc" + +# style = get_style_by_name("friendly") +# style.background_color = "#f3f2f1" +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + # "sphinx_rtd_theme", + "furo", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.mathjax", + "sphinx_markdown_builder", +] + +templates_path = ["_templates"] + +napoleon_use_rtype = False + +napoleon_include_init_with_doc = True +napoleon_google_docstring = True +napoleon_use_param = True +napoleon_use_ivar = True + +# pygments_style = "friendly" + +language = "english" + + +exclude_patterns = [] + + +# html_theme = "sphinx_rtd_theme" +html_theme = "furo" +html_title = "TransformerX Documentation" +html_show_sourcelink = False +html_baseurl = "https://github.com/tensorops/transformerx" + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +# html_theme = "alabaster" +html_static_path = ["_static"] + +html_theme_options = { + "enable_search_shortcuts": True, + "globaltoc_collapse": True, + "prev_next_buttons_location": "both", + # "style_nav_header_background": "#F5A603", + "navigation_depth": 2, + "collapse_navigation": True, + "sticky_navigation": False, + "logo_only": False, + "display_version": True, + "style_external_links": True, + "titles_only": True, +} + +napoleon_use_param = False diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..ebda9f1 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,21 @@ +.. TransformerX documentation master file, created by + sphinx-quickstart on Mon May 1 19:41:56 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to TransformerX's documentation! +======================================== + +.. 
toctree:: + :maxdepth: 4 + :caption: Contents: + + transformerx + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/source/make.bat b/docs/source/make.bat new file mode 100644 index 0000000..32bb245 --- /dev/null +++ b/docs/source/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/modules.rst b/docs/source/modules.rst new file mode 100644 index 0000000..7a7e678 --- /dev/null +++ b/docs/source/modules.rst @@ -0,0 +1,7 @@ +TransformerX +============ + +.. toctree:: + :maxdepth: 4 + + transformerx diff --git a/docs/source/transformerx.data_loader.rst b/docs/source/transformerx.data_loader.rst new file mode 100644 index 0000000..fe2c160 --- /dev/null +++ b/docs/source/transformerx.data_loader.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.data_loader + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.addnorm.rst b/docs/source/transformerx.layers.addnorm.rst new file mode 100644 index 0000000..9b85526 --- /dev/null +++ b/docs/source/transformerx.layers.addnorm.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.addnorm + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.dot_product_attention.rst b/docs/source/transformerx.layers.dot_product_attention.rst new file mode 100644 index 0000000..7ecd170 --- /dev/null +++ b/docs/source/transformerx.layers.dot_product_attention.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.dot_product_attention + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.masks.global_attention_mask.rst b/docs/source/transformerx.layers.masks.global_attention_mask.rst new file mode 100644 index 0000000..aac5735 --- /dev/null +++ b/docs/source/transformerx.layers.masks.global_attention_mask.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.masks.global_attention_mask + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.masks.rst b/docs/source/transformerx.layers.masks.rst new file mode 100644 index 0000000..a3bcbf7 --- /dev/null +++ b/docs/source/transformerx.layers.masks.rst @@ -0,0 +1,15 @@ +transformerx.layers.masks package +================================= + +.. automodule:: transformerx.layers.masks + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers.masks.global_attention_mask diff --git a/docs/source/transformerx.layers.multihead_attention.rst b/docs/source/transformerx.layers.multihead_attention.rst new file mode 100644 index 0000000..4c9bada --- /dev/null +++ b/docs/source/transformerx.layers.multihead_attention.rst @@ -0,0 +1,4 @@ +.. 
automodule:: transformerx.layers.multihead_attention + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.positional_encoding.rst b/docs/source/transformerx.layers.positional_encoding.rst new file mode 100644 index 0000000..96b96e4 --- /dev/null +++ b/docs/source/transformerx.layers.positional_encoding.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.positional_encoding + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.positionwise_ffn.rst b/docs/source/transformerx.layers.positionwise_ffn.rst new file mode 100644 index 0000000..eb6e534 --- /dev/null +++ b/docs/source/transformerx.layers.positionwise_ffn.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.positionwise_ffn + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.rst b/docs/source/transformerx.layers.rst new file mode 100644 index 0000000..c259fbd --- /dev/null +++ b/docs/source/transformerx.layers.rst @@ -0,0 +1,31 @@ +transformerx.layers package +=========================== + +.. automodule:: transformerx.layers + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers.masks + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers.addnorm + transformerx.layers.dot_product_attention + transformerx.layers.multihead_attention + transformerx.layers.positional_encoding + transformerx.layers.positionwise_ffn + transformerx.layers.transformer_decoder + transformerx.layers.transformer_decoder_block + transformerx.layers.transformer_encoder + transformerx.layers.transformer_encoder_block diff --git a/docs/source/transformerx.layers.transformer_decoder.rst b/docs/source/transformerx.layers.transformer_decoder.rst new file mode 100644 index 0000000..3fe4c8f --- /dev/null +++ b/docs/source/transformerx.layers.transformer_decoder.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_decoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.transformer_decoder_block.rst b/docs/source/transformerx.layers.transformer_decoder_block.rst new file mode 100644 index 0000000..d160b68 --- /dev/null +++ b/docs/source/transformerx.layers.transformer_decoder_block.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_decoder_block + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.transformer_encoder.rst b/docs/source/transformerx.layers.transformer_encoder.rst new file mode 100644 index 0000000..39e34da --- /dev/null +++ b/docs/source/transformerx.layers.transformer_encoder.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_encoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.transformer_encoder_block.rst b/docs/source/transformerx.layers.transformer_encoder_block.rst new file mode 100644 index 0000000..9416970 --- /dev/null +++ b/docs/source/transformerx.layers.transformer_encoder_block.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_encoder_block + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.rst b/docs/source/transformerx.rst new file mode 100644 index 0000000..e59b212 --- /dev/null +++ b/docs/source/transformerx.rst @@ -0,0 +1,26 @@ +transformerx package +==================== + +.. 
automodule:: transformerx + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers + transformerx.training + transformerx.txplot + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.data_loader + transformerx.utils diff --git a/docs/source/transformerx.training.base.rst b/docs/source/transformerx.training.base.rst new file mode 100644 index 0000000..e319c1b --- /dev/null +++ b/docs/source/transformerx.training.base.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.training.base + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.training.rst b/docs/source/transformerx.training.rst new file mode 100644 index 0000000..9b9e4bd --- /dev/null +++ b/docs/source/transformerx.training.rst @@ -0,0 +1,15 @@ +transformerx.training package +============================= + +.. automodule:: transformerx.training + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.training.base diff --git a/docs/source/transformerx.txplot.plot_pe.rst b/docs/source/transformerx.txplot.plot_pe.rst new file mode 100644 index 0000000..487d915 --- /dev/null +++ b/docs/source/transformerx.txplot.plot_pe.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.txplot.plot_pe + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.txplot.rst b/docs/source/transformerx.txplot.rst new file mode 100644 index 0000000..41dff07 --- /dev/null +++ b/docs/source/transformerx.txplot.rst @@ -0,0 +1,15 @@ +transformerx.txplot package +=========================== + +.. automodule:: transformerx.txplot + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.txplot.plot_pe diff --git a/docs/source/transformerx.utils.rst b/docs/source/transformerx.utils.rst new file mode 100644 index 0000000..17b0b90 --- /dev/null +++ b/docs/source/transformerx.utils.rst @@ -0,0 +1,4 @@ +.. 
automodule:: transformerx.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/tests/layers/test_transformer_decoder.py b/tests/layers/test_transformer_decoder.py index 8466b28..d8fb743 100644 --- a/tests/layers/test_transformer_decoder.py +++ b/tests/layers/test_transformer_decoder.py @@ -107,27 +107,38 @@ def test_decoder_attention_weights_values(self, decoder, inputs): class TestTransformerDecoderIntegration: - seq_length = 10 - vocab_size = 32 + seq_length = 5 + vocab_size = 8 @staticmethod def create_toy_dataset( num_samples=1000, seq_length=10, vocab_size=64, num_classes=2 ): # x = np.random.randint(0, vocab_size, size=(num_samples, seq_length)) - x = np.random.normal( - vocab_size / 2, vocab_size / 2 - 1, size=(num_samples, seq_length) + # x = np.random.normal( + # (vocab_size // 2), (vocab_size // 2 - 1), size=(num_samples, seq_length) + # ) + x = tf.random.normal( + shape=(num_samples, seq_length), + mean=vocab_size // 2, + stddev=vocab_size / 2 - 3, ) - y = np.random.randint(0, 2, size=(num_samples, 1)) - y = np.random.normal(1, 1, size=(num_samples, seq_length)) - - x_train = tf.random.uniform( - shape=(num_samples, seq_length), maxval=vocab_size, dtype=tf.int32 - ) - y_train = tf.random.uniform( - shape=(num_samples, 1), maxval=num_classes, dtype=tf.int32 - ) - return x_train, y_train + # y = np.random.randint(0, 2, size=(num_samples, 1)) + # y = np.random.normal(1, 1, size=(num_samples, seq_length)) + y = tf.cast(tf.math.greater(x, vocab_size / 2), tf.int32) + print("x: ", x.shape) + print("y: ", y.shape) + print("vocab: ", vocab_size, vocab_size / 2, vocab_size // 2) + print("x: ", x[:5, :5]) + print("y: ", y[:5, :5]) + + # x_train = tf.random.normal( + # shape=(num_samples, seq_length), mean=vocab_size, dtype=tf.int32 + # ) + # y_train = tf.random.normal( + # shape=(num_samples, 1), maxval=num_classes, dtype=tf.int32 + # ) + return x, y @pytest.fixture(scope="class") def model(self): @@ -138,7 +149,7 @@ def model(self): vocab_size=self.vocab_size, maxlen_position_encoding=self.seq_length, num_heads=num_head, - d_model=64, + d_model=16, n_blocks=1, ) decoder = TransformerDecoder( @@ -155,10 +166,10 @@ def model(self): tgt_inputs = tf.keras.layers.Input(shape=(self.seq_length,)) enc_output, attn_weights = encoder(inputs) print("enc_ouput: ", enc_output.shape) - dec_output, attn_weights_dec = decoder(inputs, enc_output, enc_output) - predictions = tf.keras.layers.Dense(1, activation="softmax")(dec_output) + dec_output, attn_weights_dec = decoder(tgt_inputs, enc_output, enc_output) + predictions = tf.keras.layers.Dense(1, activation="sigmoid")(dec_output) - model = tf.keras.Model(inputs=[inputs], outputs=predictions) + model = tf.keras.Model(inputs=[inputs, tgt_inputs], outputs=predictions) model.compile( optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"] ) @@ -169,6 +180,6 @@ def test_model_creation(self, model): x, y = self.create_toy_dataset( num_samples=100, vocab_size=self.vocab_size, seq_length=self.seq_length ) - history = model.fit(x, y, epochs=50, batch_size=32, validation_split=0.2) + history = model.fit([x, x], y, epochs=100, batch_size=32, validation_split=0.2) assert isinstance(model, tf.keras.Model) assert model is not None diff --git a/tests/layers/test_transformer_encoder.py b/tests/layers/test_transformer_encoder.py index 47954c8..d62779c 100644 --- a/tests/layers/test_transformer_encoder.py +++ b/tests/layers/test_transformer_encoder.py @@ -1,3 +1,5 @@ +import os + import pytest import tensorflow as tf import numpy as np @@ 
-129,9 +131,9 @@ def test_training(self, model): vocab_size=self.vocab_size, seq_length=self.seq_length, num_samples=100 ) history = model.fit( - x_train, y_train, epochs=50, batch_size=64, validation_split=0.2 + x_train, y_train, epochs=50, batch_size=16, validation_split=0.2 ) - tf.keras.mixed_precision.set_global_policy("mixed_float16") + # tf.keras.mixed_precision.set_global_policy("mixed_float16") assert ( history.history["accuracy"][-1] > 0.5 ), "Training accuracy should be greater than 0.5" diff --git a/transformerx/__version__.py b/transformerx/__version__.py index cb001e5..2068d33 100644 --- a/transformerx/__version__.py +++ b/transformerx/__version__.py @@ -1,5 +1,10 @@ VERSION = (1, 0, 0, "beta", 3) -__version__ = '.'.join(map(str, VERSION)) +if len(VERSION) < 3: + raise ValueError("VERSION must have at least three elements") -print(__version__) \ No newline at end of file +__version__ = ".".join(str(v) for v in VERSION[:3]) +if len(VERSION) > 3: + __version__ += "-" + ".".join(str(v) for v in VERSION[3:]) +# version_str += "-dev" +print(__version__) diff --git a/transformerx/layers/addnorm.py b/transformerx/layers/addnorm.py index 403e6db..daae09f 100644 --- a/transformerx/layers/addnorm.py +++ b/transformerx/layers/addnorm.py @@ -46,7 +46,7 @@ class AddNorm(tf.keras.layers.Layer): Examples -------- >>> x = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) - >>> y = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) + >>> y = tf.constant(np.arange(10).reshape(5, 2) * 11, dtype=tf.float32) >>> print(x) tf.Tensor( [[ 0. 10.] @@ -56,14 +56,14 @@ class AddNorm(tf.keras.layers.Layer): [80. 90.]], shape=(5, 2), dtype=float32) >>> addnorm = AddNorm(norm_type='layer', norm_eps=1e-6, dropout_rate=0.2, activation='relu') - >>> output = addnorm([x, y]) + >>> output = addnorm(x, y) >>> print(output) tf.Tensor( - [[0. 0. ] - [4.1565704 3.2312596] - [9.174077 8.174077 ] - [14.191582 13.116871 ] - [19.209087 18.134377 ]], shape=(5, 2), dtype=float32) + [[0. 1.] + [0. 1.] + [0. 1.] + [0. 1.] + [0. 
1.]], shape=(5, 2), dtype=float32) References ---------- @@ -99,7 +99,7 @@ def __init__( if dropout_rate >= 1: raise ValueError("Dropout rate must be less than 1") - self.dropout = tf.keras.layers.Dropout(dropout_rate) + self.dropout = tf.keras.layers.Dropout(self.dropout_rate) # Regularizers self.kernel_regularizer = kernel_regularizer self.bias_regularizer = bias_regularizer @@ -154,10 +154,10 @@ def call(self, x: tf.Tensor, residual: tf.Tensor, **kwargs): ) # Apply dropout - residual = self.dropout(residual, training=kwargs.get("training", False)) + # residual = self.dropout(residual) # Add residual connection - x = tf.keras.layers.Add()([x, residual]) + x = tf.add(x, residual) # Apply normalization x = self.norm_layer(x) @@ -184,12 +184,12 @@ def get_config(self): return config -if __name__ == "__main__": - X = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) - Y = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) - - addnorm = AddNorm( - norm_type="layer", norm_eps=1e-6, dropout_rate=0.2, activation="relu" - ) - output = addnorm(X, X) - print(output) +# if __name__ == "__main__": +# x = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) +# y = tf.constant(np.arange(10, 20).reshape(5, 2) * 13, dtype=tf.float32) +# +# addnorm = AddNorm( +# norm_type="layer", norm_eps=1e-6, dropout_rate=0.2, activation="relu" +# ) +# output = addnorm(x, y) +# print(output) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index b944303..0f68bda 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -1,8 +1,8 @@ -import numpy as np import tensorflow as tf from transformerx.layers.masks.global_attention_mask import GlobalAttentionMask from transformerx.utils import masked_softmax +from transformerx.layers.masks import LookAheadMask class DotProductAttention(tf.keras.layers.Layer): @@ -25,60 +25,63 @@ class DotProductAttention(tf.keras.layers.Layer): Notes ----- Dot-product attention formulation is as following: - .. math:: Attention(Q, K, V) = softmax(Q K^T) V + + .. math:: + Attention(Q, K, V) = softmax(Q K^T) V And scaled dot-product attention [1]_ is formulated as: - ..math:: Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V + .. math:: + Attention(Q, K, V) = softmax(\\frac{QK^T}{\\sqrt{d_k}}) V Examples -------- Scaled dot-product (scaled multiplicative) self-attention of tensor `x` (we feed `x` to queries, keys, and values). 
- - >>> x = tf.cast(np.random.random([2, 3, 2]), dtype=tf.float32) + >>> tf.random.set_seed(1) + >>> x = tf.cast(tf.random.uniform([2, 3, 2]), dtype=tf.float32) >>> print(x) tf.Tensor( - [[[0.5418388 0.23626359] - [0.4220487 0.394948 ] - [0.6125364 0.12296485]] - - [[0.17872103 0.5700011 ] - [0.28264287 0.02290592] - [0.24536102 0.39220297]]], shape=(2, 3, 2), dtype=float32) #random + [[[0.16513085 0.9014813 ] + [0.6309742 0.4345461 ] + [0.29193902 0.64250207]] + + [[0.9757855 0.43509948] + [0.6601019 0.60489583] + [0.6366315 0.6144488 ]]], shape=(2, 3, 2), dtype=float32) >>> dot_product = DotProductAttention(0.2) >>> queries, keys, values = x, x, x - >>> output = dot_product(queries, keys, values) + >>> output, attn_weights = dot_product(queries, keys, values) >>> print(output) tf.Tensor( - [[[0.45955482 0.63378114] - [0.48054144 0.62751293] - [0.43684354 0.64026886]] - - [[0.82063836 0.2958246 ] - [0.8300792 0.30486548] - [0.83300924 0.30762452]]], shape=(2, 3, 2), dtype=float32) + [[[0.34450796 0.6787753 ] + [0.36907017 0.65472305] + [0.35440704 0.66882825]] + + [[0.77042043 0.5446019 ] + [0.7632908 0.5484005 ] + [0.7627964 0.5486638 ]]], shape=(2, 3, 2), dtype=float32) The next example shows the dot-product (multiplicative) self-attention of tensor `x`. >>> dot_product = DotProductAttention(dropout_rate=0.1, scaled=False) - >>> output = dot_product(queries, keys, values) + >>> output, attn_weights = dot_product(queries, keys, values) >>> print(output) tf.Tensor( - [[[0.5195807 0.6383675 ] - [0.49765232 0.6440835 ] - [0.5132934 0.64001364]] - - [[0.6074392 0.80120546] - [0.6098373 0.80074203] - [0.5967663 0.7891044 ]]], shape=(2, 3, 2), dtype=float32) + [[[0.33704066 0.6868143 ] + [0.37176722 0.6526886 ] + [0.35094902 0.6727435 ]] + + [[0.7759446 0.54165894] + [0.7657266 0.54710305] + [0.7650213 0.5474789 ]]], shape=(2, 3, 2), dtype=float32) References ---------- .. [1] A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser, I. Polosukhin, Attention - is all you need, in: NIPS, pp. 5998–6008. + is all you need, in: NIPS, pp. 5998–6008. """ def __init__( @@ -135,19 +138,22 @@ def call( # apply causal mask if self.causal_mask: + # Obsolete version of masking. 
To be removed in the upcoming updates # seq_len = tf.shape(queries)[2] # heads = tf.shape(queries)[1] - batch_size, num_heads, seq_len, _ = tf.unstack(tf.shape(queries)) - causal_mask = tf.ones((num_heads, seq_len)) * -1e9 - causal_mask = tf.linalg.LinearOperatorLowerTriangular( - causal_mask - ).to_dense() - causal_mask = tf.expand_dims(causal_mask, axis=0) # add batch dimension - causal_mask = tf.broadcast_to( - tf.expand_dims(causal_mask, -1), tf.shape(scores) - ) # broadcast across batch dimension - # scores += - scores = scores + causal_mask + # batch_size, num_heads, seq_len, _ = tf.unstack(tf.shape(queries)) + # causal_mask = tf.ones((num_heads, seq_len)) * -1e9 + # causal_mask = tf.linalg.LinearOperatorLowerTriangular( + # causal_mask + # ).to_dense() + # causal_mask = tf.expand_dims(causal_mask, axis=0) # add batch dimension + # causal_mask = tf.broadcast_to( + # tf.expand_dims(causal_mask, -1), tf.shape(scores) + # ) # broadcast across batch dimension + + # New version of masking + look_ahead_mask = LookAheadMask() + scores = look_ahead_mask(scores) # to be uncommented later # apply global mask @@ -160,7 +166,6 @@ def call( # self.attention_weights = tf.nn.softmax(scores, axis=-1, mask=attention_mask) # scores = tf.matmul(self.dropout(self.attention_weights, **kwargs), values) attention_output = tf.matmul(self.dropout(self.attention_weights), values) - return attention_output, self.attention_weights def get_attention_weights(self): diff --git a/transformerx/layers/masks/__init__.py b/transformerx/layers/masks/__init__.py index e69de29..80c100d 100644 --- a/transformerx/layers/masks/__init__.py +++ b/transformerx/layers/masks/__init__.py @@ -0,0 +1 @@ +from .base import LookAheadMask diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py new file mode 100644 index 0000000..a2c597d --- /dev/null +++ b/transformerx/layers/masks/base.py @@ -0,0 +1,125 @@ +import tensorflow as tf + + +class BaseMask(tf.keras.layers.Layer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def build_mask(self, inputs): + raise NotImplementedError("Subclasses must implement build_mask method") + + def call(self, inputs, *args, **kwargs): + if tf.shape(inputs).shape == 4: + pass + elif tf.shape(inputs).shape == 3: + inputs = tf.expand_dims(inputs, axis=1) + else: + raise ValueError(f"Invalid input shape. Expected 3D or 4D tensors, but received {len(inputs.shape)}D.")
+ mask = self.build_mask(inputs) + return tf.add(inputs, mask * -1e9) + + +class LookAheadMask(BaseMask): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def build_mask(self, inputs): + input_shape = tf.shape(inputs) + if input_shape.shape == 4: + print("input shape: ", input_shape) + k_seq_len = input_shape[3] + q_seq_len = input_shape[2] + + # mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) + mask = ( + 1 + - tf.linalg.LinearOperatorLowerTriangular( + tf.ones((q_seq_len, k_seq_len)), -1, 0 + ).to_dense() + ) + return mask + + +class PaddingMask(BaseMask): + def __init__(self, padding_value=0, multi_head=True, **kwargs): + super().__init__(**kwargs) + self.padding_value = padding_value + self.multi_head = multi_head + + def build_mask(self, inputs): + mask = tf.cast(tf.math.equal(inputs, self.padding_value), tf.float32) + return mask + + +class PaddingMaskNew(tf.keras.layers.Layer): + def __init__(self, multi_head=True, padding_value=0, **kwargs): + super(PaddingMaskNew, self).__init__(**kwargs) + self.multi_head = multi_head + self.padding_value = padding_value + + def build(self, input_shape): + pass + + def call(self, inputs): + seq = tf.cast(tf.math.equal(inputs, self.padding_value), tf.float32) + seq = tf.expand_dims(seq, axis=1) + if self.multi_head: + seq = tf.expand_dims(seq, axis=1) + return seq + + def get_config(self): + config = super(PaddingMaskNew, self).get_config() + config.update({"multi_head": self.multi_head}) + return config + + +if __name__ == "__main__": + from transformerx.layers import DotProductAttention, MultiHeadAttention + + input_tensor = tf.random.uniform((2, 4, 6)) + q_input_tensor = tf.random.uniform((2, 4, 6)) + attn_o, attn_w = DotProductAttention()(q_input_tensor, q_input_tensor, input_tensor) + + print("attn_w.shape: ", attn_w.shape) + la_mask = LookAheadMask() + output_tensor = la_mask(attn_w) + print(output_tensor.shape, output_tensor) + + multihead_attn = MultiHeadAttention(d_model=32, num_heads=4, dropout_rate=0.1) + output, attn_w = multihead_attn(q_input_tensor, input_tensor, input_tensor) + + sample_input = tf.random.uniform((1, 1, 4, 2)) + # output_tensor = la_mask(attn_w) + output_tensor = la_mask(sample_input) + print(output_tensor.shape, output_tensor) + + data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]] + # Create a 2D tensor + data = tf.constant([[1, 2, 3], [4, 5, 6]]) + + # Convert the dataset to a tensor + # data_tensor = tf.constant(data, dtype=tf.float32) + + # Create a SequencePadding layer + # sequence_padding_layer = PaddingLayer(0, 4) + + # padded_data = sequence_padding_layer(data) + + # Test input + input_tensor = tf.constant( + [ + [[1, 2, 0], [4, 5, 6], [7, 8, 9], [0, 0, 0]], + [[1, 2, 3], [4, 5, 0], [0, 0, 0], [0, 0, 0]], + ], + dtype=tf.float32, + ) + + # Create a PaddingMask layer + padding_mask_layer = PaddingMask() + + # Generate the padding mask + # padding_mask = padding_mask_layer(input_tensor) + # print(padding_mask.shape, padding_mask) + + lad_mask = la_mask(input_tensor) + # print(lad_mask.shape, lad_mask) diff --git a/transformerx/layers/multihead_attention.py b/transformerx/layers/multihead_attention.py index c3fa618..b7cf77a 100644 --- a/transformerx/layers/multihead_attention.py +++ b/transformerx/layers/multihead_attention.py @@ -1,4 +1,3 @@ -import numpy as np import tensorflow as tf from einops import rearrange @@ -89,7 +88,9 @@ class MultiHeadAttention(tf.keras.layers.Layer): Returns ------- output: - Concatenated tensors + Concatenated tensors. Same shape as the queries.
+ attention_weights: + Optional tensor of attention weights. Methods ------- @@ -102,38 +103,50 @@ class MultiHeadAttention(tf.keras.layers.Layer): Examples -------- - >>> x = tf.constant(np.random.random([2, 3, 2]), dtype=tf.float32) - >>> multihead = MultiHeadAttention(d_model=8) - >>> print(multihead) - <__main__.MultiHeadAttention object at 0x7ff83c16bb80> + >>> import tensorflow as tf + >>> import random + >>> tf.random.set_seed(1) + >>> random.seed(42) - >>> output = multihead(x, x, x) + + >>> x = tf.constant(tf.random.uniform([2, 3, 2]), dtype=tf.float32) + >>> multihead = MultiHeadAttention(d_model=8, dropout_rate=0) + >>> print(type(multihead)) + + + >>> output, attn_weights = multihead(x, x, x) >>> print(output) tf.Tensor( - [[[ 0.2051548 0.32050014 0.2915167 -0.04056092 0.12072253 - 0.06477361 0.18725544 0.02056682] - [ 0.19823116 0.2983173 0.27711272 -0.04071879 0.11172265 - 0.06080601 0.18654731 0.00577436] - [ 0.19831955 0.30106473 0.27666807 -0.03963682 0.11234044 - 0.0615251 0.18657821 0.00680977]] - [[ 0.14630345 0.21267754 0.26289055 -0.10759152 0.03963668 - 0.04118761 0.11257525 0.05869889] - [ 0.14556082 0.21070784 0.26139364 -0.10755821 0.03894955 - 0.04060047 0.11260018 0.05745776] - [ 0.14547291 0.21081978 0.26109838 -0.10745162 0.03889 - 0.04069766 0.11251941 0.05741404]]], shape=(2, 3, 8), dtype=float32) - - >>> attention = MultiHeadAttention(d_model=16, num_heads=4, dropout=0.1) - >>> queries = tf.random.normal((3, 10, 16)) + [[[ 0.27276292 -0.2744614 -0.06085328 -0.03441356 -0.1577001 + 0.33375 -0.7894692 -0.33158925] + [ 0.2792416 -0.27180034 -0.06341933 -0.02869054 -0.15612581 + 0.33674437 -0.7850623 -0.3237151 ] + [ 0.274466 -0.27393326 -0.06170867 -0.03307929 -0.15757665 + 0.33440444 -0.78846383 -0.3293347 ]] + + [[ 0.44330204 -0.14170787 -0.1372787 0.3109271 -0.30478996 + 0.47728932 -0.8789958 -0.3304574 ] + [ 0.44153026 -0.14282975 -0.13679348 0.30881953 -0.30498797 + 0.476456 -0.8804113 -0.33254212] + [ 0.44139963 -0.14291355 -0.13675913 0.30866385 -0.3050046 + 0.4763937 -0.88051784 -0.3326969 ]]], shape=(2, 3, 8), dtype=float32) + + + + + + >>> tf.random.set_seed(1) + >>> attention = MultiHeadAttention(d_model=16, num_heads=4, dropout_rate=0.1) + >>> queries = tf.random.normal((3, 20, 16)) >>> keys = tf.random.normal((3, 20, 16)) >>> values = tf.random.normal((3, 20, 16)) - >>> valid_lens = tf.constant([10, 15, 20]) - >>> output, _ = attention(queries, keys, values, valid_lens) - >>> output.shape - (3, 10, 16) + >>> valid_lens = tf.constant([3, 20]) + >>> output, _ = attention(queries, keys, values) + >>> print(output.shape) + (3, 20, 16) - >>> window_mask = tf.ones((3, 10, 20)) - >>> output, _ = attention(queries, keys, values, valid_lens, window_mask=window_mask) + >>> window_mask = tf.ones((3, 10)) + >>> output, _ = attention(queries, keys, values, attention_mask=window_mask) >>> output.shape (3, 10, 16) @@ -285,7 +298,7 @@ def call( >>> values = tf.random.normal([batch_size, no_of_key_value_pairs, depth]) >>> valid_lens = tf.random.uniform([batch_size], minval=0, maxval=no_of_queries, dtype=tf.int32) - >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout=dropout) + >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout_rate=dropout) >>> output, attention_weights = multihead_attn(queries, keys, values, valid_lens) Here is an example of how to use the call method with a window mask: @@ -298,7 +311,7 @@ def call( >>> valid_lens = tf.random.uniform([batch_size], minval=0, maxval=no_of_queries, 
dtype=tf.int32) >>> window_mask = tf.random.uniform([batch_size, no_of_queries, no_of_key_value_pairs], 0, 2, dtype=tf.int32) - >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout=dropout) + >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout_rate=dropout) >>> output, attention_weights = multihead_attn(queries, keys, values, valid_lens, window_mask) """ @@ -320,6 +333,7 @@ def call( # Shape of output: (batch_size * num_heads, no. of queries, # depth / num_heads) + print("multihead q: ", queries.shape) attention_output, attention_weights = self.attention( queries, keys, values, attention_mask, **kwargs ) diff --git a/transformerx/layers/transformer_decoder.py b/transformerx/layers/transformer_decoder.py index b570341..1277bd2 100644 --- a/transformerx/layers/transformer_decoder.py +++ b/transformerx/layers/transformer_decoder.py @@ -270,10 +270,9 @@ def apply_positional_embedding(self, inputs=None, **kwargs): ) def call(self, queries, keys, values, attention_mask=None, **kwargs): - queries = self.apply_positional_embedding(queries, **kwargs) + blk_outputs = self.apply_positional_embedding(queries, **kwargs) # keys = self.apply_positional_embedding(keys, **kwargs) # values = self.apply_positional_embedding(values, **kwargs) - blk_outputs = queries # self.attention_weights = [None] * len(self.blocks) self.attention_weights = [] for i, blk in enumerate(self.blocks): diff --git a/transformerx/utils.py b/transformerx/utils.py index 91f14a2..50e35be 100644 --- a/transformerx/utils.py +++ b/transformerx/utils.py @@ -8,8 +8,8 @@ def sequence_mask(X, attention_mask, value=-1e9): raise TypeError("X must be a Tensor") if not isinstance(attention_mask, tf.Tensor): raise TypeError("attention_mask must be a Tensor") - if len(X.shape) not in (2, 3): - raise ValueError("X must be a 2D or 3D tensor") + if len(X.shape) not in (2, 3, 4): + raise ValueError("X must be a 2D, 3D, or 4D tensor") if len(attention_mask.shape) not in (1, 2): raise ValueError("attention_mask must be a 1D or 2D tensor") @@ -18,8 +18,10 @@ def sequence_mask(X, attention_mask, value=-1e9): mask = tf.range(start=0, limit=maxlen, dtype=tf.float32)[None, :] < tf.cast( attention_mask, dtype=tf.float32 ) + print("mask.shape: ", mask.shape, attention_mask.shape, X.shape) else: maxlen = X.shape[0] + print("attention_mask.shape: ", attention_mask.shape, X.shape) mask = tf.range(start=0, limit=maxlen, dtype=tf.float32) < tf.cast( attention_mask, dtype=tf.float32 )
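
A minimal, self-contained sketch of the look-ahead (causal) masking behaviour that the new LookAheadMask layer in transformerx/layers/masks/base.py provides, and that DotProductAttention now delegates to when causal_mask=True. It uses only core TensorFlow ops rather than the library itself, so the tensor shapes and variable names below are illustrative assumptions, not part of the diff; the strictly upper-triangular mask and the -1e9 additive bias mirror the implementation above.

import tensorflow as tf

# Raw attention scores shaped (batch, heads, q_len, k_len), as produced before softmax.
batch, heads, q_len, k_len = 1, 1, 4, 4
scores = tf.random.uniform((batch, heads, q_len, k_len))

# Ones strictly above the diagonal mark the "future" key positions for each query.
future = 1.0 - tf.linalg.band_part(tf.ones((q_len, k_len)), -1, 0)

# Adding a large negative bias at those positions drives their softmax weight to ~0.
masked_scores = scores + future * -1e9  # broadcasts over the batch and head axes

weights = tf.nn.softmax(masked_scores, axis=-1)
print(weights[0, 0])  # row i has non-zero weight only for keys j <= i

Keeping the mask additive, as the diff does, means it can be applied to the score tensor before the existing softmax/dropout path without changing any tensor shapes.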