From cd17b8454d0ad3e8421c7ef1587c472792631be9 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 01:22:32 +0100 Subject: [PATCH 01/27] test: minor fix decoder test --- tests/layers/test_transformer_decoder.py | 49 +++++++++++++--------- transformerx/layers/transformer_decoder.py | 3 +- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/tests/layers/test_transformer_decoder.py b/tests/layers/test_transformer_decoder.py index 8466b28..d8fb743 100644 --- a/tests/layers/test_transformer_decoder.py +++ b/tests/layers/test_transformer_decoder.py @@ -107,27 +107,38 @@ def test_decoder_attention_weights_values(self, decoder, inputs): class TestTransformerDecoderIntegration: - seq_length = 10 - vocab_size = 32 + seq_length = 5 + vocab_size = 8 @staticmethod def create_toy_dataset( num_samples=1000, seq_length=10, vocab_size=64, num_classes=2 ): # x = np.random.randint(0, vocab_size, size=(num_samples, seq_length)) - x = np.random.normal( - vocab_size / 2, vocab_size / 2 - 1, size=(num_samples, seq_length) + # x = np.random.normal( + # (vocab_size // 2), (vocab_size // 2 - 1), size=(num_samples, seq_length) + # ) + x = tf.random.normal( + shape=(num_samples, seq_length), + mean=vocab_size // 2, + stddev=vocab_size / 2 - 3, ) - y = np.random.randint(0, 2, size=(num_samples, 1)) - y = np.random.normal(1, 1, size=(num_samples, seq_length)) - - x_train = tf.random.uniform( - shape=(num_samples, seq_length), maxval=vocab_size, dtype=tf.int32 - ) - y_train = tf.random.uniform( - shape=(num_samples, 1), maxval=num_classes, dtype=tf.int32 - ) - return x_train, y_train + # y = np.random.randint(0, 2, size=(num_samples, 1)) + # y = np.random.normal(1, 1, size=(num_samples, seq_length)) + y = tf.cast(tf.math.greater(x, vocab_size / 2), tf.int32) + print("x: ", x.shape) + print("y: ", y.shape) + print("vocab: ", vocab_size, vocab_size / 2, vocab_size // 2) + print("x: ", x[:5, :5]) + print("y: ", y[:5, :5]) + + # x_train = tf.random.normal( + # shape=(num_samples, seq_length), mean=vocab_size, dtype=tf.int32 + # ) + # y_train = tf.random.normal( + # shape=(num_samples, 1), maxval=num_classes, dtype=tf.int32 + # ) + return x, y @pytest.fixture(scope="class") def model(self): @@ -138,7 +149,7 @@ def model(self): vocab_size=self.vocab_size, maxlen_position_encoding=self.seq_length, num_heads=num_head, - d_model=64, + d_model=16, n_blocks=1, ) decoder = TransformerDecoder( @@ -155,10 +166,10 @@ def model(self): tgt_inputs = tf.keras.layers.Input(shape=(self.seq_length,)) enc_output, attn_weights = encoder(inputs) print("enc_ouput: ", enc_output.shape) - dec_output, attn_weights_dec = decoder(inputs, enc_output, enc_output) - predictions = tf.keras.layers.Dense(1, activation="softmax")(dec_output) + dec_output, attn_weights_dec = decoder(tgt_inputs, enc_output, enc_output) + predictions = tf.keras.layers.Dense(1, activation="sigmoid")(dec_output) - model = tf.keras.Model(inputs=[inputs], outputs=predictions) + model = tf.keras.Model(inputs=[inputs, tgt_inputs], outputs=predictions) model.compile( optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"] ) @@ -169,6 +180,6 @@ def test_model_creation(self, model): x, y = self.create_toy_dataset( num_samples=100, vocab_size=self.vocab_size, seq_length=self.seq_length ) - history = model.fit(x, y, epochs=50, batch_size=32, validation_split=0.2) + history = model.fit([x, x], y, epochs=100, batch_size=32, validation_split=0.2) assert isinstance(model, tf.keras.Model) assert model is not None diff --git 
a/transformerx/layers/transformer_decoder.py b/transformerx/layers/transformer_decoder.py index b570341..1277bd2 100644 --- a/transformerx/layers/transformer_decoder.py +++ b/transformerx/layers/transformer_decoder.py @@ -270,10 +270,9 @@ def apply_positional_embedding(self, inputs=None, **kwargs): ) def call(self, queries, keys, values, attention_mask=None, **kwargs): - queries = self.apply_positional_embedding(queries, **kwargs) + blk_outputs = self.apply_positional_embedding(queries, **kwargs) # keys = self.apply_positional_embedding(keys, **kwargs) # values = self.apply_positional_embedding(values, **kwargs) - blk_outputs = queries # self.attention_weights = [None] * len(self.blocks) self.attention_weights = [] for i, blk in enumerate(self.blocks): From 20eb4ba6b1ecffb6d792bea2d304e3b8395872eb Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 03:11:59 +0100 Subject: [PATCH 02/27] fix: __version__.py works properly --- transformerx/__version__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/transformerx/__version__.py b/transformerx/__version__.py index cb001e5..2068d33 100644 --- a/transformerx/__version__.py +++ b/transformerx/__version__.py @@ -1,5 +1,10 @@ VERSION = (1, 0, 0, "beta", 3) -__version__ = '.'.join(map(str, VERSION)) +if len(VERSION) < 3: + raise ValueError("VERSION must have at least three elements") -print(__version__) \ No newline at end of file +__version__ = ".".join(str(v) for v in VERSION[:3]) +if len(VERSION) > 3: + __version__ += "-" + ".".join(str(v) for v in VERSION[3:]) +# version_str += "-dev" +print(__version__) From 60c828b1f9e64f381190094ab96873b156398ad6 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 21:44:05 +0100 Subject: [PATCH 03/27] doc: generate docs --- docs/Makefile | 20 +++++ docs/make.bat | 35 +++++++++ docs/source/Makefile | 20 +++++ docs/source/conf.py | 77 +++++++++++++++++++ docs/source/index.rst | 21 +++++ docs/source/make.bat | 35 +++++++++ docs/source/modules.rst | 7 ++ docs/source/transformerx.data_loader.rst | 4 + docs/source/transformerx.layers.addnorm.rst | 4 + ...nsformerx.layers.dot_product_attention.rst | 4 + ...erx.layers.masks.global_attention_mask.rst | 4 + docs/source/transformerx.layers.masks.rst | 15 ++++ ...ransformerx.layers.multihead_attention.rst | 4 + ...ransformerx.layers.positional_encoding.rst | 4 + .../transformerx.layers.positionwise_ffn.rst | 4 + docs/source/transformerx.layers.rst | 31 ++++++++ ...ransformerx.layers.transformer_decoder.rst | 4 + ...rmerx.layers.transformer_decoder_block.rst | 4 + ...ransformerx.layers.transformer_encoder.rst | 4 + ...rmerx.layers.transformer_encoder_block.rst | 4 + docs/source/transformerx.rst | 26 +++++++ docs/source/transformerx.training.base.rst | 4 + docs/source/transformerx.training.rst | 15 ++++ docs/source/transformerx.txplot.plot_pe.rst | 4 + docs/source/transformerx.txplot.rst | 15 ++++ docs/source/transformerx.utils.rst | 4 + requirements.txt | 2 +- transformerx/layers/dot_product_attention.py | 12 +-- 28 files changed, 380 insertions(+), 7 deletions(-) create mode 100644 docs/Makefile create mode 100644 docs/make.bat create mode 100644 docs/source/Makefile create mode 100644 docs/source/conf.py create mode 100644 docs/source/index.rst create mode 100644 docs/source/make.bat create mode 100644 docs/source/modules.rst create mode 100644 docs/source/transformerx.data_loader.rst create mode 100644 docs/source/transformerx.layers.addnorm.rst create mode 100644 
docs/source/transformerx.layers.dot_product_attention.rst create mode 100644 docs/source/transformerx.layers.masks.global_attention_mask.rst create mode 100644 docs/source/transformerx.layers.masks.rst create mode 100644 docs/source/transformerx.layers.multihead_attention.rst create mode 100644 docs/source/transformerx.layers.positional_encoding.rst create mode 100644 docs/source/transformerx.layers.positionwise_ffn.rst create mode 100644 docs/source/transformerx.layers.rst create mode 100644 docs/source/transformerx.layers.transformer_decoder.rst create mode 100644 docs/source/transformerx.layers.transformer_decoder_block.rst create mode 100644 docs/source/transformerx.layers.transformer_encoder.rst create mode 100644 docs/source/transformerx.layers.transformer_encoder_block.rst create mode 100644 docs/source/transformerx.rst create mode 100644 docs/source/transformerx.training.base.rst create mode 100644 docs/source/transformerx.training.rst create mode 100644 docs/source/transformerx.txplot.plot_pe.rst create mode 100644 docs/source/transformerx.txplot.rst create mode 100644 docs/source/transformerx.utils.rst diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..747ffb7 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/Makefile b/docs/source/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/source/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. 
$(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..d9e42bf --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,77 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html +import os +import sys + +# sys.path.insert(0, os.path.abspath("../../transformerx/")) +from datetime import datetime +from pygments.styles import get_style_by_name + +PYTHONPATH = "../../transformerx/" +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "TransformerX" +copyright = "2023, TensorOps" +author = "TensorOps" +release = "v1.0.0-rc" + +# style = get_style_by_name("friendly") +# style.background_color = "#f3f2f1" +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx_rtd_theme", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.mathjax", + "sphinx_markdown_builder", +] + +templates_path = ["_templates"] + +napoleon_use_rtype = False + +napoleon_include_init_with_doc = True +napoleon_google_docstring = True +napoleon_use_param = True +napoleon_use_ivar = True + +# pygments_style = "friendly" + +language = "english" + + +exclude_patterns = [] + + +html_theme = "sphinx_rtd_theme" +html_title = "TransformerX Documentation" +html_show_sourcelink = False +html_baseurl = "https://github.com/tensorops/transformerx" + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +# html_theme = "alabaster" +html_static_path = ["_static"] + +html_theme_options = { + "enable_search_shortcuts": True, + "globaltoc_collapse": True, + "prev_next_buttons_location": "both", + # "style_nav_header_background": "#F5A603", + "navigation_depth": 2, + "collapse_navigation": True, + "sticky_navigation": False, + "logo_only": False, + "display_version": True, + "style_external_links": True, + "titles_only": True, +} + +napoleon_use_param = False diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..ebda9f1 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,21 @@ +.. TransformerX documentation master file, created by + sphinx-quickstart on Mon May 1 19:41:56 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to TransformerX's documentation! +======================================== + +.. toctree:: + :maxdepth: 4 + :caption: Contents: + + transformerx + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/source/make.bat b/docs/source/make.bat new file mode 100644 index 0000000..32bb245 --- /dev/null +++ b/docs/source/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/modules.rst b/docs/source/modules.rst new file mode 100644 index 0000000..7a7e678 --- /dev/null +++ b/docs/source/modules.rst @@ -0,0 +1,7 @@ +TransformerX +============ + +.. toctree:: + :maxdepth: 4 + + transformerx diff --git a/docs/source/transformerx.data_loader.rst b/docs/source/transformerx.data_loader.rst new file mode 100644 index 0000000..fe2c160 --- /dev/null +++ b/docs/source/transformerx.data_loader.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.data_loader + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.addnorm.rst b/docs/source/transformerx.layers.addnorm.rst new file mode 100644 index 0000000..9b85526 --- /dev/null +++ b/docs/source/transformerx.layers.addnorm.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.addnorm + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.dot_product_attention.rst b/docs/source/transformerx.layers.dot_product_attention.rst new file mode 100644 index 0000000..7ecd170 --- /dev/null +++ b/docs/source/transformerx.layers.dot_product_attention.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.dot_product_attention + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.masks.global_attention_mask.rst b/docs/source/transformerx.layers.masks.global_attention_mask.rst new file mode 100644 index 0000000..aac5735 --- /dev/null +++ b/docs/source/transformerx.layers.masks.global_attention_mask.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.masks.global_attention_mask + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.masks.rst b/docs/source/transformerx.layers.masks.rst new file mode 100644 index 0000000..a3bcbf7 --- /dev/null +++ b/docs/source/transformerx.layers.masks.rst @@ -0,0 +1,15 @@ +transformerx.layers.masks package +================================= + +.. automodule:: transformerx.layers.masks + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers.masks.global_attention_mask diff --git a/docs/source/transformerx.layers.multihead_attention.rst b/docs/source/transformerx.layers.multihead_attention.rst new file mode 100644 index 0000000..4c9bada --- /dev/null +++ b/docs/source/transformerx.layers.multihead_attention.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.multihead_attention + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.positional_encoding.rst b/docs/source/transformerx.layers.positional_encoding.rst new file mode 100644 index 0000000..96b96e4 --- /dev/null +++ b/docs/source/transformerx.layers.positional_encoding.rst @@ -0,0 +1,4 @@ +.. 
automodule:: transformerx.layers.positional_encoding + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.positionwise_ffn.rst b/docs/source/transformerx.layers.positionwise_ffn.rst new file mode 100644 index 0000000..eb6e534 --- /dev/null +++ b/docs/source/transformerx.layers.positionwise_ffn.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.positionwise_ffn + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.rst b/docs/source/transformerx.layers.rst new file mode 100644 index 0000000..c259fbd --- /dev/null +++ b/docs/source/transformerx.layers.rst @@ -0,0 +1,31 @@ +transformerx.layers package +=========================== + +.. automodule:: transformerx.layers + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers.masks + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers.addnorm + transformerx.layers.dot_product_attention + transformerx.layers.multihead_attention + transformerx.layers.positional_encoding + transformerx.layers.positionwise_ffn + transformerx.layers.transformer_decoder + transformerx.layers.transformer_decoder_block + transformerx.layers.transformer_encoder + transformerx.layers.transformer_encoder_block diff --git a/docs/source/transformerx.layers.transformer_decoder.rst b/docs/source/transformerx.layers.transformer_decoder.rst new file mode 100644 index 0000000..3fe4c8f --- /dev/null +++ b/docs/source/transformerx.layers.transformer_decoder.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_decoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.transformer_decoder_block.rst b/docs/source/transformerx.layers.transformer_decoder_block.rst new file mode 100644 index 0000000..d160b68 --- /dev/null +++ b/docs/source/transformerx.layers.transformer_decoder_block.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_decoder_block + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.transformer_encoder.rst b/docs/source/transformerx.layers.transformer_encoder.rst new file mode 100644 index 0000000..39e34da --- /dev/null +++ b/docs/source/transformerx.layers.transformer_encoder.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_encoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.transformer_encoder_block.rst b/docs/source/transformerx.layers.transformer_encoder_block.rst new file mode 100644 index 0000000..9416970 --- /dev/null +++ b/docs/source/transformerx.layers.transformer_encoder_block.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_encoder_block + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.rst b/docs/source/transformerx.rst new file mode 100644 index 0000000..e59b212 --- /dev/null +++ b/docs/source/transformerx.rst @@ -0,0 +1,26 @@ +transformerx package +==================== + +.. automodule:: transformerx + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers + transformerx.training + transformerx.txplot + +Submodules +---------- + +.. 
toctree:: + :maxdepth: 4 + + transformerx.data_loader + transformerx.utils diff --git a/docs/source/transformerx.training.base.rst b/docs/source/transformerx.training.base.rst new file mode 100644 index 0000000..e319c1b --- /dev/null +++ b/docs/source/transformerx.training.base.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.training.base + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.training.rst b/docs/source/transformerx.training.rst new file mode 100644 index 0000000..9b9e4bd --- /dev/null +++ b/docs/source/transformerx.training.rst @@ -0,0 +1,15 @@ +transformerx.training package +============================= + +.. automodule:: transformerx.training + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.training.base diff --git a/docs/source/transformerx.txplot.plot_pe.rst b/docs/source/transformerx.txplot.plot_pe.rst new file mode 100644 index 0000000..487d915 --- /dev/null +++ b/docs/source/transformerx.txplot.plot_pe.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.txplot.plot_pe + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.txplot.rst b/docs/source/transformerx.txplot.rst new file mode 100644 index 0000000..41dff07 --- /dev/null +++ b/docs/source/transformerx.txplot.rst @@ -0,0 +1,15 @@ +transformerx.txplot package +=========================== + +.. automodule:: transformerx.txplot + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.txplot.plot_pe diff --git a/docs/source/transformerx.utils.rst b/docs/source/transformerx.utils.rst new file mode 100644 index 0000000..17b0b90 --- /dev/null +++ b/docs/source/transformerx.utils.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/requirements.txt b/requirements.txt index 803d319..32cf0b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Automatically generated by https://github.com/damnever/pigar. einops==0.4.1 -matplotlib==3.7.1 +# matplotlib==3.7.1 numpy==1.24.2 pytest==7.2.2 requests==2.28.2 diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index b944303..b5221fd 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -25,11 +25,14 @@ class DotProductAttention(tf.keras.layers.Layer): Notes ----- Dot-product attention formulation is as following: - .. math:: Attention(Q, K, V) = softmax(Q K^T) V + + .. math:: + Attention(Q, K, V) = softmax(Q K^T) V And scaled dot-product attention [1]_ is formulated as: - ..math:: Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V + .. 
math:: + Attention(Q, K, V) = softmax(\\frac{QK^T}{\\sqrt{d_k}}) V Examples @@ -43,7 +46,6 @@ class DotProductAttention(tf.keras.layers.Layer): [[[0.5418388 0.23626359] [0.4220487 0.394948 ] [0.6125364 0.12296485]] - [[0.17872103 0.5700011 ] [0.28264287 0.02290592] [0.24536102 0.39220297]]], shape=(2, 3, 2), dtype=float32) #random @@ -56,7 +58,6 @@ class DotProductAttention(tf.keras.layers.Layer): [[[0.45955482 0.63378114] [0.48054144 0.62751293] [0.43684354 0.64026886]] - [[0.82063836 0.2958246 ] [0.8300792 0.30486548] [0.83300924 0.30762452]]], shape=(2, 3, 2), dtype=float32) @@ -70,7 +71,6 @@ class DotProductAttention(tf.keras.layers.Layer): [[[0.5195807 0.6383675 ] [0.49765232 0.6440835 ] [0.5132934 0.64001364]] - [[0.6074392 0.80120546] [0.6098373 0.80074203] [0.5967663 0.7891044 ]]], shape=(2, 3, 2), dtype=float32) @@ -78,7 +78,7 @@ class DotProductAttention(tf.keras.layers.Layer): References ---------- .. [1] A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser, I. Polosukhin, Attention - is all you need, in: NIPS, pp. 5998–6008. + is all you need, in: NIPS, pp. 5998–6008. """ def __init__( From 859c6029265f948190f2e0be7d913674228b44c9 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 21:50:46 +0100 Subject: [PATCH 04/27] doc: add .readthedocs.yaml --- .readthedocs.yaml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .readthedocs.yaml diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..906a0af --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,29 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.9" + # You can also specify other tool versions: + # nodejs: "19" + # rust: "1.64" + # golang: "1.19" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + +# If using Sphinx, optionally build your docs in additional formats such as PDF +# formats: +# - pdf + +# Optionally declare the Python requirements required to build your docs +python: + install: + - requirements: docs/requirements.txt \ No newline at end of file From 0dfb443701094a4102186b1397a97264eb739831 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 21:54:28 +0100 Subject: [PATCH 05/27] doc: update .gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 272c02a..327d2d3 100644 --- a/.gitignore +++ b/.gitignore @@ -135,4 +135,6 @@ tase.toml /tase/unknown_errors.txt .idea /.idea/ -/certs/ \ No newline at end of file +/certs/ +/docs/source/ +/docs/build/doctrees/ From 2394d7e5dd19472ffca9549424ac9e7844af5560 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 22:00:57 +0100 Subject: [PATCH 06/27] doc: update .gitignore 2 --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 327d2d3..db599a5 100644 --- a/.gitignore +++ b/.gitignore @@ -138,3 +138,5 @@ tase.toml /certs/ /docs/source/ /docs/build/doctrees/ +*.bat +Makefile \ No newline at end of file From c2468250fbc2a9ecdd6e04c19fdbad42638a9123 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 22:07:27 +0100 Subject: [PATCH 07/27] doc: update .gitignore --- .gitignore | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/.gitignore b/.gitignore index db599a5..4fedaf5 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,7 @@ __pycache__/ # Distribution / packaging .Python -build/ +#build/ develop-eggs/ dist/ downloads/ @@ -138,5 +138,5 @@ tase.toml /certs/ /docs/source/ /docs/build/doctrees/ -*.bat +*.bats Makefile \ No newline at end of file From 9d61cfe24a82cc6f4155d15d2bd7d8b0f0a38a57 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 22:08:06 +0100 Subject: [PATCH 08/27] doc: update .gitignore 2 --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4fedaf5..10dd37b 100644 --- a/.gitignore +++ b/.gitignore @@ -139,4 +139,5 @@ tase.toml /docs/source/ /docs/build/doctrees/ *.bats -Makefile \ No newline at end of file +Makefile +source/ \ No newline at end of file From 1f3b3b411b5960dd1166ba8fd693c7f9dfd1fbbe Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 22:08:39 +0100 Subject: [PATCH 09/27] doc: update .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 10dd37b..4c6f29b 100644 --- a/.gitignore +++ b/.gitignore @@ -140,4 +140,5 @@ tase.toml /docs/build/doctrees/ *.bats Makefile -source/ \ No newline at end of file +source/ +/docs/source/ From e32412268118e69959795252081ea787c10f7d45 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 22:09:08 +0100 Subject: [PATCH 10/27] doc: update .gitignore d --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4c6f29b..98065dd 100644 --- a/.gitignore +++ b/.gitignore @@ -141,4 +141,4 @@ tase.toml *.bats Makefile source/ -/docs/source/ +/docs/source/* From 52b267b178a27929f12357d8c09655acd32d618c Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 2 May 2023 10:33:44 +0100 Subject: [PATCH 11/27] doc: adnorm tests running --- transformerx/layers/addnorm.py | 38 ++++++++++---------- transformerx/layers/dot_product_attention.py | 36 ++++++++++--------- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/transformerx/layers/addnorm.py b/transformerx/layers/addnorm.py index 403e6db..daae09f 100644 --- a/transformerx/layers/addnorm.py +++ b/transformerx/layers/addnorm.py @@ -46,7 +46,7 @@ class AddNorm(tf.keras.layers.Layer): Examples -------- >>> x = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) - >>> y = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) + >>> y = tf.constant(np.arange(10).reshape(5, 2) * 11, dtype=tf.float32) >>> print(x) tf.Tensor( [[ 0. 10.] @@ -56,14 +56,14 @@ class AddNorm(tf.keras.layers.Layer): [80. 90.]], shape=(5, 2), dtype=float32) >>> addnorm = AddNorm(norm_type='layer', norm_eps=1e-6, dropout_rate=0.2, activation='relu') - >>> output = addnorm([x, y]) + >>> output = addnorm(x, y) >>> print(output) tf.Tensor( - [[0. 0. ] - [4.1565704 3.2312596] - [9.174077 8.174077 ] - [14.191582 13.116871 ] - [19.209087 18.134377 ]], shape=(5, 2), dtype=float32) + [[0. 1.] + [0. 1.] + [0. 1.] + [0. 1.] + [0. 
1.]], shape=(5, 2), dtype=float32) References ---------- @@ -99,7 +99,7 @@ def __init__( if dropout_rate >= 1: raise ValueError("Dropout rate must be less than 1") - self.dropout = tf.keras.layers.Dropout(dropout_rate) + self.dropout = tf.keras.layers.Dropout(self.dropout_rate) # Regularizers self.kernel_regularizer = kernel_regularizer self.bias_regularizer = bias_regularizer @@ -154,10 +154,10 @@ def call(self, x: tf.Tensor, residual: tf.Tensor, **kwargs): ) # Apply dropout - residual = self.dropout(residual, training=kwargs.get("training", False)) + # residual = self.dropout(residual) # Add residual connection - x = tf.keras.layers.Add()([x, residual]) + x = tf.add(x, residual) # Apply normalization x = self.norm_layer(x) @@ -184,12 +184,12 @@ def get_config(self): return config -if __name__ == "__main__": - X = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) - Y = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) - - addnorm = AddNorm( - norm_type="layer", norm_eps=1e-6, dropout_rate=0.2, activation="relu" - ) - output = addnorm(X, X) - print(output) +# if __name__ == "__main__": +# x = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) +# y = tf.constant(np.arange(10, 20).reshape(5, 2) * 13, dtype=tf.float32) +# +# addnorm = AddNorm( +# norm_type="layer", norm_eps=1e-6, dropout_rate=0.2, activation="relu" +# ) +# output = addnorm(x, y) +# print(output) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index b5221fd..c226521 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -39,16 +39,17 @@ class DotProductAttention(tf.keras.layers.Layer): -------- Scaled dot-product (scaled multiplicative) self-attention of tensor `x` (we feed `x` to queries, keys, and values). - - >>> x = tf.cast(np.random.random([2, 3, 2]), dtype=tf.float32) + >>> tf.random.set_seed(1) + >>> x = tf.cast(tf.random.uniform([2, 3, 2]), dtype=tf.float32) >>> print(x) tf.Tensor( - [[[0.5418388 0.23626359] - [0.4220487 0.394948 ] - [0.6125364 0.12296485]] - [[0.17872103 0.5700011 ] - [0.28264287 0.02290592] - [0.24536102 0.39220297]]], shape=(2, 3, 2), dtype=float32) #random + [[[0.16513085 0.9014813 ] + [0.6309742 0.4345461 ] + [0.29193902 0.64250207]] + + [[0.9757855 0.43509948] + [0.6601019 0.60489583] + [0.6366315 0.6144488 ]]], shape=(2, 3, 2), dtype=float32) >>> dot_product = DotProductAttention(0.2) >>> queries, keys, values = x, x, x @@ -65,15 +66,16 @@ class DotProductAttention(tf.keras.layers.Layer): The next example shows the dot-product (multiplicative) self-attention of tensor `x`. 
>>> dot_product = DotProductAttention(dropout_rate=0.1, scaled=False) - >>> output = dot_product(queries, keys, values) + >>> output, attn_weights = dot_product(queries, keys, values) >>> print(output) - tf.Tensor( - [[[0.5195807 0.6383675 ] - [0.49765232 0.6440835 ] - [0.5132934 0.64001364]] - [[0.6074392 0.80120546] - [0.6098373 0.80074203] - [0.5967663 0.7891044 ]]], shape=(2, 3, 2), dtype=float32) + tf.Tensor: shape=(2, 3, 2), dtype=float32, numpy= + array([[[0.34450796, 0.6787753 ], + [0.36907017, 0.65472305], + [0.35440704, 0.66882825]], + + [[0.77042043, 0.5446019 ], + [0.7632908 , 0.5484005 ], + [0.7627964 , 0.5486638 ]]], dtype=float32) References ---------- @@ -160,7 +162,7 @@ def call( # self.attention_weights = tf.nn.softmax(scores, axis=-1, mask=attention_mask) # scores = tf.matmul(self.dropout(self.attention_weights, **kwargs), values) attention_output = tf.matmul(self.dropout(self.attention_weights), values) - + print(attention_output.shape, self.attention_weights.shape) return attention_output, self.attention_weights def get_attention_weights(self): From 6bad9cecf60bb93cdb9a362427d796a63c6a45ee Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 2 May 2023 10:46:43 +0100 Subject: [PATCH 12/27] doc: DotProduct tests running --- transformerx/layers/dot_product_attention.py | 31 ++++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index c226521..44fa7f8 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -53,29 +53,30 @@ class DotProductAttention(tf.keras.layers.Layer): >>> dot_product = DotProductAttention(0.2) >>> queries, keys, values = x, x, x - >>> output = dot_product(queries, keys, values) + >>> output, attn_weights = dot_product(queries, keys, values) >>> print(output) tf.Tensor( - [[[0.45955482 0.63378114] - [0.48054144 0.62751293] - [0.43684354 0.64026886]] - [[0.82063836 0.2958246 ] - [0.8300792 0.30486548] - [0.83300924 0.30762452]]], shape=(2, 3, 2), dtype=float32) + [[[0.34450796 0.6787753 ] + [0.36907017 0.65472305] + [0.35440704 0.66882825]] + + [[0.77042043 0.5446019 ] + [0.7632908 0.5484005 ] + [0.7627964 0.5486638 ]]], shape=(2, 3, 2), dtype=float32) The next example shows the dot-product (multiplicative) self-attention of tensor `x`. 
>>> dot_product = DotProductAttention(dropout_rate=0.1, scaled=False) >>> output, attn_weights = dot_product(queries, keys, values) >>> print(output) - tf.Tensor: shape=(2, 3, 2), dtype=float32, numpy= - array([[[0.34450796, 0.6787753 ], - [0.36907017, 0.65472305], - [0.35440704, 0.66882825]], + tf.Tensor( + [[[0.33704066 0.6868143 ] + [0.37176722 0.6526886 ] + [0.35094902 0.6727435 ]] - [[0.77042043, 0.5446019 ], - [0.7632908 , 0.5484005 ], - [0.7627964 , 0.5486638 ]]], dtype=float32) + [[0.7759446 0.54165894] + [0.7657266 0.54710305] + [0.7650213 0.5474789 ]]], shape=(2, 3, 2), dtype=float32) References ---------- @@ -148,7 +149,6 @@ def call( causal_mask = tf.broadcast_to( tf.expand_dims(causal_mask, -1), tf.shape(scores) ) # broadcast across batch dimension - # scores += scores = scores + causal_mask # to be uncommented later @@ -162,7 +162,6 @@ def call( # self.attention_weights = tf.nn.softmax(scores, axis=-1, mask=attention_mask) # scores = tf.matmul(self.dropout(self.attention_weights, **kwargs), values) attention_output = tf.matmul(self.dropout(self.attention_weights), values) - print(attention_output.shape, self.attention_weights.shape) return attention_output, self.attention_weights def get_attention_weights(self): From 1baa64027d2ec8dd9ad9f83b0224a9bcea5b35c0 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 2 May 2023 20:59:16 +0100 Subject: [PATCH 13/27] doc: DotProduct tests modification --- transformerx/layers/dot_product_attention.py | 1 - transformerx/layers/multihead_attention.py | 69 ++++++++++++-------- transformerx/utils.py | 6 +- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index 44fa7f8..7f17e37 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -1,4 +1,3 @@ -import numpy as np import tensorflow as tf from transformerx.layers.masks.global_attention_mask import GlobalAttentionMask diff --git a/transformerx/layers/multihead_attention.py b/transformerx/layers/multihead_attention.py index c3fa618..9d5d66d 100644 --- a/transformerx/layers/multihead_attention.py +++ b/transformerx/layers/multihead_attention.py @@ -1,4 +1,3 @@ -import numpy as np import tensorflow as tf from einops import rearrange @@ -89,7 +88,9 @@ class MultiHeadAttention(tf.keras.layers.Layer): Returns ------- output: - Concatenated tensors + Concatenated tensors. Same shape as the queries. + attention_weights: + Optional tensor of attention weights. 
Methods ------- @@ -102,38 +103,50 @@ class MultiHeadAttention(tf.keras.layers.Layer): Examples -------- - >>> x = tf.constant(np.random.random([2, 3, 2]), dtype=tf.float32) - >>> multihead = MultiHeadAttention(d_model=8) - >>> print(multihead) - <__main__.MultiHeadAttention object at 0x7ff83c16bb80> + >>> import tensorflow as tf + >>> import random + >>> tf.random.set_seed(1) + >>> random.seed(42) - >>> output = multihead(x, x, x) + + >>> x = tf.constant(tf.random.uniform([2, 3, 2]), dtype=tf.float32) + >>> multihead = MultiHeadAttention(d_model=8, dropout_rate=0) + >>> print(type(multihead)) + + + >>> output, attn_weights = multihead(x, x, x) >>> print(output) tf.Tensor( - [[[ 0.2051548 0.32050014 0.2915167 -0.04056092 0.12072253 - 0.06477361 0.18725544 0.02056682] - [ 0.19823116 0.2983173 0.27711272 -0.04071879 0.11172265 - 0.06080601 0.18654731 0.00577436] - [ 0.19831955 0.30106473 0.27666807 -0.03963682 0.11234044 - 0.0615251 0.18657821 0.00680977]] - [[ 0.14630345 0.21267754 0.26289055 -0.10759152 0.03963668 - 0.04118761 0.11257525 0.05869889] - [ 0.14556082 0.21070784 0.26139364 -0.10755821 0.03894955 - 0.04060047 0.11260018 0.05745776] - [ 0.14547291 0.21081978 0.26109838 -0.10745162 0.03889 - 0.04069766 0.11251941 0.05741404]]], shape=(2, 3, 8), dtype=float32) - - >>> attention = MultiHeadAttention(d_model=16, num_heads=4, dropout=0.1) - >>> queries = tf.random.normal((3, 10, 16)) + [[[ 0.27276292 -0.2744614 -0.06085328 -0.03441356 -0.1577001 + 0.33375 -0.7894692 -0.33158925] + [ 0.2792416 -0.27180034 -0.06341933 -0.02869054 -0.15612581 + 0.33674437 -0.7850623 -0.3237151 ] + [ 0.274466 -0.27393326 -0.06170867 -0.03307929 -0.15757665 + 0.33440444 -0.78846383 -0.3293347 ]] + + [[ 0.44330204 -0.14170787 -0.1372787 0.3109271 -0.30478996 + 0.47728932 -0.8789958 -0.3304574 ] + [ 0.44153026 -0.14282975 -0.13679348 0.30881953 -0.30498797 + 0.476456 -0.8804113 -0.33254212] + [ 0.44139963 -0.14291355 -0.13675913 0.30866385 -0.3050046 + 0.4763937 -0.88051784 -0.3326969 ]]], shape=(2, 3, 8), dtype=float32) + + + + + + >>> tf.random.set_seed(1) + >>> attention = MultiHeadAttention(d_model=16, num_heads=4, dropout_rate=0.1) + >>> queries = tf.random.normal((3, 20, 16)) >>> keys = tf.random.normal((3, 20, 16)) >>> values = tf.random.normal((3, 20, 16)) - >>> valid_lens = tf.constant([10, 15, 20]) - >>> output, _ = attention(queries, keys, values, valid_lens) - >>> output.shape - (3, 10, 16) + >>> valid_lens = tf.constant([3, 20]) + >>> output, _ = attention(queries, keys, values) + >>> print(output.shape) + (3, 20, 16) - >>> window_mask = tf.ones((3, 10, 20)) - >>> output, _ = attention(queries, keys, values, valid_lens, window_mask=window_mask) + >>> window_mask = tf.ones((3, 10)) + >>> output, _ = attention(queries, keys, values, attention_mask=window_mask) >>> output.shape (3, 10, 16) diff --git a/transformerx/utils.py b/transformerx/utils.py index 91f14a2..50e35be 100644 --- a/transformerx/utils.py +++ b/transformerx/utils.py @@ -8,8 +8,8 @@ def sequence_mask(X, attention_mask, value=-1e9): raise TypeError("X must be a Tensor") if not isinstance(attention_mask, tf.Tensor): raise TypeError("attention_mask must be a Tensor") - if len(X.shape) not in (2, 3): - raise ValueError("X must be a 2D or 3D tensor") + if len(X.shape) not in (2, 3, 4): + raise ValueError("X must be a 2D, 3D, or 4D tensor") if len(attention_mask.shape) not in (1, 2): raise ValueError("attention_mask must be a 1D or 2D tensor") @@ -18,8 +18,10 @@ def sequence_mask(X, attention_mask, value=-1e9): mask = tf.range(start=0, 
limit=maxlen, dtype=tf.float32)[None, :] < tf.cast( attention_mask, dtype=tf.float32 ) + print("mask.shape: ", mask.shape, attention_mask.shape, X.shape) else: maxlen = X.shape[0] + print("attention_mask.shape: ", attention_mask.shape, X.shape) mask = tf.range(start=0, limit=maxlen, dtype=tf.float32) < tf.cast( attention_mask, dtype=tf.float32 ) From 32143f68e8e0cb5d0ba9244c334fa0dabdf51e56 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 2 May 2023 21:02:59 +0100 Subject: [PATCH 14/27] feat: adding BaseMask --- transformerx/layers/masks/base.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 transformerx/layers/masks/base.py diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py new file mode 100644 index 0000000..41efc1b --- /dev/null +++ b/transformerx/layers/masks/base.py @@ -0,0 +1,13 @@ +import tensorflow as tf + + +class BaseMask(tf.keras.layers.Layer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def build_mask(self, input_shape): + raise NotImplementedError("Subclasses must implement build_mask method") + + def call(self, inputs, *args, **kwargs): + mask = self.build_mask(inputs.shape) + return inputs * mask From bfdbca56f50d4e960678acbd2def911bf91352a0 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 2 May 2023 21:11:48 +0100 Subject: [PATCH 15/27] feat: adding AttentionMask --- transformerx/layers/masks/base.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 41efc1b..137122b 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -11,3 +11,13 @@ def build_mask(self, input_shape): def call(self, inputs, *args, **kwargs): mask = self.build_mask(inputs.shape) return inputs * mask + + +class AttentionMask(BaseMask): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def build_mask(self, input_shape): + seq_len = input_shape[1] + mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) + return tf.expand_dims(mask, axis=0) From b8b3e2883ca6bb61ed5746330a544385a2455878 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Wed, 3 May 2023 23:37:21 +0100 Subject: [PATCH 16/27] feat: Make AttentionMask running --- transformerx/layers/dot_product_attention.py | 2 +- transformerx/layers/masks/base.py | 28 +++++++++++++++++--- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index 7f17e37..6d4ad26 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -156,7 +156,7 @@ def call( # masked_attention_scores = tf.math.multiply(scores, gmask) # attention_probs = tf.nn.softmax(masked_attention_scores, axis=-1) # uncomment until here - + print("dot product: ", scores.shape) self.attention_weights = masked_softmax(scores, attention_mask) # self.attention_weights = tf.nn.softmax(scores, axis=-1, mask=attention_mask) # scores = tf.matmul(self.dropout(self.attention_weights, **kwargs), values) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 137122b..7107b3e 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -9,8 +9,8 @@ def build_mask(self, input_shape): raise NotImplementedError("Subclasses must implement build_mask method") def call(self, inputs, *args, **kwargs): - mask = self.build_mask(inputs.shape) - return inputs * mask + mask = 
self.build_mask(tf.shape(inputs)) + return tf.multiply(inputs, mask) class AttentionMask(BaseMask): @@ -19,5 +19,25 @@ def __init__(self, **kwargs): def build_mask(self, input_shape): seq_len = input_shape[1] - mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) - return tf.expand_dims(mask, axis=0) + print(tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)) + mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) * -1e9 + mask = tf.expand_dims(mask, axis=0) + # mask = tf.expand_dims(mask, axis=2) + # mask = tf.tile(mask, [input_shape[0], 1, 1]) + print(mask) + # return tf.expand_dims(mask, axis=0) + return mask + + +if __name__ == "__main__": + from transformerx.layers import DotProductAttention + + input_tensor = tf.random.uniform((2, 3, 6)) + attn_o, attn_w = DotProductAttention()(input_tensor, input_tensor, input_tensor) + print("attn_o.shape: ", attn_o.shape) + print("attn_w.shape:", attn_w.shape) + print("attn_w:", attn_w) + mask = AttentionMask() + output_tensor = mask(attn_w) + print(output_tensor) + print(tf.nn.softmax(output_tensor, axis=-1)) From 42819f0f0b9ce182e7092c27e0ce1daff4fd6649 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Thu, 4 May 2023 11:18:30 +0100 Subject: [PATCH 17/27] feat: AttentionMask compatible with Qs and Ks with different sequence lengths --- transformerx/layers/masks/base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 7107b3e..d75dbe8 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -18,9 +18,11 @@ def __init__(self, **kwargs): super().__init__(**kwargs) def build_mask(self, input_shape): - seq_len = input_shape[1] - print(tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)) - mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) * -1e9 + q_seq_len = input_shape[1] + k_seq_len = input_shape[2] + print("input_shape: ", input_shape[1]) + print(tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0)) + mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) * -1e9 mask = tf.expand_dims(mask, axis=0) # mask = tf.expand_dims(mask, axis=2) # mask = tf.tile(mask, [input_shape[0], 1, 1]) @@ -33,7 +35,8 @@ def build_mask(self, input_shape): from transformerx.layers import DotProductAttention input_tensor = tf.random.uniform((2, 3, 6)) - attn_o, attn_w = DotProductAttention()(input_tensor, input_tensor, input_tensor) + q_input_tensor = tf.random.uniform((2, 6, 6)) + attn_o, attn_w = DotProductAttention()(q_input_tensor, input_tensor, input_tensor) print("attn_o.shape: ", attn_o.shape) print("attn_w.shape:", attn_w.shape) print("attn_w:", attn_w) From 65a66e69783167d503c05e40b5d9911fcb0fcf02 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Fri, 5 May 2023 01:06:47 +0100 Subject: [PATCH 18/27] feat: Rename AttentionMask to LookAheadMask test multihead and dotproduct layers on LookAheadMask --- transformerx/layers/dot_product_attention.py | 1 + transformerx/layers/masks/base.py | 44 +++++++++++--------- transformerx/layers/multihead_attention.py | 5 ++- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index 6d4ad26..cd6f48d 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -157,6 +157,7 @@ def call( # attention_probs = tf.nn.softmax(masked_attention_scores, axis=-1) # uncomment 
until here print("dot product: ", scores.shape) + print("dot product q: ", queries.shape) self.attention_weights = masked_softmax(scores, attention_mask) # self.attention_weights = tf.nn.softmax(scores, axis=-1, mask=attention_mask) # scores = tf.matmul(self.dropout(self.attention_weights, **kwargs), values) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index d75dbe8..6c161ee 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -9,38 +9,44 @@ def build_mask(self, input_shape): raise NotImplementedError("Subclasses must implement build_mask method") def call(self, inputs, *args, **kwargs): - mask = self.build_mask(tf.shape(inputs)) + if len(inputs.shape) == 3: + m_inputs = tf.expand_dims(inputs, axis=1) + else: + m_inputs = inputs + mask = self.build_mask(tf.shape(m_inputs)) return tf.multiply(inputs, mask) -class AttentionMask(BaseMask): +class LookAheadMask(BaseMask): def __init__(self, **kwargs): super().__init__(**kwargs) def build_mask(self, input_shape): - q_seq_len = input_shape[1] - k_seq_len = input_shape[2] - print("input_shape: ", input_shape[1]) - print(tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0)) - mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) * -1e9 + q_seq_len = input_shape[2] + k_seq_len = input_shape[3] + mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) mask = tf.expand_dims(mask, axis=0) - # mask = tf.expand_dims(mask, axis=2) - # mask = tf.tile(mask, [input_shape[0], 1, 1]) - print(mask) - # return tf.expand_dims(mask, axis=0) return mask if __name__ == "__main__": - from transformerx.layers import DotProductAttention + from transformerx.layers import DotProductAttention, MultiHeadAttention - input_tensor = tf.random.uniform((2, 3, 6)) - q_input_tensor = tf.random.uniform((2, 6, 6)) + input_tensor = tf.random.uniform((2, 4, 6)) + q_input_tensor = tf.random.uniform((2, 4, 6)) attn_o, attn_w = DotProductAttention()(q_input_tensor, input_tensor, input_tensor) - print("attn_o.shape: ", attn_o.shape) - print("attn_w.shape:", attn_w.shape) - print("attn_w:", attn_w) - mask = AttentionMask() + # print("mask attn_o.shape: ", attn_o.shape) + # print("mask attn_w.shape:", attn_w.shape) + # print("mask attn_w:", attn_w) + mask = LookAheadMask() output_tensor = mask(attn_w) - print(output_tensor) + # print("masked ouptut shape: ", output_tensor.shape, output_tensor) + # print(tf.nn.softmax(output_tensor, axis=-1)) print(tf.nn.softmax(output_tensor, axis=-1)) + + multihead_attn = MultiHeadAttention(d_model=32, num_heads=4, dropout_rate=0.1) + output, attn_w = multihead_attn(q_input_tensor, input_tensor, input_tensor) + output_tensor = mask(attn_w) + # print("mask output_tensor.shape: ", output_tensor.shape) + # print("mask output_tensor.shape: ", attn_w) + # print(tf.nn.softmax(output_tensor, axis=-1)) diff --git a/transformerx/layers/multihead_attention.py b/transformerx/layers/multihead_attention.py index 9d5d66d..b7cf77a 100644 --- a/transformerx/layers/multihead_attention.py +++ b/transformerx/layers/multihead_attention.py @@ -298,7 +298,7 @@ def call( >>> values = tf.random.normal([batch_size, no_of_key_value_pairs, depth]) >>> valid_lens = tf.random.uniform([batch_size], minval=0, maxval=no_of_queries, dtype=tf.int32) - >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout=dropout) + >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout_rate=dropout) >>> output, attention_weights = 
multihead_attn(queries, keys, values, valid_lens) Here is an example of how to use the call method with a window mask: @@ -311,7 +311,7 @@ def call( >>> valid_lens = tf.random.uniform([batch_size], minval=0, maxval=no_of_queries, dtype=tf.int32) >>> window_mask = tf.random.uniform([batch_size, no_of_queries, no_of_key_value_pairs], 0, 2, dtype=tf.int32) - >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout=dropout) + >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout_rate=dropout) >>> output, attention_weights = multihead_attn(queries, keys, values, valid_lens, window_mask) """ @@ -333,6 +333,7 @@ def call( # Shape of output: (batch_size * num_heads, no. of queries, # depth / num_heads) + print("multihead q: ", queries.shape) attention_output, attention_weights = self.attention( queries, keys, values, attention_mask, **kwargs ) From f96e6a39dcd42e2e0d7e9fe860a255b5bf7bc83e Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Fri, 5 May 2023 01:18:06 +0100 Subject: [PATCH 19/27] Refactor: Modify BaseMask --- transformerx/layers/masks/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 6c161ee..d459a49 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -14,7 +14,8 @@ def call(self, inputs, *args, **kwargs): else: m_inputs = inputs mask = self.build_mask(tf.shape(m_inputs)) - return tf.multiply(inputs, mask) + print("mask: ", mask) + return tf.add(inputs, mask * -1e9) class LookAheadMask(BaseMask): @@ -40,7 +41,7 @@ def build_mask(self, input_shape): # print("mask attn_w:", attn_w) mask = LookAheadMask() output_tensor = mask(attn_w) - # print("masked ouptut shape: ", output_tensor.shape, output_tensor) + print("masked ouptut shape: ", output_tensor.shape, output_tensor) # print(tf.nn.softmax(output_tensor, axis=-1)) print(tf.nn.softmax(output_tensor, axis=-1)) From 7e69a143c9a39b78fedc56efbd54de3edd4afd44 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Sat, 6 May 2023 19:22:30 +0100 Subject: [PATCH 20/27] doc: new theme --- docs/source/conf.py | 6 ++++-- transformerx/layers/masks/base.py | 9 +++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index d9e42bf..7533eda 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,7 +24,8 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [ - "sphinx_rtd_theme", + # "sphinx_rtd_theme", + "furo", "sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.napoleon", @@ -49,7 +50,8 @@ exclude_patterns = [] -html_theme = "sphinx_rtd_theme" +# html_theme = "sphinx_rtd_theme" +html_theme = "furo" html_title = "TransformerX Documentation" html_show_sourcelink = False html_baseurl = "https://github.com/tensorops/transformerx" diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index d459a49..45fc2d7 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -30,6 +30,15 @@ def build_mask(self, input_shape): return mask +class PaddingMask(BaseMask): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def build_mask(self, input_shape): + mask = tf.cast(tf.math.equal(input_shape, 0), tf.float32) + return mask + + if __name__ == "__main__": from transformerx.layers import DotProductAttention, MultiHeadAttention From bcc57aae631130cbac306adfec6ca3940573da1f Mon Sep 
17 00:00:00 2001 From: Soran Ghaderi Date: Sat, 6 May 2023 19:23:31 +0100 Subject: [PATCH 21/27] feat: new implementation of PaddingMaskNew --- transformerx/layers/masks/base.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 45fc2d7..2a8a7a9 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -60,3 +60,24 @@ def build_mask(self, input_shape): # print("mask output_tensor.shape: ", output_tensor.shape) # print("mask output_tensor.shape: ", attn_w) # print(tf.nn.softmax(output_tensor, axis=-1)) + + +class PaddingMaskNew(tf.keras.layers.Layer): + def __init__(self, multi_head=True, **kwargs): + super(PaddingMask, self).__init__(**kwargs) + self.multi_head = multi_head + + def build(self, input_shape): + pass + + def call(self, inputs): + seq = tf.cast(tf.math.equal(inputs, 0), tf.float32) + seq = tf.expand_dims(seq, axis=1) + if self.multi_head: + seq = tf.expand_dims(seq, axis=1) + return seq + + def get_config(self): + config = super(PaddingMask, self).get_config() + config.update({"multi_head": self.multi_head}) + return config From ec0434f836f7e05587b01c4219e674db425873c1 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Sat, 6 May 2023 22:58:25 +0100 Subject: [PATCH 22/27] feat: implementation of SequencePadding --- transformerx/layers/masks/base.py | 35 +++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 2a8a7a9..847718c 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -63,15 +63,16 @@ def build_mask(self, input_shape): class PaddingMaskNew(tf.keras.layers.Layer): - def __init__(self, multi_head=True, **kwargs): + def __init__(self, multi_head=True, padding_value=0, **kwargs): super(PaddingMask, self).__init__(**kwargs) self.multi_head = multi_head + self.padding_value = padding_value def build(self, input_shape): pass def call(self, inputs): - seq = tf.cast(tf.math.equal(inputs, 0), tf.float32) + seq = tf.cast(tf.math.equal(inputs, self.padding_value), tf.float32) seq = tf.expand_dims(seq, axis=1) if self.multi_head: seq = tf.expand_dims(seq, axis=1) @@ -81,3 +82,33 @@ def get_config(self): config = super(PaddingMask, self).get_config() config.update({"multi_head": self.multi_head}) return config + + +class SequencePadding(tf.keras.layers.Layer): + def __init__(self, padding_value=0, max_sequence_length=None, **kwargs): + super(SequencePadding, self).__init__(**kwargs) + self.padding_value = padding_value + self.max_sequence_length = max_sequence_length + + def call(self, inputs): + if self.max_sequence_length is None: + max_sequence_length = tf.reduce_max(tf.shape(inputs)[1]) + else: + max_sequence_length = self.max_sequence_length + + padded_inputs = tf.pad( + inputs, + paddings=[[0, 0], [0, max_sequence_length - tf.shape(inputs)[1]]], + constant_values=self.padding_value, + ) + return padded_inputs + + def get_config(self): + config = super(SequencePadding, self).get_config() + config.update( + { + "padding_value": self.padding_value, + "max_sequence_length": self.max_sequence_length, + } + ) + return config From 380f9525ebe36b0a4c5f2d2cb1d6da5d351dc023 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Sat, 27 May 2023 00:36:41 +0100 Subject: [PATCH 23/27] refactor: Modify BaseMask --- transformerx/layers/masks/base.py | 104 ++++++++++++++---------------- 1 file changed, 48 insertions(+), 
56 deletions(-) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 847718c..635d4a4 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -5,16 +5,17 @@ class BaseMask(tf.keras.layers.Layer): def __init__(self, **kwargs): super().__init__(**kwargs) - def build_mask(self, input_shape): + def build_mask(self, inputs): raise NotImplementedError("Subclasses must implement build_mask method") def call(self, inputs, *args, **kwargs): - if len(inputs.shape) == 3: - m_inputs = tf.expand_dims(inputs, axis=1) + if len(inputs.shape) == 4: + pass + elif len(inputs.shape) == 3: + inputs = tf.expand_dims(inputs, axis=1) else: - m_inputs = inputs - mask = self.build_mask(tf.shape(m_inputs)) - print("mask: ", mask) + raise f"Invalid input shape. Expected 3D or 4D tensors, but received {len(inputs.shape)}D." + mask = self.build_mask() return tf.add(inputs, mask * -1e9) @@ -22,11 +23,13 @@ class LookAheadMask(BaseMask): def __init__(self, **kwargs): super().__init__(**kwargs) - def build_mask(self, input_shape): + def build_mask(self, inputs): + input_shape = tf.shape(inputs) q_seq_len = input_shape[2] k_seq_len = input_shape[3] mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) - mask = tf.expand_dims(mask, axis=0) + mask = tf.expand_dims(mask, axis=1) + mask = tf.expand_dims(mask, axis=1) return mask @@ -39,29 +42,6 @@ def build_mask(self, input_shape): return mask -if __name__ == "__main__": - from transformerx.layers import DotProductAttention, MultiHeadAttention - - input_tensor = tf.random.uniform((2, 4, 6)) - q_input_tensor = tf.random.uniform((2, 4, 6)) - attn_o, attn_w = DotProductAttention()(q_input_tensor, input_tensor, input_tensor) - # print("mask attn_o.shape: ", attn_o.shape) - # print("mask attn_w.shape:", attn_w.shape) - # print("mask attn_w:", attn_w) - mask = LookAheadMask() - output_tensor = mask(attn_w) - print("masked ouptut shape: ", output_tensor.shape, output_tensor) - # print(tf.nn.softmax(output_tensor, axis=-1)) - print(tf.nn.softmax(output_tensor, axis=-1)) - - multihead_attn = MultiHeadAttention(d_model=32, num_heads=4, dropout_rate=0.1) - output, attn_w = multihead_attn(q_input_tensor, input_tensor, input_tensor) - output_tensor = mask(attn_w) - # print("mask output_tensor.shape: ", output_tensor.shape) - # print("mask output_tensor.shape: ", attn_w) - # print(tf.nn.softmax(output_tensor, axis=-1)) - - class PaddingMaskNew(tf.keras.layers.Layer): def __init__(self, multi_head=True, padding_value=0, **kwargs): super(PaddingMask, self).__init__(**kwargs) @@ -84,31 +64,43 @@ def get_config(self): return config -class SequencePadding(tf.keras.layers.Layer): - def __init__(self, padding_value=0, max_sequence_length=None, **kwargs): - super(SequencePadding, self).__init__(**kwargs) - self.padding_value = padding_value - self.max_sequence_length = max_sequence_length +if __name__ == "__main__": + from transformerx.layers import DotProductAttention, MultiHeadAttention - def call(self, inputs): - if self.max_sequence_length is None: - max_sequence_length = tf.reduce_max(tf.shape(inputs)[1]) - else: - max_sequence_length = self.max_sequence_length + input_tensor = tf.random.uniform((2, 4, 6)) + q_input_tensor = tf.random.uniform((2, 4, 6)) + attn_o, attn_w = DotProductAttention()(q_input_tensor, input_tensor, input_tensor) + # print("mask attn_o.shape: ", attn_o.shape) + # print("mask attn_w.shape:", attn_w.shape) + # print("mask attn_w:", attn_w) + mask = LookAheadMask() + output_tensor = 
mask(attn_w) + # print("masked ouptut shape: ", output_tensor.shape, output_tensor) + # print(tf.nn.softmax(output_tensor, axis=-1)) - padded_inputs = tf.pad( - inputs, - paddings=[[0, 0], [0, max_sequence_length - tf.shape(inputs)[1]]], - constant_values=self.padding_value, - ) - return padded_inputs + # print(tf.nn.softmax(output_tensor, axis=-1)) - def get_config(self): - config = super(SequencePadding, self).get_config() - config.update( - { - "padding_value": self.padding_value, - "max_sequence_length": self.max_sequence_length, - } - ) - return config + multihead_attn = MultiHeadAttention(d_model=32, num_heads=4, dropout_rate=0.1) + output, attn_w = multihead_attn(q_input_tensor, input_tensor, input_tensor) + output_tensor = mask(attn_w) + # print("mask output_tensor.shape: ", output_tensor.shape) + # print("mask output_tensor.shape: ", attn_w) + # print(tf.nn.softmax(output_tensor, axis=-1)) + + data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]] + # Create a 2D tensor + data = tf.constant([[1, 2, 3], [4, 5, 6]]) + + # Convert the dataset to a tensor + # data_tensor = tf.constant(data, dtype=tf.float32) + + # Create a SequencePadding layer + sequence_padding_layer = PaddingLayer(0, 4) + + padded_data = sequence_padding_layer(data) + + # Create a PaddingMask layer + padding_mask_layer = PaddingMask() + + # Generate the padding mask + padding_mask = padding_mask_layer(padded_data) From 4ee7744b8b6298815eee8a9646aa9806cc61cb0a Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Sat, 27 May 2023 01:13:02 +0100 Subject: [PATCH 24/27] refactor: Modify all maskings they seem to work now --- transformerx/layers/masks/base.py | 58 ++++++++++++++++++------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 635d4a4..811de36 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -9,13 +9,13 @@ def build_mask(self, inputs): raise NotImplementedError("Subclasses must implement build_mask method") def call(self, inputs, *args, **kwargs): - if len(inputs.shape) == 4: + if tf.shape(inputs).shape == 4: pass - elif len(inputs.shape) == 3: + elif tf.shape(inputs).shape == 3: inputs = tf.expand_dims(inputs, axis=1) else: raise f"Invalid input shape. Expected 3D or 4D tensors, but received {len(inputs.shape)}D." 
- mask = self.build_mask() + mask = self.build_mask(inputs) return tf.add(inputs, mask * -1e9) @@ -25,11 +25,12 @@ def __init__(self, **kwargs): def build_mask(self, inputs): input_shape = tf.shape(inputs) - q_seq_len = input_shape[2] - k_seq_len = input_shape[3] + if input_shape.shape == 4: + print("input shape: ", input_shape) + k_seq_len = input_shape[3] + q_seq_len = input_shape[2] + mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) - mask = tf.expand_dims(mask, axis=1) - mask = tf.expand_dims(mask, axis=1) return mask @@ -37,8 +38,8 @@ class PaddingMask(BaseMask): def __init__(self, **kwargs): super().__init__(**kwargs) - def build_mask(self, input_shape): - mask = tf.cast(tf.math.equal(input_shape, 0), tf.float32) + def build_mask(self, inputs): + mask = tf.cast(tf.math.equal(inputs, 0), tf.float32) return mask @@ -69,23 +70,17 @@ def get_config(self): input_tensor = tf.random.uniform((2, 4, 6)) q_input_tensor = tf.random.uniform((2, 4, 6)) - attn_o, attn_w = DotProductAttention()(q_input_tensor, input_tensor, input_tensor) - # print("mask attn_o.shape: ", attn_o.shape) - # print("mask attn_w.shape:", attn_w.shape) - # print("mask attn_w:", attn_w) - mask = LookAheadMask() - output_tensor = mask(attn_w) - # print("masked ouptut shape: ", output_tensor.shape, output_tensor) - # print(tf.nn.softmax(output_tensor, axis=-1)) + attn_o, attn_w = DotProductAttention()(q_input_tensor, q_input_tensor, input_tensor) - # print(tf.nn.softmax(output_tensor, axis=-1)) + print("attn_w.shape: ", attn_w.shape) + la_mask = LookAheadMask() + output_tensor = la_mask(attn_w) + print(output_tensor.shape, output_tensor) multihead_attn = MultiHeadAttention(d_model=32, num_heads=4, dropout_rate=0.1) output, attn_w = multihead_attn(q_input_tensor, input_tensor, input_tensor) - output_tensor = mask(attn_w) - # print("mask output_tensor.shape: ", output_tensor.shape) - # print("mask output_tensor.shape: ", attn_w) - # print(tf.nn.softmax(output_tensor, axis=-1)) + output_tensor = la_mask(attn_w) + print(output_tensor.shape, output_tensor) data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]] # Create a 2D tensor @@ -95,12 +90,25 @@ def get_config(self): # data_tensor = tf.constant(data, dtype=tf.float32) # Create a SequencePadding layer - sequence_padding_layer = PaddingLayer(0, 4) + # sequence_padding_layer = PaddingLayer(0, 4) - padded_data = sequence_padding_layer(data) + # padded_data = sequence_padding_layer(data) + + # Test input + input_tensor = tf.constant( + [ + [[1, 2, 0], [4, 5, 6], [7, 8, 9], [0, 0, 0]], + [[1, 2, 3], [4, 5, 0], [0, 0, 0], [0, 0, 0]], + ], + dtype=tf.float32, + ) # Create a PaddingMask layer padding_mask_layer = PaddingMask() # Generate the padding mask - padding_mask = padding_mask_layer(padded_data) + padding_mask = padding_mask_layer(input_tensor) + print(padding_mask.shape, padding_mask) + + lad_mask = la_mask(input_tensor) + print(lad_mask.shape, lad_mask) From 141b075bff8de236e7f2ad93d5d03f0e375b3166 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 30 May 2023 00:36:36 +0100 Subject: [PATCH 25/27] refactor: Incorporate the new masking system Changed the causal (lookahead) mask to the new system of masking. 
Tested-by: Soran Ghaderi Acked-by: Soran Ghaderi --- transformerx/layers/dot_product_attention.py | 25 ++++++++++++-------- transformerx/layers/masks/__init__.py | 1 + transformerx/layers/masks/base.py | 25 ++++++++++++++------ 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index cd6f48d..ecb24d3 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -2,6 +2,7 @@ from transformerx.layers.masks.global_attention_mask import GlobalAttentionMask from transformerx.utils import masked_softmax +from transformerx.layers.masks import LookAheadMask class DotProductAttention(tf.keras.layers.Layer): @@ -137,18 +138,22 @@ def call( # apply causal mask if self.causal_mask: + # Obsolete version of masking. To be removed in the upcomming updates # seq_len = tf.shape(queries)[2] # heads = tf.shape(queries)[1] - batch_size, num_heads, seq_len, _ = tf.unstack(tf.shape(queries)) - causal_mask = tf.ones((num_heads, seq_len)) * -1e9 - causal_mask = tf.linalg.LinearOperatorLowerTriangular( - causal_mask - ).to_dense() - causal_mask = tf.expand_dims(causal_mask, axis=0) # add batch dimension - causal_mask = tf.broadcast_to( - tf.expand_dims(causal_mask, -1), tf.shape(scores) - ) # broadcast across batch dimension - scores = scores + causal_mask + # batch_size, num_heads, seq_len, _ = tf.unstack(tf.shape(queries)) + # causal_mask = tf.ones((num_heads, seq_len)) * -1e9 + # causal_mask = tf.linalg.LinearOperatorLowerTriangular( + # causal_mask + # ).to_dense() + # causal_mask = tf.expand_dims(causal_mask, axis=0) # add batch dimension + # causal_mask = tf.broadcast_to( + # tf.expand_dims(causal_mask, -1), tf.shape(scores) + # ) # broadcast across batch dimension + + # New version of masking + look_ahead_mask = LookAheadMask() + scores = look_ahead_mask(scores) # to be uncommented later # apply global mask diff --git a/transformerx/layers/masks/__init__.py b/transformerx/layers/masks/__init__.py index e69de29..80c100d 100644 --- a/transformerx/layers/masks/__init__.py +++ b/transformerx/layers/masks/__init__.py @@ -0,0 +1 @@ +from .base import LookAheadMask diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 811de36..a2c597d 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -30,16 +30,24 @@ def build_mask(self, inputs): k_seq_len = input_shape[3] q_seq_len = input_shape[2] - mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) + # mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) + mask = ( + 1 + - tf.linalg.LinearOperatorLowerTriangular( + tf.ones((q_seq_len, k_seq_len)), -1, 0 + ).to_dense() + ) return mask class PaddingMask(BaseMask): - def __init__(self, **kwargs): + def __init__(self, padding_value=0, multi_head=True, **kwargs): super().__init__(**kwargs) + self.padding_value = padding_value + self.multi_head = multi_head def build_mask(self, inputs): - mask = tf.cast(tf.math.equal(inputs, 0), tf.float32) + mask = tf.cast(tf.math.equal(inputs, self.padding_value), tf.float32) return mask @@ -79,7 +87,10 @@ def get_config(self): multihead_attn = MultiHeadAttention(d_model=32, num_heads=4, dropout_rate=0.1) output, attn_w = multihead_attn(q_input_tensor, input_tensor, input_tensor) - output_tensor = la_mask(attn_w) + + sample_input = tf.random.uniform((1, 1, 4, 2)) + # output_tensor = la_mask(attn_w) + output_tensor = 
la_mask(sample_input) print(output_tensor.shape, output_tensor) data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]] @@ -107,8 +118,8 @@ def get_config(self): padding_mask_layer = PaddingMask() # Generate the padding mask - padding_mask = padding_mask_layer(input_tensor) - print(padding_mask.shape, padding_mask) + # padding_mask = padding_mask_layer(input_tensor) + # print(padding_mask.shape, padding_mask) lad_mask = la_mask(input_tensor) - print(lad_mask.shape, lad_mask) + # print(lad_mask.shape, lad_mask) From 504a2fc7b8c9ece107180bb0a9c14f60acdbd456 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 30 May 2023 15:00:26 +0100 Subject: [PATCH 26/27] test: All tests are running --- tests/layers/test_addnorm.py | 3 +++ tests/layers/test_transformer_encoder.py | 6 ++++-- transformerx/layers/dot_product_attention.py | 3 +-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/layers/test_addnorm.py b/tests/layers/test_addnorm.py index 680b58e..932cfb2 100644 --- a/tests/layers/test_addnorm.py +++ b/tests/layers/test_addnorm.py @@ -4,6 +4,9 @@ from transformerx.layers import AddNorm +physical_devices = tf.config.list_physical_devices("GPU") +tf.config.experimental.set_memory_growth(physical_devices[0], True) + class TestAddNorm: def test_init(self): diff --git a/tests/layers/test_transformer_encoder.py b/tests/layers/test_transformer_encoder.py index 47954c8..d62779c 100644 --- a/tests/layers/test_transformer_encoder.py +++ b/tests/layers/test_transformer_encoder.py @@ -1,3 +1,5 @@ +import os + import pytest import tensorflow as tf import numpy as np @@ -129,9 +131,9 @@ def test_training(self, model): vocab_size=self.vocab_size, seq_length=self.seq_length, num_samples=100 ) history = model.fit( - x_train, y_train, epochs=50, batch_size=64, validation_split=0.2 + x_train, y_train, epochs=50, batch_size=16, validation_split=0.2 ) - tf.keras.mixed_precision.set_global_policy("mixed_float16") + # tf.keras.mixed_precision.set_global_policy("mixed_float16") assert ( history.history["accuracy"][-1] > 0.5 ), "Training accuracy should be greater than 0.5" diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index ecb24d3..0f68bda 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -161,8 +161,7 @@ def call( # masked_attention_scores = tf.math.multiply(scores, gmask) # attention_probs = tf.nn.softmax(masked_attention_scores, axis=-1) # uncomment until here - print("dot product: ", scores.shape) - print("dot product q: ", queries.shape) + self.attention_weights = masked_softmax(scores, attention_mask) # self.attention_weights = tf.nn.softmax(scores, axis=-1, mask=attention_mask) # scores = tf.matmul(self.dropout(self.attention_weights, **kwargs), values) From 4a4ada71d605b35ce36b9ada59c9ac6dc4aecd6d Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 30 May 2023 15:42:14 +0100 Subject: [PATCH 27/27] test: remove gpu memory allocation to run the test by Github actions --- tests/layers/test_addnorm.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/layers/test_addnorm.py b/tests/layers/test_addnorm.py index 932cfb2..680b58e 100644 --- a/tests/layers/test_addnorm.py +++ b/tests/layers/test_addnorm.py @@ -4,9 +4,6 @@ from transformerx.layers import AddNorm -physical_devices = tf.config.list_physical_devices("GPU") -tf.config.experimental.set_memory_growth(physical_devices[0], True) - class TestAddNorm: def test_init(self):
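
For reference, patches 21 and 22 introduce two related pieces: SequencePadding pads each batch along the time axis with tf.pad, and PaddingMaskNew marks positions equal to padding_value, expanding dims so the result broadcasts over attention heads. A minimal standalone sketch of that pattern in plain TensorFlow follows; the helper names are illustrative only and not part of the library's API.

import tensorflow as tf

# Pad token-id sequences to a fixed length (the tf.pad call mirrors SequencePadding),
# then derive a mask of shape (batch, 1, 1, seq_len) that flags padded slots with 1.0,
# which is what PaddingMaskNew produces when multi_head=True.
def pad_to_length(ids, max_len, padding_value=0):
    pad_amount = max_len - tf.shape(ids)[1]
    return tf.pad(ids, [[0, 0], [0, pad_amount]], constant_values=padding_value)

def padding_mask(ids, padding_value=0):
    mask = tf.cast(tf.equal(ids, padding_value), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

ids = tf.constant([[7, 3, 9], [5, 8, 6]])
padded = pad_to_length(ids, max_len=5)   # shape (2, 5), last two slots are 0
mask = padding_mask(padded)              # shape (2, 1, 1, 5)

The (batch, 1, 1, seq_len) shape comes from the two expand_dims calls at axis 1 in PaddingMaskNew and lets a single mask broadcast across every head and query position.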
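
Patches 23 through 25 then move causal masking out of DotProductAttention.call and into LookAheadMask, whose BaseMask.call adds mask * -1e9 to the attention scores before the softmax. Below is a standalone sketch of that additive scheme, assuming 4-D scores of shape (batch, heads, q_len, k_len); it illustrates the technique rather than the library's exact classes.

import tensorflow as tf

# Strict upper triangle = future positions; adding -1e9 there drives their
# softmax weight to ~0, so each query attends only to current and past keys.
def look_ahead_mask(q_len, k_len):
    return 1.0 - tf.linalg.band_part(tf.ones((q_len, k_len)), -1, 0)

scores = tf.random.uniform((2, 4, 5, 5))          # toy attention scores
masked = scores + look_ahead_mask(5, 5) * -1e9    # broadcasts over batch and heads
weights = tf.nn.softmax(masked, axis=-1)

Because broadcasting supplies the batch and head dimensions, the mask itself needs no extra dimensions, which is why patch 24 drops the two expand_dims calls from LookAheadMask.build_mask.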
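
Finally, patches 26 and 27 first enable GPU memory growth at the top of tests/layers/test_addnorm.py and then remove it again, since indexing physical_devices[0] raises an IndexError on the GPU-less GitHub Actions runners. If the setting is still wanted for local GPU runs, a guarded variant along these lines (a sketch, not part of this series) keeps the test importable everywhere:

import tensorflow as tf

# Enable memory growth only when a GPU is actually present, so the same test
# module also runs on CPU-only CI machines.
for gpu in tf.config.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)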