From cd17b8454d0ad3e8421c7ef1587c472792631be9 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 01:22:32 +0100 Subject: [PATCH 01/27] test: minor fix decoder test --- tests/layers/test_transformer_decoder.py | 49 +++++++++++++--------- transformerx/layers/transformer_decoder.py | 3 +- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/tests/layers/test_transformer_decoder.py b/tests/layers/test_transformer_decoder.py index 8466b28..d8fb743 100644 --- a/tests/layers/test_transformer_decoder.py +++ b/tests/layers/test_transformer_decoder.py @@ -107,27 +107,38 @@ def test_decoder_attention_weights_values(self, decoder, inputs): class TestTransformerDecoderIntegration: - seq_length = 10 - vocab_size = 32 + seq_length = 5 + vocab_size = 8 @staticmethod def create_toy_dataset( num_samples=1000, seq_length=10, vocab_size=64, num_classes=2 ): # x = np.random.randint(0, vocab_size, size=(num_samples, seq_length)) - x = np.random.normal( - vocab_size / 2, vocab_size / 2 - 1, size=(num_samples, seq_length) + # x = np.random.normal( + # (vocab_size // 2), (vocab_size // 2 - 1), size=(num_samples, seq_length) + # ) + x = tf.random.normal( + shape=(num_samples, seq_length), + mean=vocab_size // 2, + stddev=vocab_size / 2 - 3, ) - y = np.random.randint(0, 2, size=(num_samples, 1)) - y = np.random.normal(1, 1, size=(num_samples, seq_length)) - - x_train = tf.random.uniform( - shape=(num_samples, seq_length), maxval=vocab_size, dtype=tf.int32 - ) - y_train = tf.random.uniform( - shape=(num_samples, 1), maxval=num_classes, dtype=tf.int32 - ) - return x_train, y_train + # y = np.random.randint(0, 2, size=(num_samples, 1)) + # y = np.random.normal(1, 1, size=(num_samples, seq_length)) + y = tf.cast(tf.math.greater(x, vocab_size / 2), tf.int32) + print("x: ", x.shape) + print("y: ", y.shape) + print("vocab: ", vocab_size, vocab_size / 2, vocab_size // 2) + print("x: ", x[:5, :5]) + print("y: ", y[:5, :5]) + + # x_train = tf.random.normal( + # shape=(num_samples, seq_length), mean=vocab_size, dtype=tf.int32 + # ) + # y_train = tf.random.normal( + # shape=(num_samples, 1), maxval=num_classes, dtype=tf.int32 + # ) + return x, y @pytest.fixture(scope="class") def model(self): @@ -138,7 +149,7 @@ def model(self): vocab_size=self.vocab_size, maxlen_position_encoding=self.seq_length, num_heads=num_head, - d_model=64, + d_model=16, n_blocks=1, ) decoder = TransformerDecoder( @@ -155,10 +166,10 @@ def model(self): tgt_inputs = tf.keras.layers.Input(shape=(self.seq_length,)) enc_output, attn_weights = encoder(inputs) print("enc_ouput: ", enc_output.shape) - dec_output, attn_weights_dec = decoder(inputs, enc_output, enc_output) - predictions = tf.keras.layers.Dense(1, activation="softmax")(dec_output) + dec_output, attn_weights_dec = decoder(tgt_inputs, enc_output, enc_output) + predictions = tf.keras.layers.Dense(1, activation="sigmoid")(dec_output) - model = tf.keras.Model(inputs=[inputs], outputs=predictions) + model = tf.keras.Model(inputs=[inputs, tgt_inputs], outputs=predictions) model.compile( optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"] ) @@ -169,6 +180,6 @@ def test_model_creation(self, model): x, y = self.create_toy_dataset( num_samples=100, vocab_size=self.vocab_size, seq_length=self.seq_length ) - history = model.fit(x, y, epochs=50, batch_size=32, validation_split=0.2) + history = model.fit([x, x], y, epochs=100, batch_size=32, validation_split=0.2) assert isinstance(model, tf.keras.Model) assert model is not None diff --git 
a/transformerx/layers/transformer_decoder.py b/transformerx/layers/transformer_decoder.py index b570341..1277bd2 100644 --- a/transformerx/layers/transformer_decoder.py +++ b/transformerx/layers/transformer_decoder.py @@ -270,10 +270,9 @@ def apply_positional_embedding(self, inputs=None, **kwargs): ) def call(self, queries, keys, values, attention_mask=None, **kwargs): - queries = self.apply_positional_embedding(queries, **kwargs) + blk_outputs = self.apply_positional_embedding(queries, **kwargs) # keys = self.apply_positional_embedding(keys, **kwargs) # values = self.apply_positional_embedding(values, **kwargs) - blk_outputs = queries # self.attention_weights = [None] * len(self.blocks) self.attention_weights = [] for i, blk in enumerate(self.blocks): From 20eb4ba6b1ecffb6d792bea2d304e3b8395872eb Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 03:11:59 +0100 Subject: [PATCH 02/27] fix: __version__.py works properly --- transformerx/__version__.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/transformerx/__version__.py b/transformerx/__version__.py index cb001e5..2068d33 100644 --- a/transformerx/__version__.py +++ b/transformerx/__version__.py @@ -1,5 +1,10 @@ VERSION = (1, 0, 0, "beta", 3) -__version__ = '.'.join(map(str, VERSION)) +if len(VERSION) < 3: + raise ValueError("VERSION must have at least three elements") -print(__version__) \ No newline at end of file +__version__ = ".".join(str(v) for v in VERSION[:3]) +if len(VERSION) > 3: + __version__ += "-" + ".".join(str(v) for v in VERSION[3:]) +# version_str += "-dev" +print(__version__) From 60c828b1f9e64f381190094ab96873b156398ad6 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 21:44:05 +0100 Subject: [PATCH 03/27] doc: generate docs --- docs/Makefile | 20 +++++ docs/make.bat | 35 +++++++++ docs/source/Makefile | 20 +++++ docs/source/conf.py | 77 +++++++++++++++++++ docs/source/index.rst | 21 +++++ docs/source/make.bat | 35 +++++++++ docs/source/modules.rst | 7 ++ docs/source/transformerx.data_loader.rst | 4 + docs/source/transformerx.layers.addnorm.rst | 4 + ...nsformerx.layers.dot_product_attention.rst | 4 + ...erx.layers.masks.global_attention_mask.rst | 4 + docs/source/transformerx.layers.masks.rst | 15 ++++ ...ransformerx.layers.multihead_attention.rst | 4 + ...ransformerx.layers.positional_encoding.rst | 4 + .../transformerx.layers.positionwise_ffn.rst | 4 + docs/source/transformerx.layers.rst | 31 ++++++++ ...ransformerx.layers.transformer_decoder.rst | 4 + ...rmerx.layers.transformer_decoder_block.rst | 4 + ...ransformerx.layers.transformer_encoder.rst | 4 + ...rmerx.layers.transformer_encoder_block.rst | 4 + docs/source/transformerx.rst | 26 +++++++ docs/source/transformerx.training.base.rst | 4 + docs/source/transformerx.training.rst | 15 ++++ docs/source/transformerx.txplot.plot_pe.rst | 4 + docs/source/transformerx.txplot.rst | 15 ++++ docs/source/transformerx.utils.rst | 4 + requirements.txt | 2 +- transformerx/layers/dot_product_attention.py | 12 +-- 28 files changed, 380 insertions(+), 7 deletions(-) create mode 100644 docs/Makefile create mode 100644 docs/make.bat create mode 100644 docs/source/Makefile create mode 100644 docs/source/conf.py create mode 100644 docs/source/index.rst create mode 100644 docs/source/make.bat create mode 100644 docs/source/modules.rst create mode 100644 docs/source/transformerx.data_loader.rst create mode 100644 docs/source/transformerx.layers.addnorm.rst create mode 100644 
docs/source/transformerx.layers.dot_product_attention.rst create mode 100644 docs/source/transformerx.layers.masks.global_attention_mask.rst create mode 100644 docs/source/transformerx.layers.masks.rst create mode 100644 docs/source/transformerx.layers.multihead_attention.rst create mode 100644 docs/source/transformerx.layers.positional_encoding.rst create mode 100644 docs/source/transformerx.layers.positionwise_ffn.rst create mode 100644 docs/source/transformerx.layers.rst create mode 100644 docs/source/transformerx.layers.transformer_decoder.rst create mode 100644 docs/source/transformerx.layers.transformer_decoder_block.rst create mode 100644 docs/source/transformerx.layers.transformer_encoder.rst create mode 100644 docs/source/transformerx.layers.transformer_encoder_block.rst create mode 100644 docs/source/transformerx.rst create mode 100644 docs/source/transformerx.training.base.rst create mode 100644 docs/source/transformerx.training.rst create mode 100644 docs/source/transformerx.txplot.plot_pe.rst create mode 100644 docs/source/transformerx.txplot.rst create mode 100644 docs/source/transformerx.utils.rst diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..747ffb7 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/Makefile b/docs/source/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/source/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. 
$(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..d9e42bf --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,77 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html +import os +import sys + +# sys.path.insert(0, os.path.abspath("../../transformerx/")) +from datetime import datetime +from pygments.styles import get_style_by_name + +PYTHONPATH = "../../transformerx/" +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "TransformerX" +copyright = "2023, TensorOps" +author = "TensorOps" +release = "v1.0.0-rc" + +# style = get_style_by_name("friendly") +# style.background_color = "#f3f2f1" +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + "sphinx_rtd_theme", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.mathjax", + "sphinx_markdown_builder", +] + +templates_path = ["_templates"] + +napoleon_use_rtype = False + +napoleon_include_init_with_doc = True +napoleon_google_docstring = True +napoleon_use_param = True +napoleon_use_ivar = True + +# pygments_style = "friendly" + +language = "english" + + +exclude_patterns = [] + + +html_theme = "sphinx_rtd_theme" +html_title = "TransformerX Documentation" +html_show_sourcelink = False +html_baseurl = "https://github.com/tensorops/transformerx" + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +# html_theme = "alabaster" +html_static_path = ["_static"] + +html_theme_options = { + "enable_search_shortcuts": True, + "globaltoc_collapse": True, + "prev_next_buttons_location": "both", + # "style_nav_header_background": "#F5A603", + "navigation_depth": 2, + "collapse_navigation": True, + "sticky_navigation": False, + "logo_only": False, + "display_version": True, + "style_external_links": True, + "titles_only": True, +} + +napoleon_use_param = False diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..ebda9f1 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,21 @@ +.. TransformerX documentation master file, created by + sphinx-quickstart on Mon May 1 19:41:56 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to TransformerX's documentation! +======================================== + +.. toctree:: + :maxdepth: 4 + :caption: Contents: + + transformerx + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/source/make.bat b/docs/source/make.bat new file mode 100644 index 0000000..32bb245 --- /dev/null +++ b/docs/source/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/modules.rst b/docs/source/modules.rst new file mode 100644 index 0000000..7a7e678 --- /dev/null +++ b/docs/source/modules.rst @@ -0,0 +1,7 @@ +TransformerX +============ + +.. toctree:: + :maxdepth: 4 + + transformerx diff --git a/docs/source/transformerx.data_loader.rst b/docs/source/transformerx.data_loader.rst new file mode 100644 index 0000000..fe2c160 --- /dev/null +++ b/docs/source/transformerx.data_loader.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.data_loader + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.addnorm.rst b/docs/source/transformerx.layers.addnorm.rst new file mode 100644 index 0000000..9b85526 --- /dev/null +++ b/docs/source/transformerx.layers.addnorm.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.addnorm + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.dot_product_attention.rst b/docs/source/transformerx.layers.dot_product_attention.rst new file mode 100644 index 0000000..7ecd170 --- /dev/null +++ b/docs/source/transformerx.layers.dot_product_attention.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.dot_product_attention + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.masks.global_attention_mask.rst b/docs/source/transformerx.layers.masks.global_attention_mask.rst new file mode 100644 index 0000000..aac5735 --- /dev/null +++ b/docs/source/transformerx.layers.masks.global_attention_mask.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.masks.global_attention_mask + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.masks.rst b/docs/source/transformerx.layers.masks.rst new file mode 100644 index 0000000..a3bcbf7 --- /dev/null +++ b/docs/source/transformerx.layers.masks.rst @@ -0,0 +1,15 @@ +transformerx.layers.masks package +================================= + +.. automodule:: transformerx.layers.masks + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers.masks.global_attention_mask diff --git a/docs/source/transformerx.layers.multihead_attention.rst b/docs/source/transformerx.layers.multihead_attention.rst new file mode 100644 index 0000000..4c9bada --- /dev/null +++ b/docs/source/transformerx.layers.multihead_attention.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.multihead_attention + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.positional_encoding.rst b/docs/source/transformerx.layers.positional_encoding.rst new file mode 100644 index 0000000..96b96e4 --- /dev/null +++ b/docs/source/transformerx.layers.positional_encoding.rst @@ -0,0 +1,4 @@ +.. 
automodule:: transformerx.layers.positional_encoding + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.positionwise_ffn.rst b/docs/source/transformerx.layers.positionwise_ffn.rst new file mode 100644 index 0000000..eb6e534 --- /dev/null +++ b/docs/source/transformerx.layers.positionwise_ffn.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.positionwise_ffn + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.rst b/docs/source/transformerx.layers.rst new file mode 100644 index 0000000..c259fbd --- /dev/null +++ b/docs/source/transformerx.layers.rst @@ -0,0 +1,31 @@ +transformerx.layers package +=========================== + +.. automodule:: transformerx.layers + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers.masks + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers.addnorm + transformerx.layers.dot_product_attention + transformerx.layers.multihead_attention + transformerx.layers.positional_encoding + transformerx.layers.positionwise_ffn + transformerx.layers.transformer_decoder + transformerx.layers.transformer_decoder_block + transformerx.layers.transformer_encoder + transformerx.layers.transformer_encoder_block diff --git a/docs/source/transformerx.layers.transformer_decoder.rst b/docs/source/transformerx.layers.transformer_decoder.rst new file mode 100644 index 0000000..3fe4c8f --- /dev/null +++ b/docs/source/transformerx.layers.transformer_decoder.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_decoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.transformer_decoder_block.rst b/docs/source/transformerx.layers.transformer_decoder_block.rst new file mode 100644 index 0000000..d160b68 --- /dev/null +++ b/docs/source/transformerx.layers.transformer_decoder_block.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_decoder_block + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.transformer_encoder.rst b/docs/source/transformerx.layers.transformer_encoder.rst new file mode 100644 index 0000000..39e34da --- /dev/null +++ b/docs/source/transformerx.layers.transformer_encoder.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_encoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.transformer_encoder_block.rst b/docs/source/transformerx.layers.transformer_encoder_block.rst new file mode 100644 index 0000000..9416970 --- /dev/null +++ b/docs/source/transformerx.layers.transformer_encoder_block.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_encoder_block + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.rst b/docs/source/transformerx.rst new file mode 100644 index 0000000..e59b212 --- /dev/null +++ b/docs/source/transformerx.rst @@ -0,0 +1,26 @@ +transformerx package +==================== + +.. automodule:: transformerx + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers + transformerx.training + transformerx.txplot + +Submodules +---------- + +.. 
toctree:: + :maxdepth: 4 + + transformerx.data_loader + transformerx.utils diff --git a/docs/source/transformerx.training.base.rst b/docs/source/transformerx.training.base.rst new file mode 100644 index 0000000..e319c1b --- /dev/null +++ b/docs/source/transformerx.training.base.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.training.base + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.training.rst b/docs/source/transformerx.training.rst new file mode 100644 index 0000000..9b9e4bd --- /dev/null +++ b/docs/source/transformerx.training.rst @@ -0,0 +1,15 @@ +transformerx.training package +============================= + +.. automodule:: transformerx.training + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.training.base diff --git a/docs/source/transformerx.txplot.plot_pe.rst b/docs/source/transformerx.txplot.plot_pe.rst new file mode 100644 index 0000000..487d915 --- /dev/null +++ b/docs/source/transformerx.txplot.plot_pe.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.txplot.plot_pe + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.txplot.rst b/docs/source/transformerx.txplot.rst new file mode 100644 index 0000000..41dff07 --- /dev/null +++ b/docs/source/transformerx.txplot.rst @@ -0,0 +1,15 @@ +transformerx.txplot package +=========================== + +.. automodule:: transformerx.txplot + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.txplot.plot_pe diff --git a/docs/source/transformerx.utils.rst b/docs/source/transformerx.utils.rst new file mode 100644 index 0000000..17b0b90 --- /dev/null +++ b/docs/source/transformerx.utils.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/requirements.txt b/requirements.txt index 803d319..32cf0b6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Automatically generated by https://github.com/damnever/pigar. einops==0.4.1 -matplotlib==3.7.1 +# matplotlib==3.7.1 numpy==1.24.2 pytest==7.2.2 requests==2.28.2 diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index b944303..b5221fd 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -25,11 +25,14 @@ class DotProductAttention(tf.keras.layers.Layer): Notes ----- Dot-product attention formulation is as following: - .. math:: Attention(Q, K, V) = softmax(Q K^T) V + + .. math:: + Attention(Q, K, V) = softmax(Q K^T) V And scaled dot-product attention [1]_ is formulated as: - ..math:: Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V + .. 
math:: + Attention(Q, K, V) = softmax(\\frac{QK^T}{\\sqrt{d_k}}) V Examples @@ -43,7 +46,6 @@ class DotProductAttention(tf.keras.layers.Layer): [[[0.5418388 0.23626359] [0.4220487 0.394948 ] [0.6125364 0.12296485]] - [[0.17872103 0.5700011 ] [0.28264287 0.02290592] [0.24536102 0.39220297]]], shape=(2, 3, 2), dtype=float32) #random @@ -56,7 +58,6 @@ class DotProductAttention(tf.keras.layers.Layer): [[[0.45955482 0.63378114] [0.48054144 0.62751293] [0.43684354 0.64026886]] - [[0.82063836 0.2958246 ] [0.8300792 0.30486548] [0.83300924 0.30762452]]], shape=(2, 3, 2), dtype=float32) @@ -70,7 +71,6 @@ class DotProductAttention(tf.keras.layers.Layer): [[[0.5195807 0.6383675 ] [0.49765232 0.6440835 ] [0.5132934 0.64001364]] - [[0.6074392 0.80120546] [0.6098373 0.80074203] [0.5967663 0.7891044 ]]], shape=(2, 3, 2), dtype=float32) @@ -78,7 +78,7 @@ class DotProductAttention(tf.keras.layers.Layer): References ---------- .. [1] A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser, I. Polosukhin, Attention - is all you need, in: NIPS, pp. 5998–6008. + is all you need, in: NIPS, pp. 5998–6008. """ def __init__( From 859c6029265f948190f2e0be7d913674228b44c9 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 21:50:46 +0100 Subject: [PATCH 04/27] doc: add .readthedocs.yaml --- .readthedocs.yaml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .readthedocs.yaml diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..906a0af --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,29 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.9" + # You can also specify other tool versions: + # nodejs: "19" + # rust: "1.64" + # golang: "1.19" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + +# If using Sphinx, optionally build your docs in additional formats such as PDF +# formats: +# - pdf + +# Optionally declare the Python requirements required to build your docs +python: + install: + - requirements: docs/requirements.txt \ No newline at end of file From 0dfb443701094a4102186b1397a97264eb739831 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 21:54:28 +0100 Subject: [PATCH 05/27] doc: update .gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 272c02a..327d2d3 100644 --- a/.gitignore +++ b/.gitignore @@ -135,4 +135,6 @@ tase.toml /tase/unknown_errors.txt .idea /.idea/ -/certs/ \ No newline at end of file +/certs/ +/docs/source/ +/docs/build/doctrees/ From 2394d7e5dd19472ffca9549424ac9e7844af5560 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 22:00:57 +0100 Subject: [PATCH 06/27] doc: update .gitignore 2 --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 327d2d3..db599a5 100644 --- a/.gitignore +++ b/.gitignore @@ -138,3 +138,5 @@ tase.toml /certs/ /docs/source/ /docs/build/doctrees/ +*.bat +Makefile \ No newline at end of file From c2468250fbc2a9ecdd6e04c19fdbad42638a9123 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 22:07:27 +0100 Subject: [PATCH 07/27] doc: update .gitignore --- .gitignore | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/.gitignore b/.gitignore index db599a5..4fedaf5 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,7 @@ __pycache__/ # Distribution / packaging .Python -build/ +#build/ develop-eggs/ dist/ downloads/ @@ -138,5 +138,5 @@ tase.toml /certs/ /docs/source/ /docs/build/doctrees/ -*.bat +*.bats Makefile \ No newline at end of file From 9d61cfe24a82cc6f4155d15d2bd7d8b0f0a38a57 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 22:08:06 +0100 Subject: [PATCH 08/27] doc: update .gitignore 2 --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4fedaf5..10dd37b 100644 --- a/.gitignore +++ b/.gitignore @@ -139,4 +139,5 @@ tase.toml /docs/source/ /docs/build/doctrees/ *.bats -Makefile \ No newline at end of file +Makefile +source/ \ No newline at end of file From 1f3b3b411b5960dd1166ba8fd693c7f9dfd1fbbe Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 22:08:39 +0100 Subject: [PATCH 09/27] doc: update .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 10dd37b..4c6f29b 100644 --- a/.gitignore +++ b/.gitignore @@ -140,4 +140,5 @@ tase.toml /docs/build/doctrees/ *.bats Makefile -source/ \ No newline at end of file +source/ +/docs/source/ From e32412268118e69959795252081ea787c10f7d45 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Mon, 1 May 2023 22:09:08 +0100 Subject: [PATCH 10/27] doc: update .gitignore d --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 4c6f29b..98065dd 100644 --- a/.gitignore +++ b/.gitignore @@ -141,4 +141,4 @@ tase.toml *.bats Makefile source/ -/docs/source/ +/docs/source/* From 52b267b178a27929f12357d8c09655acd32d618c Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 2 May 2023 10:33:44 +0100 Subject: [PATCH 11/27] doc: adnorm tests running --- transformerx/layers/addnorm.py | 38 ++++++++++---------- transformerx/layers/dot_product_attention.py | 36 ++++++++++--------- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/transformerx/layers/addnorm.py b/transformerx/layers/addnorm.py index 403e6db..daae09f 100644 --- a/transformerx/layers/addnorm.py +++ b/transformerx/layers/addnorm.py @@ -46,7 +46,7 @@ class AddNorm(tf.keras.layers.Layer): Examples -------- >>> x = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) - >>> y = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) + >>> y = tf.constant(np.arange(10).reshape(5, 2) * 11, dtype=tf.float32) >>> print(x) tf.Tensor( [[ 0. 10.] @@ -56,14 +56,14 @@ class AddNorm(tf.keras.layers.Layer): [80. 90.]], shape=(5, 2), dtype=float32) >>> addnorm = AddNorm(norm_type='layer', norm_eps=1e-6, dropout_rate=0.2, activation='relu') - >>> output = addnorm([x, y]) + >>> output = addnorm(x, y) >>> print(output) tf.Tensor( - [[0. 0. ] - [4.1565704 3.2312596] - [9.174077 8.174077 ] - [14.191582 13.116871 ] - [19.209087 18.134377 ]], shape=(5, 2), dtype=float32) + [[0. 1.] + [0. 1.] + [0. 1.] + [0. 1.] + [0. 
1.]], shape=(5, 2), dtype=float32) References ---------- @@ -99,7 +99,7 @@ def __init__( if dropout_rate >= 1: raise ValueError("Dropout rate must be less than 1") - self.dropout = tf.keras.layers.Dropout(dropout_rate) + self.dropout = tf.keras.layers.Dropout(self.dropout_rate) # Regularizers self.kernel_regularizer = kernel_regularizer self.bias_regularizer = bias_regularizer @@ -154,10 +154,10 @@ def call(self, x: tf.Tensor, residual: tf.Tensor, **kwargs): ) # Apply dropout - residual = self.dropout(residual, training=kwargs.get("training", False)) + # residual = self.dropout(residual) # Add residual connection - x = tf.keras.layers.Add()([x, residual]) + x = tf.add(x, residual) # Apply normalization x = self.norm_layer(x) @@ -184,12 +184,12 @@ def get_config(self): return config -if __name__ == "__main__": - X = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) - Y = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) - - addnorm = AddNorm( - norm_type="layer", norm_eps=1e-6, dropout_rate=0.2, activation="relu" - ) - output = addnorm(X, X) - print(output) +# if __name__ == "__main__": +# x = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) +# y = tf.constant(np.arange(10, 20).reshape(5, 2) * 13, dtype=tf.float32) +# +# addnorm = AddNorm( +# norm_type="layer", norm_eps=1e-6, dropout_rate=0.2, activation="relu" +# ) +# output = addnorm(x, y) +# print(output) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index b5221fd..c226521 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -39,16 +39,17 @@ class DotProductAttention(tf.keras.layers.Layer): -------- Scaled dot-product (scaled multiplicative) self-attention of tensor `x` (we feed `x` to queries, keys, and values). - - >>> x = tf.cast(np.random.random([2, 3, 2]), dtype=tf.float32) + >>> tf.random.set_seed(1) + >>> x = tf.cast(tf.random.uniform([2, 3, 2]), dtype=tf.float32) >>> print(x) tf.Tensor( - [[[0.5418388 0.23626359] - [0.4220487 0.394948 ] - [0.6125364 0.12296485]] - [[0.17872103 0.5700011 ] - [0.28264287 0.02290592] - [0.24536102 0.39220297]]], shape=(2, 3, 2), dtype=float32) #random + [[[0.16513085 0.9014813 ] + [0.6309742 0.4345461 ] + [0.29193902 0.64250207]] + + [[0.9757855 0.43509948] + [0.6601019 0.60489583] + [0.6366315 0.6144488 ]]], shape=(2, 3, 2), dtype=float32) >>> dot_product = DotProductAttention(0.2) >>> queries, keys, values = x, x, x @@ -65,15 +66,16 @@ class DotProductAttention(tf.keras.layers.Layer): The next example shows the dot-product (multiplicative) self-attention of tensor `x`. 
>>> dot_product = DotProductAttention(dropout_rate=0.1, scaled=False) - >>> output = dot_product(queries, keys, values) + >>> output, attn_weights = dot_product(queries, keys, values) >>> print(output) - tf.Tensor( - [[[0.5195807 0.6383675 ] - [0.49765232 0.6440835 ] - [0.5132934 0.64001364]] - [[0.6074392 0.80120546] - [0.6098373 0.80074203] - [0.5967663 0.7891044 ]]], shape=(2, 3, 2), dtype=float32) + tf.Tensor: shape=(2, 3, 2), dtype=float32, numpy= + array([[[0.34450796, 0.6787753 ], + [0.36907017, 0.65472305], + [0.35440704, 0.66882825]], + + [[0.77042043, 0.5446019 ], + [0.7632908 , 0.5484005 ], + [0.7627964 , 0.5486638 ]]], dtype=float32) References ---------- @@ -160,7 +162,7 @@ def call( # self.attention_weights = tf.nn.softmax(scores, axis=-1, mask=attention_mask) # scores = tf.matmul(self.dropout(self.attention_weights, **kwargs), values) attention_output = tf.matmul(self.dropout(self.attention_weights), values) - + print(attention_output.shape, self.attention_weights.shape) return attention_output, self.attention_weights def get_attention_weights(self): From 6bad9cecf60bb93cdb9a362427d796a63c6a45ee Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 2 May 2023 10:46:43 +0100 Subject: [PATCH 12/27] doc: DotProduct tests running --- transformerx/layers/dot_product_attention.py | 31 ++++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index c226521..44fa7f8 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -53,29 +53,30 @@ class DotProductAttention(tf.keras.layers.Layer): >>> dot_product = DotProductAttention(0.2) >>> queries, keys, values = x, x, x - >>> output = dot_product(queries, keys, values) + >>> output, attn_weights = dot_product(queries, keys, values) >>> print(output) tf.Tensor( - [[[0.45955482 0.63378114] - [0.48054144 0.62751293] - [0.43684354 0.64026886]] - [[0.82063836 0.2958246 ] - [0.8300792 0.30486548] - [0.83300924 0.30762452]]], shape=(2, 3, 2), dtype=float32) + [[[0.34450796 0.6787753 ] + [0.36907017 0.65472305] + [0.35440704 0.66882825]] + + [[0.77042043 0.5446019 ] + [0.7632908 0.5484005 ] + [0.7627964 0.5486638 ]]], shape=(2, 3, 2), dtype=float32) The next example shows the dot-product (multiplicative) self-attention of tensor `x`. 
>>> dot_product = DotProductAttention(dropout_rate=0.1, scaled=False) >>> output, attn_weights = dot_product(queries, keys, values) >>> print(output) - tf.Tensor: shape=(2, 3, 2), dtype=float32, numpy= - array([[[0.34450796, 0.6787753 ], - [0.36907017, 0.65472305], - [0.35440704, 0.66882825]], + tf.Tensor( + [[[0.33704066 0.6868143 ] + [0.37176722 0.6526886 ] + [0.35094902 0.6727435 ]] - [[0.77042043, 0.5446019 ], - [0.7632908 , 0.5484005 ], - [0.7627964 , 0.5486638 ]]], dtype=float32) + [[0.7759446 0.54165894] + [0.7657266 0.54710305] + [0.7650213 0.5474789 ]]], shape=(2, 3, 2), dtype=float32) References ---------- @@ -148,7 +149,6 @@ def call( causal_mask = tf.broadcast_to( tf.expand_dims(causal_mask, -1), tf.shape(scores) ) # broadcast across batch dimension - # scores += scores = scores + causal_mask # to be uncommented later @@ -162,7 +162,6 @@ def call( # self.attention_weights = tf.nn.softmax(scores, axis=-1, mask=attention_mask) # scores = tf.matmul(self.dropout(self.attention_weights, **kwargs), values) attention_output = tf.matmul(self.dropout(self.attention_weights), values) - print(attention_output.shape, self.attention_weights.shape) return attention_output, self.attention_weights def get_attention_weights(self): From 1baa64027d2ec8dd9ad9f83b0224a9bcea5b35c0 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 2 May 2023 20:59:16 +0100 Subject: [PATCH 13/27] doc: DotProduct tests modification --- transformerx/layers/dot_product_attention.py | 1 - transformerx/layers/multihead_attention.py | 69 ++++++++++++-------- transformerx/utils.py | 6 +- 3 files changed, 45 insertions(+), 31 deletions(-) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index 44fa7f8..7f17e37 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -1,4 +1,3 @@ -import numpy as np import tensorflow as tf from transformerx.layers.masks.global_attention_mask import GlobalAttentionMask diff --git a/transformerx/layers/multihead_attention.py b/transformerx/layers/multihead_attention.py index c3fa618..9d5d66d 100644 --- a/transformerx/layers/multihead_attention.py +++ b/transformerx/layers/multihead_attention.py @@ -1,4 +1,3 @@ -import numpy as np import tensorflow as tf from einops import rearrange @@ -89,7 +88,9 @@ class MultiHeadAttention(tf.keras.layers.Layer): Returns ------- output: - Concatenated tensors + Concatenated tensors. Same shape as the queries. + attention_weights: + Optional tensor of attention weights. 
Methods ------- @@ -102,38 +103,50 @@ class MultiHeadAttention(tf.keras.layers.Layer): Examples -------- - >>> x = tf.constant(np.random.random([2, 3, 2]), dtype=tf.float32) - >>> multihead = MultiHeadAttention(d_model=8) - >>> print(multihead) - <__main__.MultiHeadAttention object at 0x7ff83c16bb80> + >>> import tensorflow as tf + >>> import random + >>> tf.random.set_seed(1) + >>> random.seed(42) - >>> output = multihead(x, x, x) + + >>> x = tf.constant(tf.random.uniform([2, 3, 2]), dtype=tf.float32) + >>> multihead = MultiHeadAttention(d_model=8, dropout_rate=0) + >>> print(type(multihead)) + + + >>> output, attn_weights = multihead(x, x, x) >>> print(output) tf.Tensor( - [[[ 0.2051548 0.32050014 0.2915167 -0.04056092 0.12072253 - 0.06477361 0.18725544 0.02056682] - [ 0.19823116 0.2983173 0.27711272 -0.04071879 0.11172265 - 0.06080601 0.18654731 0.00577436] - [ 0.19831955 0.30106473 0.27666807 -0.03963682 0.11234044 - 0.0615251 0.18657821 0.00680977]] - [[ 0.14630345 0.21267754 0.26289055 -0.10759152 0.03963668 - 0.04118761 0.11257525 0.05869889] - [ 0.14556082 0.21070784 0.26139364 -0.10755821 0.03894955 - 0.04060047 0.11260018 0.05745776] - [ 0.14547291 0.21081978 0.26109838 -0.10745162 0.03889 - 0.04069766 0.11251941 0.05741404]]], shape=(2, 3, 8), dtype=float32) - - >>> attention = MultiHeadAttention(d_model=16, num_heads=4, dropout=0.1) - >>> queries = tf.random.normal((3, 10, 16)) + [[[ 0.27276292 -0.2744614 -0.06085328 -0.03441356 -0.1577001 + 0.33375 -0.7894692 -0.33158925] + [ 0.2792416 -0.27180034 -0.06341933 -0.02869054 -0.15612581 + 0.33674437 -0.7850623 -0.3237151 ] + [ 0.274466 -0.27393326 -0.06170867 -0.03307929 -0.15757665 + 0.33440444 -0.78846383 -0.3293347 ]] + + [[ 0.44330204 -0.14170787 -0.1372787 0.3109271 -0.30478996 + 0.47728932 -0.8789958 -0.3304574 ] + [ 0.44153026 -0.14282975 -0.13679348 0.30881953 -0.30498797 + 0.476456 -0.8804113 -0.33254212] + [ 0.44139963 -0.14291355 -0.13675913 0.30866385 -0.3050046 + 0.4763937 -0.88051784 -0.3326969 ]]], shape=(2, 3, 8), dtype=float32) + + + + + + >>> tf.random.set_seed(1) + >>> attention = MultiHeadAttention(d_model=16, num_heads=4, dropout_rate=0.1) + >>> queries = tf.random.normal((3, 20, 16)) >>> keys = tf.random.normal((3, 20, 16)) >>> values = tf.random.normal((3, 20, 16)) - >>> valid_lens = tf.constant([10, 15, 20]) - >>> output, _ = attention(queries, keys, values, valid_lens) - >>> output.shape - (3, 10, 16) + >>> valid_lens = tf.constant([3, 20]) + >>> output, _ = attention(queries, keys, values) + >>> print(output.shape) + (3, 20, 16) - >>> window_mask = tf.ones((3, 10, 20)) - >>> output, _ = attention(queries, keys, values, valid_lens, window_mask=window_mask) + >>> window_mask = tf.ones((3, 10)) + >>> output, _ = attention(queries, keys, values, attention_mask=window_mask) >>> output.shape (3, 10, 16) diff --git a/transformerx/utils.py b/transformerx/utils.py index 91f14a2..50e35be 100644 --- a/transformerx/utils.py +++ b/transformerx/utils.py @@ -8,8 +8,8 @@ def sequence_mask(X, attention_mask, value=-1e9): raise TypeError("X must be a Tensor") if not isinstance(attention_mask, tf.Tensor): raise TypeError("attention_mask must be a Tensor") - if len(X.shape) not in (2, 3): - raise ValueError("X must be a 2D or 3D tensor") + if len(X.shape) not in (2, 3, 4): + raise ValueError("X must be a 2D, 3D, or 4D tensor") if len(attention_mask.shape) not in (1, 2): raise ValueError("attention_mask must be a 1D or 2D tensor") @@ -18,8 +18,10 @@ def sequence_mask(X, attention_mask, value=-1e9): mask = tf.range(start=0, 
limit=maxlen, dtype=tf.float32)[None, :] < tf.cast( attention_mask, dtype=tf.float32 ) + print("mask.shape: ", mask.shape, attention_mask.shape, X.shape) else: maxlen = X.shape[0] + print("attention_mask.shape: ", attention_mask.shape, X.shape) mask = tf.range(start=0, limit=maxlen, dtype=tf.float32) < tf.cast( attention_mask, dtype=tf.float32 ) From 32143f68e8e0cb5d0ba9244c334fa0dabdf51e56 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 2 May 2023 21:02:59 +0100 Subject: [PATCH 14/27] feat: adding BaseMask --- transformerx/layers/masks/base.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 transformerx/layers/masks/base.py diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py new file mode 100644 index 0000000..41efc1b --- /dev/null +++ b/transformerx/layers/masks/base.py @@ -0,0 +1,13 @@ +import tensorflow as tf + + +class BaseMask(tf.keras.layers.Layer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def build_mask(self, input_shape): + raise NotImplementedError("Subclasses must implement build_mask method") + + def call(self, inputs, *args, **kwargs): + mask = self.build_mask(inputs.shape) + return inputs * mask From bfdbca56f50d4e960678acbd2def911bf91352a0 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 2 May 2023 21:11:48 +0100 Subject: [PATCH 15/27] feat: adding AttentionMask --- transformerx/layers/masks/base.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 41efc1b..137122b 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -11,3 +11,13 @@ def build_mask(self, input_shape): def call(self, inputs, *args, **kwargs): mask = self.build_mask(inputs.shape) return inputs * mask + + +class AttentionMask(BaseMask): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def build_mask(self, input_shape): + seq_len = input_shape[1] + mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) + return tf.expand_dims(mask, axis=0) From b8b3e2883ca6bb61ed5746330a544385a2455878 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Wed, 3 May 2023 23:37:21 +0100 Subject: [PATCH 16/27] feat: Make AttentionMask running --- transformerx/layers/dot_product_attention.py | 2 +- transformerx/layers/masks/base.py | 28 +++++++++++++++++--- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index 7f17e37..6d4ad26 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -156,7 +156,7 @@ def call( # masked_attention_scores = tf.math.multiply(scores, gmask) # attention_probs = tf.nn.softmax(masked_attention_scores, axis=-1) # uncomment until here - + print("dot product: ", scores.shape) self.attention_weights = masked_softmax(scores, attention_mask) # self.attention_weights = tf.nn.softmax(scores, axis=-1, mask=attention_mask) # scores = tf.matmul(self.dropout(self.attention_weights, **kwargs), values) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 137122b..7107b3e 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -9,8 +9,8 @@ def build_mask(self, input_shape): raise NotImplementedError("Subclasses must implement build_mask method") def call(self, inputs, *args, **kwargs): - mask = self.build_mask(inputs.shape) - return inputs * mask + mask = 
self.build_mask(tf.shape(inputs)) + return tf.multiply(inputs, mask) class AttentionMask(BaseMask): @@ -19,5 +19,25 @@ def __init__(self, **kwargs): def build_mask(self, input_shape): seq_len = input_shape[1] - mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) - return tf.expand_dims(mask, axis=0) + print(tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)) + mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) * -1e9 + mask = tf.expand_dims(mask, axis=0) + # mask = tf.expand_dims(mask, axis=2) + # mask = tf.tile(mask, [input_shape[0], 1, 1]) + print(mask) + # return tf.expand_dims(mask, axis=0) + return mask + + +if __name__ == "__main__": + from transformerx.layers import DotProductAttention + + input_tensor = tf.random.uniform((2, 3, 6)) + attn_o, attn_w = DotProductAttention()(input_tensor, input_tensor, input_tensor) + print("attn_o.shape: ", attn_o.shape) + print("attn_w.shape:", attn_w.shape) + print("attn_w:", attn_w) + mask = AttentionMask() + output_tensor = mask(attn_w) + print(output_tensor) + print(tf.nn.softmax(output_tensor, axis=-1)) From 42819f0f0b9ce182e7092c27e0ce1daff4fd6649 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Thu, 4 May 2023 11:18:30 +0100 Subject: [PATCH 17/27] feat: AttentionMask compatible with Qs and Ks with different sequence lengths --- transformerx/layers/masks/base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 7107b3e..d75dbe8 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -18,9 +18,11 @@ def __init__(self, **kwargs): super().__init__(**kwargs) def build_mask(self, input_shape): - seq_len = input_shape[1] - print(tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)) - mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0) * -1e9 + q_seq_len = input_shape[1] + k_seq_len = input_shape[2] + print("input_shape: ", input_shape[1]) + print(tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0)) + mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) * -1e9 mask = tf.expand_dims(mask, axis=0) # mask = tf.expand_dims(mask, axis=2) # mask = tf.tile(mask, [input_shape[0], 1, 1]) @@ -33,7 +35,8 @@ def build_mask(self, input_shape): from transformerx.layers import DotProductAttention input_tensor = tf.random.uniform((2, 3, 6)) - attn_o, attn_w = DotProductAttention()(input_tensor, input_tensor, input_tensor) + q_input_tensor = tf.random.uniform((2, 6, 6)) + attn_o, attn_w = DotProductAttention()(q_input_tensor, input_tensor, input_tensor) print("attn_o.shape: ", attn_o.shape) print("attn_w.shape:", attn_w.shape) print("attn_w:", attn_w) From 65a66e69783167d503c05e40b5d9911fcb0fcf02 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Fri, 5 May 2023 01:06:47 +0100 Subject: [PATCH 18/27] feat: Rename AttentionMask to LookAheadMask test multihead and dotproduct layers on LookAheadMask --- transformerx/layers/dot_product_attention.py | 1 + transformerx/layers/masks/base.py | 44 +++++++++++--------- transformerx/layers/multihead_attention.py | 5 ++- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index 6d4ad26..cd6f48d 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -157,6 +157,7 @@ def call( # attention_probs = tf.nn.softmax(masked_attention_scores, axis=-1) # uncomment 
until here print("dot product: ", scores.shape) + print("dot product q: ", queries.shape) self.attention_weights = masked_softmax(scores, attention_mask) # self.attention_weights = tf.nn.softmax(scores, axis=-1, mask=attention_mask) # scores = tf.matmul(self.dropout(self.attention_weights, **kwargs), values) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index d75dbe8..6c161ee 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -9,38 +9,44 @@ def build_mask(self, input_shape): raise NotImplementedError("Subclasses must implement build_mask method") def call(self, inputs, *args, **kwargs): - mask = self.build_mask(tf.shape(inputs)) + if len(inputs.shape) == 3: + m_inputs = tf.expand_dims(inputs, axis=1) + else: + m_inputs = inputs + mask = self.build_mask(tf.shape(m_inputs)) return tf.multiply(inputs, mask) -class AttentionMask(BaseMask): +class LookAheadMask(BaseMask): def __init__(self, **kwargs): super().__init__(**kwargs) def build_mask(self, input_shape): - q_seq_len = input_shape[1] - k_seq_len = input_shape[2] - print("input_shape: ", input_shape[1]) - print(tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0)) - mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) * -1e9 + q_seq_len = input_shape[2] + k_seq_len = input_shape[3] + mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) mask = tf.expand_dims(mask, axis=0) - # mask = tf.expand_dims(mask, axis=2) - # mask = tf.tile(mask, [input_shape[0], 1, 1]) - print(mask) - # return tf.expand_dims(mask, axis=0) return mask if __name__ == "__main__": - from transformerx.layers import DotProductAttention + from transformerx.layers import DotProductAttention, MultiHeadAttention - input_tensor = tf.random.uniform((2, 3, 6)) - q_input_tensor = tf.random.uniform((2, 6, 6)) + input_tensor = tf.random.uniform((2, 4, 6)) + q_input_tensor = tf.random.uniform((2, 4, 6)) attn_o, attn_w = DotProductAttention()(q_input_tensor, input_tensor, input_tensor) - print("attn_o.shape: ", attn_o.shape) - print("attn_w.shape:", attn_w.shape) - print("attn_w:", attn_w) - mask = AttentionMask() + # print("mask attn_o.shape: ", attn_o.shape) + # print("mask attn_w.shape:", attn_w.shape) + # print("mask attn_w:", attn_w) + mask = LookAheadMask() output_tensor = mask(attn_w) - print(output_tensor) + # print("masked ouptut shape: ", output_tensor.shape, output_tensor) + # print(tf.nn.softmax(output_tensor, axis=-1)) print(tf.nn.softmax(output_tensor, axis=-1)) + + multihead_attn = MultiHeadAttention(d_model=32, num_heads=4, dropout_rate=0.1) + output, attn_w = multihead_attn(q_input_tensor, input_tensor, input_tensor) + output_tensor = mask(attn_w) + # print("mask output_tensor.shape: ", output_tensor.shape) + # print("mask output_tensor.shape: ", attn_w) + # print(tf.nn.softmax(output_tensor, axis=-1)) diff --git a/transformerx/layers/multihead_attention.py b/transformerx/layers/multihead_attention.py index 9d5d66d..b7cf77a 100644 --- a/transformerx/layers/multihead_attention.py +++ b/transformerx/layers/multihead_attention.py @@ -298,7 +298,7 @@ def call( >>> values = tf.random.normal([batch_size, no_of_key_value_pairs, depth]) >>> valid_lens = tf.random.uniform([batch_size], minval=0, maxval=no_of_queries, dtype=tf.int32) - >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout=dropout) + >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout_rate=dropout) >>> output, attention_weights = 
multihead_attn(queries, keys, values, valid_lens) Here is an example of how to use the call method with a window mask: @@ -311,7 +311,7 @@ def call( >>> valid_lens = tf.random.uniform([batch_size], minval=0, maxval=no_of_queries, dtype=tf.int32) >>> window_mask = tf.random.uniform([batch_size, no_of_queries, no_of_key_value_pairs], 0, 2, dtype=tf.int32) - >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout=dropout) + >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout_rate=dropout) >>> output, attention_weights = multihead_attn(queries, keys, values, valid_lens, window_mask) """ @@ -333,6 +333,7 @@ def call( # Shape of output: (batch_size * num_heads, no. of queries, # depth / num_heads) + print("multihead q: ", queries.shape) attention_output, attention_weights = self.attention( queries, keys, values, attention_mask, **kwargs ) From f96e6a39dcd42e2e0d7e9fe860a255b5bf7bc83e Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Fri, 5 May 2023 01:18:06 +0100 Subject: [PATCH 19/27] Refactor: Modify BaseMask --- transformerx/layers/masks/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 6c161ee..d459a49 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -14,7 +14,8 @@ def call(self, inputs, *args, **kwargs): else: m_inputs = inputs mask = self.build_mask(tf.shape(m_inputs)) - return tf.multiply(inputs, mask) + print("mask: ", mask) + return tf.add(inputs, mask * -1e9) class LookAheadMask(BaseMask): @@ -40,7 +41,7 @@ def build_mask(self, input_shape): # print("mask attn_w:", attn_w) mask = LookAheadMask() output_tensor = mask(attn_w) - # print("masked ouptut shape: ", output_tensor.shape, output_tensor) + print("masked ouptut shape: ", output_tensor.shape, output_tensor) # print(tf.nn.softmax(output_tensor, axis=-1)) print(tf.nn.softmax(output_tensor, axis=-1)) From 7e69a143c9a39b78fedc56efbd54de3edd4afd44 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Sat, 6 May 2023 19:22:30 +0100 Subject: [PATCH 20/27] doc: new theme --- docs/source/conf.py | 6 ++++-- transformerx/layers/masks/base.py | 9 +++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index d9e42bf..7533eda 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -24,7 +24,8 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration extensions = [ - "sphinx_rtd_theme", + # "sphinx_rtd_theme", + "furo", "sphinx.ext.autodoc", "sphinx.ext.autosummary", "sphinx.ext.napoleon", @@ -49,7 +50,8 @@ exclude_patterns = [] -html_theme = "sphinx_rtd_theme" +# html_theme = "sphinx_rtd_theme" +html_theme = "furo" html_title = "TransformerX Documentation" html_show_sourcelink = False html_baseurl = "https://github.com/tensorops/transformerx" diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index d459a49..45fc2d7 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -30,6 +30,15 @@ def build_mask(self, input_shape): return mask +class PaddingMask(BaseMask): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def build_mask(self, input_shape): + mask = tf.cast(tf.math.equal(input_shape, 0), tf.float32) + return mask + + if __name__ == "__main__": from transformerx.layers import DotProductAttention, MultiHeadAttention From bcc57aae631130cbac306adfec6ca3940573da1f Mon Sep 
17 00:00:00 2001 From: Soran Ghaderi Date: Sat, 6 May 2023 19:23:31 +0100 Subject: [PATCH 21/27] feat: new implementation of PaddingMaskNew --- transformerx/layers/masks/base.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 45fc2d7..2a8a7a9 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -60,3 +60,24 @@ def build_mask(self, input_shape): # print("mask output_tensor.shape: ", output_tensor.shape) # print("mask output_tensor.shape: ", attn_w) # print(tf.nn.softmax(output_tensor, axis=-1)) + + +class PaddingMaskNew(tf.keras.layers.Layer): + def __init__(self, multi_head=True, **kwargs): + super(PaddingMask, self).__init__(**kwargs) + self.multi_head = multi_head + + def build(self, input_shape): + pass + + def call(self, inputs): + seq = tf.cast(tf.math.equal(inputs, 0), tf.float32) + seq = tf.expand_dims(seq, axis=1) + if self.multi_head: + seq = tf.expand_dims(seq, axis=1) + return seq + + def get_config(self): + config = super(PaddingMask, self).get_config() + config.update({"multi_head": self.multi_head}) + return config From ec0434f836f7e05587b01c4219e674db425873c1 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Sat, 6 May 2023 22:58:25 +0100 Subject: [PATCH 22/27] feat: implementation of SequencePadding --- transformerx/layers/masks/base.py | 35 +++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 2a8a7a9..847718c 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -63,15 +63,16 @@ def build_mask(self, input_shape): class PaddingMaskNew(tf.keras.layers.Layer): - def __init__(self, multi_head=True, **kwargs): + def __init__(self, multi_head=True, padding_value=0, **kwargs): super(PaddingMask, self).__init__(**kwargs) self.multi_head = multi_head + self.padding_value = padding_value def build(self, input_shape): pass def call(self, inputs): - seq = tf.cast(tf.math.equal(inputs, 0), tf.float32) + seq = tf.cast(tf.math.equal(inputs, self.padding_value), tf.float32) seq = tf.expand_dims(seq, axis=1) if self.multi_head: seq = tf.expand_dims(seq, axis=1) @@ -81,3 +82,33 @@ def get_config(self): config = super(PaddingMask, self).get_config() config.update({"multi_head": self.multi_head}) return config + + +class SequencePadding(tf.keras.layers.Layer): + def __init__(self, padding_value=0, max_sequence_length=None, **kwargs): + super(SequencePadding, self).__init__(**kwargs) + self.padding_value = padding_value + self.max_sequence_length = max_sequence_length + + def call(self, inputs): + if self.max_sequence_length is None: + max_sequence_length = tf.reduce_max(tf.shape(inputs)[1]) + else: + max_sequence_length = self.max_sequence_length + + padded_inputs = tf.pad( + inputs, + paddings=[[0, 0], [0, max_sequence_length - tf.shape(inputs)[1]]], + constant_values=self.padding_value, + ) + return padded_inputs + + def get_config(self): + config = super(SequencePadding, self).get_config() + config.update( + { + "padding_value": self.padding_value, + "max_sequence_length": self.max_sequence_length, + } + ) + return config From 380f9525ebe36b0a4c5f2d2cb1d6da5d351dc023 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Sat, 27 May 2023 00:36:41 +0100 Subject: [PATCH 23/27] refactor: Modify BaseMask --- transformerx/layers/masks/base.py | 104 ++++++++++++++---------------- 1 file changed, 48 insertions(+), 
56 deletions(-) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 847718c..635d4a4 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -5,16 +5,17 @@ class BaseMask(tf.keras.layers.Layer): def __init__(self, **kwargs): super().__init__(**kwargs) - def build_mask(self, input_shape): + def build_mask(self, inputs): raise NotImplementedError("Subclasses must implement build_mask method") def call(self, inputs, *args, **kwargs): - if len(inputs.shape) == 3: - m_inputs = tf.expand_dims(inputs, axis=1) + if len(inputs.shape) == 4: + pass + elif len(inputs.shape) == 3: + inputs = tf.expand_dims(inputs, axis=1) else: - m_inputs = inputs - mask = self.build_mask(tf.shape(m_inputs)) - print("mask: ", mask) + raise f"Invalid input shape. Expected 3D or 4D tensors, but received {len(inputs.shape)}D." + mask = self.build_mask() return tf.add(inputs, mask * -1e9) @@ -22,11 +23,13 @@ class LookAheadMask(BaseMask): def __init__(self, **kwargs): super().__init__(**kwargs) - def build_mask(self, input_shape): + def build_mask(self, inputs): + input_shape = tf.shape(inputs) q_seq_len = input_shape[2] k_seq_len = input_shape[3] mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) - mask = tf.expand_dims(mask, axis=0) + mask = tf.expand_dims(mask, axis=1) + mask = tf.expand_dims(mask, axis=1) return mask @@ -39,29 +42,6 @@ def build_mask(self, input_shape): return mask -if __name__ == "__main__": - from transformerx.layers import DotProductAttention, MultiHeadAttention - - input_tensor = tf.random.uniform((2, 4, 6)) - q_input_tensor = tf.random.uniform((2, 4, 6)) - attn_o, attn_w = DotProductAttention()(q_input_tensor, input_tensor, input_tensor) - # print("mask attn_o.shape: ", attn_o.shape) - # print("mask attn_w.shape:", attn_w.shape) - # print("mask attn_w:", attn_w) - mask = LookAheadMask() - output_tensor = mask(attn_w) - print("masked ouptut shape: ", output_tensor.shape, output_tensor) - # print(tf.nn.softmax(output_tensor, axis=-1)) - print(tf.nn.softmax(output_tensor, axis=-1)) - - multihead_attn = MultiHeadAttention(d_model=32, num_heads=4, dropout_rate=0.1) - output, attn_w = multihead_attn(q_input_tensor, input_tensor, input_tensor) - output_tensor = mask(attn_w) - # print("mask output_tensor.shape: ", output_tensor.shape) - # print("mask output_tensor.shape: ", attn_w) - # print(tf.nn.softmax(output_tensor, axis=-1)) - - class PaddingMaskNew(tf.keras.layers.Layer): def __init__(self, multi_head=True, padding_value=0, **kwargs): super(PaddingMask, self).__init__(**kwargs) @@ -84,31 +64,43 @@ def get_config(self): return config -class SequencePadding(tf.keras.layers.Layer): - def __init__(self, padding_value=0, max_sequence_length=None, **kwargs): - super(SequencePadding, self).__init__(**kwargs) - self.padding_value = padding_value - self.max_sequence_length = max_sequence_length +if __name__ == "__main__": + from transformerx.layers import DotProductAttention, MultiHeadAttention - def call(self, inputs): - if self.max_sequence_length is None: - max_sequence_length = tf.reduce_max(tf.shape(inputs)[1]) - else: - max_sequence_length = self.max_sequence_length + input_tensor = tf.random.uniform((2, 4, 6)) + q_input_tensor = tf.random.uniform((2, 4, 6)) + attn_o, attn_w = DotProductAttention()(q_input_tensor, input_tensor, input_tensor) + # print("mask attn_o.shape: ", attn_o.shape) + # print("mask attn_w.shape:", attn_w.shape) + # print("mask attn_w:", attn_w) + mask = LookAheadMask() + output_tensor = 
mask(attn_w) + # print("masked ouptut shape: ", output_tensor.shape, output_tensor) + # print(tf.nn.softmax(output_tensor, axis=-1)) - padded_inputs = tf.pad( - inputs, - paddings=[[0, 0], [0, max_sequence_length - tf.shape(inputs)[1]]], - constant_values=self.padding_value, - ) - return padded_inputs + # print(tf.nn.softmax(output_tensor, axis=-1)) - def get_config(self): - config = super(SequencePadding, self).get_config() - config.update( - { - "padding_value": self.padding_value, - "max_sequence_length": self.max_sequence_length, - } - ) - return config + multihead_attn = MultiHeadAttention(d_model=32, num_heads=4, dropout_rate=0.1) + output, attn_w = multihead_attn(q_input_tensor, input_tensor, input_tensor) + output_tensor = mask(attn_w) + # print("mask output_tensor.shape: ", output_tensor.shape) + # print("mask output_tensor.shape: ", attn_w) + # print(tf.nn.softmax(output_tensor, axis=-1)) + + data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]] + # Create a 2D tensor + data = tf.constant([[1, 2, 3], [4, 5, 6]]) + + # Convert the dataset to a tensor + # data_tensor = tf.constant(data, dtype=tf.float32) + + # Create a SequencePadding layer + sequence_padding_layer = PaddingLayer(0, 4) + + padded_data = sequence_padding_layer(data) + + # Create a PaddingMask layer + padding_mask_layer = PaddingMask() + + # Generate the padding mask + padding_mask = padding_mask_layer(padded_data) From 4ee7744b8b6298815eee8a9646aa9806cc61cb0a Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Sat, 27 May 2023 01:13:02 +0100 Subject: [PATCH 24/27] refactor: Modify all maskings they seem to work now --- transformerx/layers/masks/base.py | 58 ++++++++++++++++++------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 635d4a4..811de36 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -9,13 +9,13 @@ def build_mask(self, inputs): raise NotImplementedError("Subclasses must implement build_mask method") def call(self, inputs, *args, **kwargs): - if len(inputs.shape) == 4: + if tf.shape(inputs).shape == 4: pass - elif len(inputs.shape) == 3: + elif tf.shape(inputs).shape == 3: inputs = tf.expand_dims(inputs, axis=1) else: raise f"Invalid input shape. Expected 3D or 4D tensors, but received {len(inputs.shape)}D." 
- mask = self.build_mask() + mask = self.build_mask(inputs) return tf.add(inputs, mask * -1e9) @@ -25,11 +25,12 @@ def __init__(self, **kwargs): def build_mask(self, inputs): input_shape = tf.shape(inputs) - q_seq_len = input_shape[2] - k_seq_len = input_shape[3] + if input_shape.shape == 4: + print("input shape: ", input_shape) + k_seq_len = input_shape[3] + q_seq_len = input_shape[2] + mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) - mask = tf.expand_dims(mask, axis=1) - mask = tf.expand_dims(mask, axis=1) return mask @@ -37,8 +38,8 @@ class PaddingMask(BaseMask): def __init__(self, **kwargs): super().__init__(**kwargs) - def build_mask(self, input_shape): - mask = tf.cast(tf.math.equal(input_shape, 0), tf.float32) + def build_mask(self, inputs): + mask = tf.cast(tf.math.equal(inputs, 0), tf.float32) return mask @@ -69,23 +70,17 @@ def get_config(self): input_tensor = tf.random.uniform((2, 4, 6)) q_input_tensor = tf.random.uniform((2, 4, 6)) - attn_o, attn_w = DotProductAttention()(q_input_tensor, input_tensor, input_tensor) - # print("mask attn_o.shape: ", attn_o.shape) - # print("mask attn_w.shape:", attn_w.shape) - # print("mask attn_w:", attn_w) - mask = LookAheadMask() - output_tensor = mask(attn_w) - # print("masked ouptut shape: ", output_tensor.shape, output_tensor) - # print(tf.nn.softmax(output_tensor, axis=-1)) + attn_o, attn_w = DotProductAttention()(q_input_tensor, q_input_tensor, input_tensor) - # print(tf.nn.softmax(output_tensor, axis=-1)) + print("attn_w.shape: ", attn_w.shape) + la_mask = LookAheadMask() + output_tensor = la_mask(attn_w) + print(output_tensor.shape, output_tensor) multihead_attn = MultiHeadAttention(d_model=32, num_heads=4, dropout_rate=0.1) output, attn_w = multihead_attn(q_input_tensor, input_tensor, input_tensor) - output_tensor = mask(attn_w) - # print("mask output_tensor.shape: ", output_tensor.shape) - # print("mask output_tensor.shape: ", attn_w) - # print(tf.nn.softmax(output_tensor, axis=-1)) + output_tensor = la_mask(attn_w) + print(output_tensor.shape, output_tensor) data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]] # Create a 2D tensor @@ -95,12 +90,25 @@ def get_config(self): # data_tensor = tf.constant(data, dtype=tf.float32) # Create a SequencePadding layer - sequence_padding_layer = PaddingLayer(0, 4) + # sequence_padding_layer = PaddingLayer(0, 4) - padded_data = sequence_padding_layer(data) + # padded_data = sequence_padding_layer(data) + + # Test input + input_tensor = tf.constant( + [ + [[1, 2, 0], [4, 5, 6], [7, 8, 9], [0, 0, 0]], + [[1, 2, 3], [4, 5, 0], [0, 0, 0], [0, 0, 0]], + ], + dtype=tf.float32, + ) # Create a PaddingMask layer padding_mask_layer = PaddingMask() # Generate the padding mask - padding_mask = padding_mask_layer(padded_data) + padding_mask = padding_mask_layer(input_tensor) + print(padding_mask.shape, padding_mask) + + lad_mask = la_mask(input_tensor) + print(lad_mask.shape, lad_mask) From 141b075bff8de236e7f2ad93d5d03f0e375b3166 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 30 May 2023 00:36:36 +0100 Subject: [PATCH 25/27] refactor: Incorporate the new masking system Changed the causal (lookahead) mask to the new system of masking. 
Tested-by: Soran Ghaderi Acked-by: Soran Ghaderi --- transformerx/layers/dot_product_attention.py | 25 ++++++++++++-------- transformerx/layers/masks/__init__.py | 1 + transformerx/layers/masks/base.py | 25 ++++++++++++++------ 3 files changed, 34 insertions(+), 17 deletions(-) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index cd6f48d..ecb24d3 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -2,6 +2,7 @@ from transformerx.layers.masks.global_attention_mask import GlobalAttentionMask from transformerx.utils import masked_softmax +from transformerx.layers.masks import LookAheadMask class DotProductAttention(tf.keras.layers.Layer): @@ -137,18 +138,22 @@ def call( # apply causal mask if self.causal_mask: + # Obsolete version of masking. To be removed in the upcomming updates # seq_len = tf.shape(queries)[2] # heads = tf.shape(queries)[1] - batch_size, num_heads, seq_len, _ = tf.unstack(tf.shape(queries)) - causal_mask = tf.ones((num_heads, seq_len)) * -1e9 - causal_mask = tf.linalg.LinearOperatorLowerTriangular( - causal_mask - ).to_dense() - causal_mask = tf.expand_dims(causal_mask, axis=0) # add batch dimension - causal_mask = tf.broadcast_to( - tf.expand_dims(causal_mask, -1), tf.shape(scores) - ) # broadcast across batch dimension - scores = scores + causal_mask + # batch_size, num_heads, seq_len, _ = tf.unstack(tf.shape(queries)) + # causal_mask = tf.ones((num_heads, seq_len)) * -1e9 + # causal_mask = tf.linalg.LinearOperatorLowerTriangular( + # causal_mask + # ).to_dense() + # causal_mask = tf.expand_dims(causal_mask, axis=0) # add batch dimension + # causal_mask = tf.broadcast_to( + # tf.expand_dims(causal_mask, -1), tf.shape(scores) + # ) # broadcast across batch dimension + + # New version of masking + look_ahead_mask = LookAheadMask() + scores = look_ahead_mask(scores) # to be uncommented later # apply global mask diff --git a/transformerx/layers/masks/__init__.py b/transformerx/layers/masks/__init__.py index e69de29..80c100d 100644 --- a/transformerx/layers/masks/__init__.py +++ b/transformerx/layers/masks/__init__.py @@ -0,0 +1 @@ +from .base import LookAheadMask diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py index 811de36..a2c597d 100644 --- a/transformerx/layers/masks/base.py +++ b/transformerx/layers/masks/base.py @@ -30,16 +30,24 @@ def build_mask(self, inputs): k_seq_len = input_shape[3] q_seq_len = input_shape[2] - mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) + # mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) + mask = ( + 1 + - tf.linalg.LinearOperatorLowerTriangular( + tf.ones((q_seq_len, k_seq_len)), -1, 0 + ).to_dense() + ) return mask class PaddingMask(BaseMask): - def __init__(self, **kwargs): + def __init__(self, padding_value=0, multi_head=True, **kwargs): super().__init__(**kwargs) + self.padding_value = padding_value + self.multi_head = multi_head def build_mask(self, inputs): - mask = tf.cast(tf.math.equal(inputs, 0), tf.float32) + mask = tf.cast(tf.math.equal(inputs, self.padding_value), tf.float32) return mask @@ -79,7 +87,10 @@ def get_config(self): multihead_attn = MultiHeadAttention(d_model=32, num_heads=4, dropout_rate=0.1) output, attn_w = multihead_attn(q_input_tensor, input_tensor, input_tensor) - output_tensor = la_mask(attn_w) + + sample_input = tf.random.uniform((1, 1, 4, 2)) + # output_tensor = la_mask(attn_w) + output_tensor = 
la_mask(sample_input) print(output_tensor.shape, output_tensor) data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]] @@ -107,8 +118,8 @@ def get_config(self): padding_mask_layer = PaddingMask() # Generate the padding mask - padding_mask = padding_mask_layer(input_tensor) - print(padding_mask.shape, padding_mask) + # padding_mask = padding_mask_layer(input_tensor) + # print(padding_mask.shape, padding_mask) lad_mask = la_mask(input_tensor) - print(lad_mask.shape, lad_mask) + # print(lad_mask.shape, lad_mask) From 504a2fc7b8c9ece107180bb0a9c14f60acdbd456 Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 30 May 2023 15:00:26 +0100 Subject: [PATCH 26/27] test: All tests are running --- tests/layers/test_addnorm.py | 3 +++ tests/layers/test_transformer_encoder.py | 6 ++++-- transformerx/layers/dot_product_attention.py | 3 +-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/layers/test_addnorm.py b/tests/layers/test_addnorm.py index 680b58e..932cfb2 100644 --- a/tests/layers/test_addnorm.py +++ b/tests/layers/test_addnorm.py @@ -4,6 +4,9 @@ from transformerx.layers import AddNorm +physical_devices = tf.config.list_physical_devices("GPU") +tf.config.experimental.set_memory_growth(physical_devices[0], True) + class TestAddNorm: def test_init(self): diff --git a/tests/layers/test_transformer_encoder.py b/tests/layers/test_transformer_encoder.py index 47954c8..d62779c 100644 --- a/tests/layers/test_transformer_encoder.py +++ b/tests/layers/test_transformer_encoder.py @@ -1,3 +1,5 @@ +import os + import pytest import tensorflow as tf import numpy as np @@ -129,9 +131,9 @@ def test_training(self, model): vocab_size=self.vocab_size, seq_length=self.seq_length, num_samples=100 ) history = model.fit( - x_train, y_train, epochs=50, batch_size=64, validation_split=0.2 + x_train, y_train, epochs=50, batch_size=16, validation_split=0.2 ) - tf.keras.mixed_precision.set_global_policy("mixed_float16") + # tf.keras.mixed_precision.set_global_policy("mixed_float16") assert ( history.history["accuracy"][-1] > 0.5 ), "Training accuracy should be greater than 0.5" diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index ecb24d3..0f68bda 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -161,8 +161,7 @@ def call( # masked_attention_scores = tf.math.multiply(scores, gmask) # attention_probs = tf.nn.softmax(masked_attention_scores, axis=-1) # uncomment until here - print("dot product: ", scores.shape) - print("dot product q: ", queries.shape) + self.attention_weights = masked_softmax(scores, attention_mask) # self.attention_weights = tf.nn.softmax(scores, axis=-1, mask=attention_mask) # scores = tf.matmul(self.dropout(self.attention_weights, **kwargs), values) From 4a4ada71d605b35ce36b9ada59c9ac6dc4aecd6d Mon Sep 17 00:00:00 2001 From: Soran Ghaderi Date: Tue, 30 May 2023 15:42:14 +0100 Subject: [PATCH 27/27] test: remove gpu memory allocation to run the test by Github actions --- tests/layers/test_addnorm.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/layers/test_addnorm.py b/tests/layers/test_addnorm.py index 932cfb2..680b58e 100644 --- a/tests/layers/test_addnorm.py +++ b/tests/layers/test_addnorm.py @@ -4,9 +4,6 @@ from transformerx.layers import AddNorm -physical_devices = tf.config.list_physical_devices("GPU") -tf.config.experimental.set_memory_growth(physical_devices[0], True) - class TestAddNorm: def test_init(self):
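
For reference, patches 21 and 22 introduce two related pieces: SequencePadding pads each batch along the time axis with tf.pad, and PaddingMaskNew marks positions equal to padding_value, expanding dims so the result broadcasts over attention heads. A minimal standalone sketch of that pattern in plain TensorFlow follows; the helper names are illustrative only and not part of the library's API.

import tensorflow as tf

# Pad token-id sequences to a fixed length (the tf.pad call mirrors SequencePadding),
# then derive a mask of shape (batch, 1, 1, seq_len) that flags padded slots with 1.0,
# which is what PaddingMaskNew produces when multi_head=True.
def pad_to_length(ids, max_len, padding_value=0):
    pad_amount = max_len - tf.shape(ids)[1]
    return tf.pad(ids, [[0, 0], [0, pad_amount]], constant_values=padding_value)

def padding_mask(ids, padding_value=0):
    mask = tf.cast(tf.equal(ids, padding_value), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]

ids = tf.constant([[7, 3, 9], [5, 8, 6]])
padded = pad_to_length(ids, max_len=5)   # shape (2, 5), last two slots are 0
mask = padding_mask(padded)              # shape (2, 1, 1, 5)

The (batch, 1, 1, seq_len) shape comes from the two expand_dims calls at axis 1 in PaddingMaskNew and lets a single mask broadcast across every head and query position.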
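
Patches 23 through 25 then move causal masking out of DotProductAttention.call and into LookAheadMask, whose BaseMask.call adds mask * -1e9 to the attention scores before the softmax. Below is a standalone sketch of that additive scheme, assuming 4-D scores of shape (batch, heads, q_len, k_len); it illustrates the technique rather than the library's exact classes.

import tensorflow as tf

# Strict upper triangle = future positions; adding -1e9 there drives their
# softmax weight to ~0, so each query attends only to current and past keys.
def look_ahead_mask(q_len, k_len):
    return 1.0 - tf.linalg.band_part(tf.ones((q_len, k_len)), -1, 0)

scores = tf.random.uniform((2, 4, 5, 5))          # toy attention scores
masked = scores + look_ahead_mask(5, 5) * -1e9    # broadcasts over batch and heads
weights = tf.nn.softmax(masked, axis=-1)

Because broadcasting supplies the batch and head dimensions, the mask itself needs no extra dimensions, which is why patch 24 drops the two expand_dims calls from LookAheadMask.build_mask.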
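
Finally, patches 26 and 27 first enable GPU memory growth at the top of tests/layers/test_addnorm.py and then remove it again, since indexing physical_devices[0] raises an IndexError on the GPU-less GitHub Actions runners. If the setting is still wanted for local GPU runs, a guarded variant along these lines (a sketch, not part of this series) keeps the test importable everywhere:

import tensorflow as tf

# Enable memory growth only when a GPU is actually present, so the same test
# module also runs on CPU-only CI machines.
for gpu in tf.config.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)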