diff --git a/.gitignore b/.gitignore index 272c02a..98065dd 100644 --- a/.gitignore +++ b/.gitignore @@ -9,7 +9,7 @@ __pycache__/ # Distribution / packaging .Python -build/ +#build/ develop-eggs/ dist/ downloads/ @@ -135,4 +135,10 @@ tase.toml /tase/unknown_errors.txt .idea /.idea/ -/certs/ \ No newline at end of file +/certs/ +/docs/source/ +/docs/build/doctrees/ +*.bats +Makefile +source/ +/docs/source/* diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 0000000..906a0af --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,29 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.9" + # You can also specify other tool versions: + # nodejs: "19" + # rust: "1.64" + # golang: "1.19" + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: docs/conf.py + +# If using Sphinx, optionally build your docs in additional formats such as PDF +# formats: +# - pdf + +# Optionally declare the Python requirements required to build your docs +python: + install: + - requirements: docs/requirements.txt \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..747ffb7 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/Makefile b/docs/source/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/source/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". 
+help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source/conf.py b/docs/source/conf.py new file mode 100644 index 0000000..7533eda --- /dev/null +++ b/docs/source/conf.py @@ -0,0 +1,79 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html +import os +import sys + +# sys.path.insert(0, os.path.abspath("../../transformerx/")) +from datetime import datetime +from pygments.styles import get_style_by_name + +PYTHONPATH = "../../transformerx/" +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "TransformerX" +copyright = "2023, TensorOps" +author = "TensorOps" +release = "v1.0.0-rc" + +# style = get_style_by_name("friendly") +# style.background_color = "#f3f2f1" +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [ + # "sphinx_rtd_theme", + "furo", + "sphinx.ext.autodoc", + "sphinx.ext.autosummary", + "sphinx.ext.napoleon", + "sphinx.ext.mathjax", + "sphinx_markdown_builder", +] + +templates_path = ["_templates"] + +napoleon_use_rtype = False + +napoleon_include_init_with_doc = True +napoleon_google_docstring = True +napoleon_use_param = True +napoleon_use_ivar = True + +# pygments_style = "friendly" + +language = "english" + + +exclude_patterns = [] + + +# html_theme = "sphinx_rtd_theme" +html_theme = "furo" +html_title = "TransformerX Documentation" +html_show_sourcelink = False +html_baseurl = "https://github.com/tensorops/transformerx" + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +# html_theme = "alabaster" +html_static_path = ["_static"] + +html_theme_options = { + "enable_search_shortcuts": True, + "globaltoc_collapse": True, + "prev_next_buttons_location": "both", + # "style_nav_header_background": "#F5A603", + "navigation_depth": 2, + "collapse_navigation": True, + "sticky_navigation": False, + "logo_only": False, + "display_version": True, + "style_external_links": True, + "titles_only": True, +} + +napoleon_use_param = False diff --git a/docs/source/index.rst b/docs/source/index.rst new file mode 100644 index 0000000..ebda9f1 --- /dev/null +++ b/docs/source/index.rst @@ -0,0 +1,21 @@ +.. TransformerX documentation master file, created by + sphinx-quickstart on Mon May 1 19:41:56 2023. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to TransformerX's documentation! +======================================== + +.. 
toctree:: + :maxdepth: 4 + :caption: Contents: + + transformerx + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/source/make.bat b/docs/source/make.bat new file mode 100644 index 0000000..32bb245 --- /dev/null +++ b/docs/source/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/source/modules.rst b/docs/source/modules.rst new file mode 100644 index 0000000..7a7e678 --- /dev/null +++ b/docs/source/modules.rst @@ -0,0 +1,7 @@ +TransformerX +============ + +.. toctree:: + :maxdepth: 4 + + transformerx diff --git a/docs/source/transformerx.data_loader.rst b/docs/source/transformerx.data_loader.rst new file mode 100644 index 0000000..fe2c160 --- /dev/null +++ b/docs/source/transformerx.data_loader.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.data_loader + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.addnorm.rst b/docs/source/transformerx.layers.addnorm.rst new file mode 100644 index 0000000..9b85526 --- /dev/null +++ b/docs/source/transformerx.layers.addnorm.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.addnorm + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.dot_product_attention.rst b/docs/source/transformerx.layers.dot_product_attention.rst new file mode 100644 index 0000000..7ecd170 --- /dev/null +++ b/docs/source/transformerx.layers.dot_product_attention.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.dot_product_attention + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.masks.global_attention_mask.rst b/docs/source/transformerx.layers.masks.global_attention_mask.rst new file mode 100644 index 0000000..aac5735 --- /dev/null +++ b/docs/source/transformerx.layers.masks.global_attention_mask.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.masks.global_attention_mask + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.masks.rst b/docs/source/transformerx.layers.masks.rst new file mode 100644 index 0000000..a3bcbf7 --- /dev/null +++ b/docs/source/transformerx.layers.masks.rst @@ -0,0 +1,15 @@ +transformerx.layers.masks package +================================= + +.. automodule:: transformerx.layers.masks + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers.masks.global_attention_mask diff --git a/docs/source/transformerx.layers.multihead_attention.rst b/docs/source/transformerx.layers.multihead_attention.rst new file mode 100644 index 0000000..4c9bada --- /dev/null +++ b/docs/source/transformerx.layers.multihead_attention.rst @@ -0,0 +1,4 @@ +.. 
automodule:: transformerx.layers.multihead_attention + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.positional_encoding.rst b/docs/source/transformerx.layers.positional_encoding.rst new file mode 100644 index 0000000..96b96e4 --- /dev/null +++ b/docs/source/transformerx.layers.positional_encoding.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.positional_encoding + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.positionwise_ffn.rst b/docs/source/transformerx.layers.positionwise_ffn.rst new file mode 100644 index 0000000..eb6e534 --- /dev/null +++ b/docs/source/transformerx.layers.positionwise_ffn.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.positionwise_ffn + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.rst b/docs/source/transformerx.layers.rst new file mode 100644 index 0000000..c259fbd --- /dev/null +++ b/docs/source/transformerx.layers.rst @@ -0,0 +1,31 @@ +transformerx.layers package +=========================== + +.. automodule:: transformerx.layers + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers.masks + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers.addnorm + transformerx.layers.dot_product_attention + transformerx.layers.multihead_attention + transformerx.layers.positional_encoding + transformerx.layers.positionwise_ffn + transformerx.layers.transformer_decoder + transformerx.layers.transformer_decoder_block + transformerx.layers.transformer_encoder + transformerx.layers.transformer_encoder_block diff --git a/docs/source/transformerx.layers.transformer_decoder.rst b/docs/source/transformerx.layers.transformer_decoder.rst new file mode 100644 index 0000000..3fe4c8f --- /dev/null +++ b/docs/source/transformerx.layers.transformer_decoder.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_decoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.transformer_decoder_block.rst b/docs/source/transformerx.layers.transformer_decoder_block.rst new file mode 100644 index 0000000..d160b68 --- /dev/null +++ b/docs/source/transformerx.layers.transformer_decoder_block.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_decoder_block + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.transformer_encoder.rst b/docs/source/transformerx.layers.transformer_encoder.rst new file mode 100644 index 0000000..39e34da --- /dev/null +++ b/docs/source/transformerx.layers.transformer_encoder.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_encoder + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.layers.transformer_encoder_block.rst b/docs/source/transformerx.layers.transformer_encoder_block.rst new file mode 100644 index 0000000..9416970 --- /dev/null +++ b/docs/source/transformerx.layers.transformer_encoder_block.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.layers.transformer_encoder_block + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.rst b/docs/source/transformerx.rst new file mode 100644 index 0000000..e59b212 --- /dev/null +++ b/docs/source/transformerx.rst @@ -0,0 +1,26 @@ +transformerx package +==================== + +.. 
automodule:: transformerx + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. toctree:: + :maxdepth: 4 + + transformerx.layers + transformerx.training + transformerx.txplot + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.data_loader + transformerx.utils diff --git a/docs/source/transformerx.training.base.rst b/docs/source/transformerx.training.base.rst new file mode 100644 index 0000000..e319c1b --- /dev/null +++ b/docs/source/transformerx.training.base.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.training.base + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.training.rst b/docs/source/transformerx.training.rst new file mode 100644 index 0000000..9b9e4bd --- /dev/null +++ b/docs/source/transformerx.training.rst @@ -0,0 +1,15 @@ +transformerx.training package +============================= + +.. automodule:: transformerx.training + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.training.base diff --git a/docs/source/transformerx.txplot.plot_pe.rst b/docs/source/transformerx.txplot.plot_pe.rst new file mode 100644 index 0000000..487d915 --- /dev/null +++ b/docs/source/transformerx.txplot.plot_pe.rst @@ -0,0 +1,4 @@ +.. automodule:: transformerx.txplot.plot_pe + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/source/transformerx.txplot.rst b/docs/source/transformerx.txplot.rst new file mode 100644 index 0000000..41dff07 --- /dev/null +++ b/docs/source/transformerx.txplot.rst @@ -0,0 +1,15 @@ +transformerx.txplot package +=========================== + +.. automodule:: transformerx.txplot + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + transformerx.txplot.plot_pe diff --git a/docs/source/transformerx.utils.rst b/docs/source/transformerx.utils.rst new file mode 100644 index 0000000..17b0b90 --- /dev/null +++ b/docs/source/transformerx.utils.rst @@ -0,0 +1,4 @@ +.. 
automodule:: transformerx.utils + :members: + :undoc-members: + :show-inheritance: diff --git a/tests/layers/test_transformer_decoder.py b/tests/layers/test_transformer_decoder.py index 8466b28..d8fb743 100644 --- a/tests/layers/test_transformer_decoder.py +++ b/tests/layers/test_transformer_decoder.py @@ -107,27 +107,38 @@ def test_decoder_attention_weights_values(self, decoder, inputs): class TestTransformerDecoderIntegration: - seq_length = 10 - vocab_size = 32 + seq_length = 5 + vocab_size = 8 @staticmethod def create_toy_dataset( num_samples=1000, seq_length=10, vocab_size=64, num_classes=2 ): # x = np.random.randint(0, vocab_size, size=(num_samples, seq_length)) - x = np.random.normal( - vocab_size / 2, vocab_size / 2 - 1, size=(num_samples, seq_length) + # x = np.random.normal( + # (vocab_size // 2), (vocab_size // 2 - 1), size=(num_samples, seq_length) + # ) + x = tf.random.normal( + shape=(num_samples, seq_length), + mean=vocab_size // 2, + stddev=vocab_size / 2 - 3, ) - y = np.random.randint(0, 2, size=(num_samples, 1)) - y = np.random.normal(1, 1, size=(num_samples, seq_length)) - - x_train = tf.random.uniform( - shape=(num_samples, seq_length), maxval=vocab_size, dtype=tf.int32 - ) - y_train = tf.random.uniform( - shape=(num_samples, 1), maxval=num_classes, dtype=tf.int32 - ) - return x_train, y_train + # y = np.random.randint(0, 2, size=(num_samples, 1)) + # y = np.random.normal(1, 1, size=(num_samples, seq_length)) + y = tf.cast(tf.math.greater(x, vocab_size / 2), tf.int32) + print("x: ", x.shape) + print("y: ", y.shape) + print("vocab: ", vocab_size, vocab_size / 2, vocab_size // 2) + print("x: ", x[:5, :5]) + print("y: ", y[:5, :5]) + + # x_train = tf.random.normal( + # shape=(num_samples, seq_length), mean=vocab_size, dtype=tf.int32 + # ) + # y_train = tf.random.normal( + # shape=(num_samples, 1), maxval=num_classes, dtype=tf.int32 + # ) + return x, y @pytest.fixture(scope="class") def model(self): @@ -138,7 +149,7 @@ def model(self): vocab_size=self.vocab_size, maxlen_position_encoding=self.seq_length, num_heads=num_head, - d_model=64, + d_model=16, n_blocks=1, ) decoder = TransformerDecoder( @@ -155,10 +166,10 @@ def model(self): tgt_inputs = tf.keras.layers.Input(shape=(self.seq_length,)) enc_output, attn_weights = encoder(inputs) print("enc_ouput: ", enc_output.shape) - dec_output, attn_weights_dec = decoder(inputs, enc_output, enc_output) - predictions = tf.keras.layers.Dense(1, activation="softmax")(dec_output) + dec_output, attn_weights_dec = decoder(tgt_inputs, enc_output, enc_output) + predictions = tf.keras.layers.Dense(1, activation="sigmoid")(dec_output) - model = tf.keras.Model(inputs=[inputs], outputs=predictions) + model = tf.keras.Model(inputs=[inputs, tgt_inputs], outputs=predictions) model.compile( optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"] ) @@ -169,6 +180,6 @@ def test_model_creation(self, model): x, y = self.create_toy_dataset( num_samples=100, vocab_size=self.vocab_size, seq_length=self.seq_length ) - history = model.fit(x, y, epochs=50, batch_size=32, validation_split=0.2) + history = model.fit([x, x], y, epochs=100, batch_size=32, validation_split=0.2) assert isinstance(model, tf.keras.Model) assert model is not None diff --git a/tests/layers/test_transformer_encoder.py b/tests/layers/test_transformer_encoder.py index 47954c8..d62779c 100644 --- a/tests/layers/test_transformer_encoder.py +++ b/tests/layers/test_transformer_encoder.py @@ -1,3 +1,5 @@ +import os + import pytest import tensorflow as tf import numpy as np @@ 
-129,9 +131,9 @@ def test_training(self, model): vocab_size=self.vocab_size, seq_length=self.seq_length, num_samples=100 ) history = model.fit( - x_train, y_train, epochs=50, batch_size=64, validation_split=0.2 + x_train, y_train, epochs=50, batch_size=16, validation_split=0.2 ) - tf.keras.mixed_precision.set_global_policy("mixed_float16") + # tf.keras.mixed_precision.set_global_policy("mixed_float16") assert ( history.history["accuracy"][-1] > 0.5 ), "Training accuracy should be greater than 0.5" diff --git a/transformerx/__version__.py b/transformerx/__version__.py index cb001e5..2068d33 100644 --- a/transformerx/__version__.py +++ b/transformerx/__version__.py @@ -1,5 +1,10 @@ VERSION = (1, 0, 0, "beta", 3) -__version__ = '.'.join(map(str, VERSION)) +if len(VERSION) < 3: + raise ValueError("VERSION must have at least three elements") -print(__version__) \ No newline at end of file +__version__ = ".".join(str(v) for v in VERSION[:3]) +if len(VERSION) > 3: + __version__ += "-" + ".".join(str(v) for v in VERSION[3:]) +# version_str += "-dev" +print(__version__) diff --git a/transformerx/layers/addnorm.py b/transformerx/layers/addnorm.py index 403e6db..daae09f 100644 --- a/transformerx/layers/addnorm.py +++ b/transformerx/layers/addnorm.py @@ -46,7 +46,7 @@ class AddNorm(tf.keras.layers.Layer): Examples -------- >>> x = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) - >>> y = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) + >>> y = tf.constant(np.arange(10).reshape(5, 2) * 11, dtype=tf.float32) >>> print(x) tf.Tensor( [[ 0. 10.] @@ -56,14 +56,14 @@ class AddNorm(tf.keras.layers.Layer): [80. 90.]], shape=(5, 2), dtype=float32) >>> addnorm = AddNorm(norm_type='layer', norm_eps=1e-6, dropout_rate=0.2, activation='relu') - >>> output = addnorm([x, y]) + >>> output = addnorm(x, y) >>> print(output) tf.Tensor( - [[0. 0. ] - [4.1565704 3.2312596] - [9.174077 8.174077 ] - [14.191582 13.116871 ] - [19.209087 18.134377 ]], shape=(5, 2), dtype=float32) + [[0. 1.] + [0. 1.] + [0. 1.] + [0. 1.] + [0. 
1.]], shape=(5, 2), dtype=float32) References ---------- @@ -99,7 +99,7 @@ def __init__( if dropout_rate >= 1: raise ValueError("Dropout rate must be less than 1") - self.dropout = tf.keras.layers.Dropout(dropout_rate) + self.dropout = tf.keras.layers.Dropout(self.dropout_rate) # Regularizers self.kernel_regularizer = kernel_regularizer self.bias_regularizer = bias_regularizer @@ -154,10 +154,10 @@ def call(self, x: tf.Tensor, residual: tf.Tensor, **kwargs): ) # Apply dropout - residual = self.dropout(residual, training=kwargs.get("training", False)) + # residual = self.dropout(residual) # Add residual connection - x = tf.keras.layers.Add()([x, residual]) + x = tf.add(x, residual) # Apply normalization x = self.norm_layer(x) @@ -184,12 +184,12 @@ def get_config(self): return config -if __name__ == "__main__": - X = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) - Y = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) - - addnorm = AddNorm( - norm_type="layer", norm_eps=1e-6, dropout_rate=0.2, activation="relu" - ) - output = addnorm(X, X) - print(output) +# if __name__ == "__main__": +# x = tf.constant(np.arange(10).reshape(5, 2) * 10, dtype=tf.float32) +# y = tf.constant(np.arange(10, 20).reshape(5, 2) * 13, dtype=tf.float32) +# +# addnorm = AddNorm( +# norm_type="layer", norm_eps=1e-6, dropout_rate=0.2, activation="relu" +# ) +# output = addnorm(x, y) +# print(output) diff --git a/transformerx/layers/dot_product_attention.py b/transformerx/layers/dot_product_attention.py index b944303..0f68bda 100644 --- a/transformerx/layers/dot_product_attention.py +++ b/transformerx/layers/dot_product_attention.py @@ -1,8 +1,8 @@ -import numpy as np import tensorflow as tf from transformerx.layers.masks.global_attention_mask import GlobalAttentionMask from transformerx.utils import masked_softmax +from transformerx.layers.masks import LookAheadMask class DotProductAttention(tf.keras.layers.Layer): @@ -25,60 +25,63 @@ class DotProductAttention(tf.keras.layers.Layer): Notes ----- Dot-product attention formulation is as following: - .. math:: Attention(Q, K, V) = softmax(Q K^T) V + + .. math:: + Attention(Q, K, V) = softmax(Q K^T) V And scaled dot-product attention [1]_ is formulated as: - ..math:: Attention(Q, K, V) = softmax(\frac{QK^T}{\sqrt{d_k}})V + .. math:: + Attention(Q, K, V) = softmax(\\frac{QK^T}{\\sqrt{d_k}}) V Examples -------- Scaled dot-product (scaled multiplicative) self-attention of tensor `x` (we feed `x` to queries, keys, and values). 
- - >>> x = tf.cast(np.random.random([2, 3, 2]), dtype=tf.float32) + >>> tf.random.set_seed(1) + >>> x = tf.cast(tf.random.uniform([2, 3, 2]), dtype=tf.float32) >>> print(x) tf.Tensor( - [[[0.5418388 0.23626359] - [0.4220487 0.394948 ] - [0.6125364 0.12296485]] - - [[0.17872103 0.5700011 ] - [0.28264287 0.02290592] - [0.24536102 0.39220297]]], shape=(2, 3, 2), dtype=float32) #random + [[[0.16513085 0.9014813 ] + [0.6309742 0.4345461 ] + [0.29193902 0.64250207]] + + [[0.9757855 0.43509948] + [0.6601019 0.60489583] + [0.6366315 0.6144488 ]]], shape=(2, 3, 2), dtype=float32) >>> dot_product = DotProductAttention(0.2) >>> queries, keys, values = x, x, x - >>> output = dot_product(queries, keys, values) + >>> output, attn_weights = dot_product(queries, keys, values) >>> print(output) tf.Tensor( - [[[0.45955482 0.63378114] - [0.48054144 0.62751293] - [0.43684354 0.64026886]] - - [[0.82063836 0.2958246 ] - [0.8300792 0.30486548] - [0.83300924 0.30762452]]], shape=(2, 3, 2), dtype=float32) + [[[0.34450796 0.6787753 ] + [0.36907017 0.65472305] + [0.35440704 0.66882825]] + + [[0.77042043 0.5446019 ] + [0.7632908 0.5484005 ] + [0.7627964 0.5486638 ]]], shape=(2, 3, 2), dtype=float32) The next example shows the dot-product (multiplicative) self-attention of tensor `x`. >>> dot_product = DotProductAttention(dropout_rate=0.1, scaled=False) - >>> output = dot_product(queries, keys, values) + >>> output, attn_weights = dot_product(queries, keys, values) >>> print(output) tf.Tensor( - [[[0.5195807 0.6383675 ] - [0.49765232 0.6440835 ] - [0.5132934 0.64001364]] - - [[0.6074392 0.80120546] - [0.6098373 0.80074203] - [0.5967663 0.7891044 ]]], shape=(2, 3, 2), dtype=float32) + [[[0.33704066 0.6868143 ] + [0.37176722 0.6526886 ] + [0.35094902 0.6727435 ]] + + [[0.7759446 0.54165894] + [0.7657266 0.54710305] + [0.7650213 0.5474789 ]]], shape=(2, 3, 2), dtype=float32) References ---------- .. [1] A. Vaswani, N. Shazeer, N. Parmar, J. Uszkoreit, L. Jones, A. N. Gomez, L. Kaiser, I. Polosukhin, Attention - is all you need, in: NIPS, pp. 5998–6008. + is all you need, in: NIPS, pp. 5998–6008. """ def __init__( @@ -135,19 +138,22 @@ def call( # apply causal mask if self.causal_mask: + # Obsolete version of masking. 
To be removed in the upcoming updates # seq_len = tf.shape(queries)[2] # heads = tf.shape(queries)[1] - batch_size, num_heads, seq_len, _ = tf.unstack(tf.shape(queries)) - causal_mask = tf.ones((num_heads, seq_len)) * -1e9 - causal_mask = tf.linalg.LinearOperatorLowerTriangular( - causal_mask - ).to_dense() - causal_mask = tf.expand_dims(causal_mask, axis=0) # add batch dimension - causal_mask = tf.broadcast_to( - tf.expand_dims(causal_mask, -1), tf.shape(scores) - ) # broadcast across batch dimension - # scores += - scores = scores + causal_mask + # batch_size, num_heads, seq_len, _ = tf.unstack(tf.shape(queries)) + # causal_mask = tf.ones((num_heads, seq_len)) * -1e9 + # causal_mask = tf.linalg.LinearOperatorLowerTriangular( + # causal_mask + # ).to_dense() + # causal_mask = tf.expand_dims(causal_mask, axis=0) # add batch dimension + # causal_mask = tf.broadcast_to( + # tf.expand_dims(causal_mask, -1), tf.shape(scores) + # ) # broadcast across batch dimension + + # New version of masking + look_ahead_mask = LookAheadMask() + scores = look_ahead_mask(scores) # to be uncommented later # apply global mask @@ -160,7 +166,6 @@ def call( # self.attention_weights = tf.nn.softmax(scores, axis=-1, mask=attention_mask) # scores = tf.matmul(self.dropout(self.attention_weights, **kwargs), values) attention_output = tf.matmul(self.dropout(self.attention_weights), values) - return attention_output, self.attention_weights def get_attention_weights(self): diff --git a/transformerx/layers/masks/__init__.py b/transformerx/layers/masks/__init__.py index e69de29..80c100d 100644 --- a/transformerx/layers/masks/__init__.py +++ b/transformerx/layers/masks/__init__.py @@ -0,0 +1 @@ +from .base import LookAheadMask diff --git a/transformerx/layers/masks/base.py b/transformerx/layers/masks/base.py new file mode 100644 index 0000000..a2c597d --- /dev/null +++ b/transformerx/layers/masks/base.py @@ -0,0 +1,125 @@ +import tensorflow as tf + + +class BaseMask(tf.keras.layers.Layer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def build_mask(self, inputs): + raise NotImplementedError("Subclasses must implement build_mask method") + + def call(self, inputs, *args, **kwargs): + if tf.shape(inputs).shape == 4: + pass + elif tf.shape(inputs).shape == 3: + inputs = tf.expand_dims(inputs, axis=1) + else: + raise ValueError(f"Invalid input shape. Expected 3D or 4D tensors, but received {len(inputs.shape)}D.")
+ mask = self.build_mask(inputs) + return tf.add(inputs, mask * -1e9) + + +class LookAheadMask(BaseMask): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def build_mask(self, inputs): + input_shape = tf.shape(inputs) + if input_shape.shape == 4: + print("input shape: ", input_shape) + k_seq_len = input_shape[3] + q_seq_len = input_shape[2] + + # mask = 1 - tf.linalg.band_part(tf.ones((q_seq_len, k_seq_len)), -1, 0) + mask = ( + 1 + - tf.linalg.LinearOperatorLowerTriangular( + tf.ones((q_seq_len, k_seq_len)), -1, 0 + ).to_dense() + ) + return mask + + +class PaddingMask(BaseMask): + def __init__(self, padding_value=0, multi_head=True, **kwargs): + super().__init__(**kwargs) + self.padding_value = padding_value + self.multi_head = multi_head + + def build_mask(self, inputs): + mask = tf.cast(tf.math.equal(inputs, self.padding_value), tf.float32) + return mask + + +class PaddingMaskNew(tf.keras.layers.Layer): + def __init__(self, multi_head=True, padding_value=0, **kwargs): + super(PaddingMaskNew, self).__init__(**kwargs) + self.multi_head = multi_head + self.padding_value = padding_value + + def build(self, input_shape): + pass + + def call(self, inputs): + seq = tf.cast(tf.math.equal(inputs, self.padding_value), tf.float32) + seq = tf.expand_dims(seq, axis=1) + if self.multi_head: + seq = tf.expand_dims(seq, axis=1) + return seq + + def get_config(self): + config = super(PaddingMaskNew, self).get_config() + config.update({"multi_head": self.multi_head}) + return config + + +if __name__ == "__main__": + from transformerx.layers import DotProductAttention, MultiHeadAttention + + input_tensor = tf.random.uniform((2, 4, 6)) + q_input_tensor = tf.random.uniform((2, 4, 6)) + attn_o, attn_w = DotProductAttention()(q_input_tensor, q_input_tensor, input_tensor) + + print("attn_w.shape: ", attn_w.shape) + la_mask = LookAheadMask() + output_tensor = la_mask(attn_w) + print(output_tensor.shape, output_tensor) + + multihead_attn = MultiHeadAttention(d_model=32, num_heads=4, dropout_rate=0.1) + output, attn_w = multihead_attn(q_input_tensor, input_tensor, input_tensor) + + sample_input = tf.random.uniform((1, 1, 4, 2)) + # output_tensor = la_mask(attn_w) + output_tensor = la_mask(sample_input) + print(output_tensor.shape, output_tensor) + + data = [[1, 2, 3], [4, 5], [6, 7, 8, 9]] + # Create a 2D tensor + data = tf.constant([[1, 2, 3], [4, 5, 6]]) + + # Convert the dataset to a tensor + # data_tensor = tf.constant(data, dtype=tf.float32) + + # Create a SequencePadding layer + # sequence_padding_layer = PaddingLayer(0, 4) + + # padded_data = sequence_padding_layer(data) + + # Test input + input_tensor = tf.constant( + [ + [[1, 2, 0], [4, 5, 6], [7, 8, 9], [0, 0, 0]], + [[1, 2, 3], [4, 5, 0], [0, 0, 0], [0, 0, 0]], + ], + dtype=tf.float32, + ) + + # Create a PaddingMask layer + padding_mask_layer = PaddingMask() + + # Generate the padding mask + # padding_mask = padding_mask_layer(input_tensor) + # print(padding_mask.shape, padding_mask) + + lad_mask = la_mask(input_tensor) + # print(lad_mask.shape, lad_mask) diff --git a/transformerx/layers/multihead_attention.py b/transformerx/layers/multihead_attention.py index c3fa618..b7cf77a 100644 --- a/transformerx/layers/multihead_attention.py +++ b/transformerx/layers/multihead_attention.py @@ -1,4 +1,3 @@ -import numpy as np import tensorflow as tf from einops import rearrange @@ -89,7 +88,9 @@ class MultiHeadAttention(tf.keras.layers.Layer): Returns ------- output: - Concatenated tensors + Concatenated tensors. Same shape as the queries.
+ attention_weights: + Optional tensor of attention weights. Methods ------- @@ -102,38 +103,50 @@ class MultiHeadAttention(tf.keras.layers.Layer): Examples -------- - >>> x = tf.constant(np.random.random([2, 3, 2]), dtype=tf.float32) - >>> multihead = MultiHeadAttention(d_model=8) - >>> print(multihead) - <__main__.MultiHeadAttention object at 0x7ff83c16bb80> + >>> import tensorflow as tf + >>> import random + >>> tf.random.set_seed(1) + >>> random.seed(42) - >>> output = multihead(x, x, x) + + >>> x = tf.constant(tf.random.uniform([2, 3, 2]), dtype=tf.float32) + >>> multihead = MultiHeadAttention(d_model=8, dropout_rate=0) + >>> print(type(multihead)) + + + >>> output, attn_weights = multihead(x, x, x) >>> print(output) tf.Tensor( - [[[ 0.2051548 0.32050014 0.2915167 -0.04056092 0.12072253 - 0.06477361 0.18725544 0.02056682] - [ 0.19823116 0.2983173 0.27711272 -0.04071879 0.11172265 - 0.06080601 0.18654731 0.00577436] - [ 0.19831955 0.30106473 0.27666807 -0.03963682 0.11234044 - 0.0615251 0.18657821 0.00680977]] - [[ 0.14630345 0.21267754 0.26289055 -0.10759152 0.03963668 - 0.04118761 0.11257525 0.05869889] - [ 0.14556082 0.21070784 0.26139364 -0.10755821 0.03894955 - 0.04060047 0.11260018 0.05745776] - [ 0.14547291 0.21081978 0.26109838 -0.10745162 0.03889 - 0.04069766 0.11251941 0.05741404]]], shape=(2, 3, 8), dtype=float32) - - >>> attention = MultiHeadAttention(d_model=16, num_heads=4, dropout=0.1) - >>> queries = tf.random.normal((3, 10, 16)) + [[[ 0.27276292 -0.2744614 -0.06085328 -0.03441356 -0.1577001 + 0.33375 -0.7894692 -0.33158925] + [ 0.2792416 -0.27180034 -0.06341933 -0.02869054 -0.15612581 + 0.33674437 -0.7850623 -0.3237151 ] + [ 0.274466 -0.27393326 -0.06170867 -0.03307929 -0.15757665 + 0.33440444 -0.78846383 -0.3293347 ]] + + [[ 0.44330204 -0.14170787 -0.1372787 0.3109271 -0.30478996 + 0.47728932 -0.8789958 -0.3304574 ] + [ 0.44153026 -0.14282975 -0.13679348 0.30881953 -0.30498797 + 0.476456 -0.8804113 -0.33254212] + [ 0.44139963 -0.14291355 -0.13675913 0.30866385 -0.3050046 + 0.4763937 -0.88051784 -0.3326969 ]]], shape=(2, 3, 8), dtype=float32) + + + + + + >>> tf.random.set_seed(1) + >>> attention = MultiHeadAttention(d_model=16, num_heads=4, dropout_rate=0.1) + >>> queries = tf.random.normal((3, 20, 16)) >>> keys = tf.random.normal((3, 20, 16)) >>> values = tf.random.normal((3, 20, 16)) - >>> valid_lens = tf.constant([10, 15, 20]) - >>> output, _ = attention(queries, keys, values, valid_lens) - >>> output.shape - (3, 10, 16) + >>> valid_lens = tf.constant([3, 20]) + >>> output, _ = attention(queries, keys, values) + >>> print(output.shape) + (3, 20, 16) - >>> window_mask = tf.ones((3, 10, 20)) - >>> output, _ = attention(queries, keys, values, valid_lens, window_mask=window_mask) + >>> window_mask = tf.ones((3, 10)) + >>> output, _ = attention(queries, keys, values, attention_mask=window_mask) >>> output.shape (3, 10, 16) @@ -285,7 +298,7 @@ def call( >>> values = tf.random.normal([batch_size, no_of_key_value_pairs, depth]) >>> valid_lens = tf.random.uniform([batch_size], minval=0, maxval=no_of_queries, dtype=tf.int32) - >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout=dropout) + >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout_rate=dropout) >>> output, attention_weights = multihead_attn(queries, keys, values, valid_lens) Here is an example of how to use the call method with a window mask: @@ -298,7 +311,7 @@ def call( >>> valid_lens = tf.random.uniform([batch_size], minval=0, maxval=no_of_queries, 
dtype=tf.int32) >>> window_mask = tf.random.uniform([batch_size, no_of_queries, no_of_key_value_pairs], 0, 2, dtype=tf.int32) - >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout=dropout) + >>> multihead_attn = MultiHeadAttention(d_model=depth, num_heads=num_heads, dropout_rate=dropout) >>> output, attention_weights = multihead_attn(queries, keys, values, valid_lens, window_mask) """ @@ -320,6 +333,7 @@ def call( # Shape of output: (batch_size * num_heads, no. of queries, # depth / num_heads) + print("multihead q: ", queries.shape) attention_output, attention_weights = self.attention( queries, keys, values, attention_mask, **kwargs ) diff --git a/transformerx/layers/transformer_decoder.py b/transformerx/layers/transformer_decoder.py index b570341..1277bd2 100644 --- a/transformerx/layers/transformer_decoder.py +++ b/transformerx/layers/transformer_decoder.py @@ -270,10 +270,9 @@ def apply_positional_embedding(self, inputs=None, **kwargs): ) def call(self, queries, keys, values, attention_mask=None, **kwargs): - queries = self.apply_positional_embedding(queries, **kwargs) + blk_outputs = self.apply_positional_embedding(queries, **kwargs) # keys = self.apply_positional_embedding(keys, **kwargs) # values = self.apply_positional_embedding(values, **kwargs) - blk_outputs = queries # self.attention_weights = [None] * len(self.blocks) self.attention_weights = [] for i, blk in enumerate(self.blocks): diff --git a/transformerx/utils.py b/transformerx/utils.py index 91f14a2..50e35be 100644 --- a/transformerx/utils.py +++ b/transformerx/utils.py @@ -8,8 +8,8 @@ def sequence_mask(X, attention_mask, value=-1e9): raise TypeError("X must be a Tensor") if not isinstance(attention_mask, tf.Tensor): raise TypeError("attention_mask must be a Tensor") - if len(X.shape) not in (2, 3): - raise ValueError("X must be a 2D or 3D tensor") + if len(X.shape) not in (2, 3, 4): + raise ValueError("X must be a 2D, 3D, or 4D tensor") if len(attention_mask.shape) not in (1, 2): raise ValueError("attention_mask must be a 1D or 2D tensor") @@ -18,8 +18,10 @@ def sequence_mask(X, attention_mask, value=-1e9): mask = tf.range(start=0, limit=maxlen, dtype=tf.float32)[None, :] < tf.cast( attention_mask, dtype=tf.float32 ) + print("mask.shape: ", mask.shape, attention_mask.shape, X.shape) else: maxlen = X.shape[0] + print("attention_mask.shape: ", attention_mask.shape, X.shape) mask = tf.range(start=0, limit=maxlen, dtype=tf.float32) < tf.cast( attention_mask, dtype=tf.float32 )
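
A minimal, self-contained sketch of the look-ahead (causal) masking behaviour that the new LookAheadMask layer in transformerx/layers/masks/base.py provides, and that DotProductAttention now delegates to when causal_mask=True. It uses only core TensorFlow ops rather than the library itself, so the tensor shapes and variable names below are illustrative assumptions, not part of the diff; the strictly upper-triangular mask and the -1e9 additive bias mirror the implementation above.

import tensorflow as tf

# Raw attention scores shaped (batch, heads, q_len, k_len), as produced before softmax.
batch, heads, q_len, k_len = 1, 1, 4, 4
scores = tf.random.uniform((batch, heads, q_len, k_len))

# Ones strictly above the diagonal mark the "future" key positions for each query.
future = 1.0 - tf.linalg.band_part(tf.ones((q_len, k_len)), -1, 0)

# Adding a large negative bias at those positions drives their softmax weight to ~0.
masked_scores = scores + future * -1e9  # broadcasts over the batch and head axes

weights = tf.nn.softmax(masked_scores, axis=-1)
print(weights[0, 0])  # row i has non-zero weight only for keys j <= i

Keeping the mask additive, as the diff does, means it can be applied to the score tensor before the existing softmax/dropout path without changing any tensor shapes.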