From b1413a6c7fbfcbe7036a12b788f0cdfce729a105 Mon Sep 17 00:00:00 2001
From: Marius Brehler <marius.brehler@amd.com>
Date: Tue, 1 Oct 2024 19:12:11 +0200
Subject: [PATCH 1/5] Update instructions on creating a virtual env (#3724)

The `python` command is only available on Ubuntu if the
`python-is-python3` package is installed, see
https://packages.ubuntu.com/jammy/python-is-python3 and
https://packages.ubuntu.com/jammy/all/python-is-python3/filelist. As
Python 2 isn't supported anyway, it's safe to point to `python3` here
instead.
---
 docs/development.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/development.md b/docs/development.md
index d785b7ebc09d..4c70af129383 100644
--- a/docs/development.md
+++ b/docs/development.md
@@ -14,7 +14,7 @@ While this is running, you can already setup the Python venv and dependencies in
 ## Setup your Python VirtualEnvironment and Dependencies
 
 ```shell
-python -m venv mlir_venv
+python3 -m venv mlir_venv
 source mlir_venv/bin/activate
 # Some older pip installs may not be able to handle the recent PyTorch deps
 python -m pip install --upgrade pip

From 617c1c76ce4d0410e2318dbd25d69c68db45388c Mon Sep 17 00:00:00 2001
From: Prathamesh Tagore <63031630+meshtag@users.noreply.github.com>
Date: Wed, 2 Oct 2024 18:25:54 +0530
Subject: [PATCH 2/5] [torch.bind_symbolic_shape] Fix verifier for shapeSymbol
 detection (#3751)

The op can be valid with no attached shape symbols if the corresponding
affine map does not require any. Fix the verifier to instead check that
the number of shape symbol operands equals the number of symbols in the
attached affine map.
---
 lib/Dialect/Torch/IR/TorchOps.cpp |  7 +++++--
 test/Dialect/Torch/invalid.mlir   | 10 +++++++++-
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/lib/Dialect/Torch/IR/TorchOps.cpp b/lib/Dialect/Torch/IR/TorchOps.cpp
index bed228671de1..e10564bbe26b 100644
--- a/lib/Dialect/Torch/IR/TorchOps.cpp
+++ b/lib/Dialect/Torch/IR/TorchOps.cpp
@@ -5405,8 +5405,11 @@ void BindSymbolicShapeOp::print(OpAsmPrinter &p) {
 }
 
 LogicalResult BindSymbolicShapeOp::verify() {
-  if (getShapeSymbols().empty())
-    return emitOpError() << "requires non-empty shapeSymbols";
+  if (getShapeSymbols().size() !=
+      getShapeExpressions().getValue().getNumSymbols())
+    return emitOpError()
+           << "requires equal number of shape symbol args and symbol args to "
+              "the attached affine map, since they are 1:1 mapped";
 
   for (auto symbol : getShapeSymbols()) {
     Operation *definingOp = symbol.getDefiningOp();
diff --git a/test/Dialect/Torch/invalid.mlir b/test/Dialect/Torch/invalid.mlir
index 5b732788faef..8f38c66ad154 100644
--- a/test/Dialect/Torch/invalid.mlir
+++ b/test/Dialect/Torch/invalid.mlir
@@ -381,13 +381,21 @@ func.func private @tensor.sparse() -> !torch.vtensor<[64,64],f32,12345>
 
 func.func @torch.symbolic_int$no_shape_symbols(%arg0: !torch.vtensor<[?],f32>) -> !torch.vtensor<[?],f32> {
   %0 = torch.symbolic_int "s0" {min_val = 3, max_val = 6} : !torch.int
-  // expected-error @+1 {{op requires non-empty shapeSymbols}}
+  // expected-error @+1 {{op requires equal number of shape symbol args and symbol args to the attached affine map, since they are 1:1 mapped}}
   torch.bind_symbolic_shape %arg0, [], affine_map<()[s0] -> (s0)> : !torch.vtensor<[?],f32>
   return %arg0 : !torch.vtensor<[?],f32>
 }
 
 // -----
 
+// Verifier should not fail here since the op does not require shapeSymbols.
+func.func @torch.symbolic_int$no_shape_symbols_no_symbols_in_map(%arg0: !torch.vtensor<[?],f32>) -> !torch.vtensor<[?],f32> {
+  torch.bind_symbolic_shape %arg0, [], affine_map<()[] -> (1)> : !torch.vtensor<[?],f32>
+  return %arg0 : !torch.vtensor<[?],f32>
+}
+
+// -----
+
 func.func @torch.symbolic_int$no_shape_symbols(%arg0: !torch.vtensor<[?],f32>) -> !torch.vtensor<[?],f32> {
   %int0 = torch.constant.int 0
   // expected-error @+1 {{shape symbol must be produced by a SymbolicIntOp}}

From a2bfe47faa7480259915343a762958e4ae25c501 Mon Sep 17 00:00:00 2001
From: Samu Tamminen <7460037+samutamm@users.noreply.github.com>
Date: Wed, 2 Oct 2024 15:17:58 +0200
Subject: [PATCH 3/5] [onnx] Add IDF and TFIDF modes to TFIDF Vectorizer
 (#3726)

Address https://github.com/nod-ai/SHARK-Turbine/issues/833
---
 .../Conversion/TorchOnnxToTorch/Patterns.h    | 25 +++++++++++++
 .../TorchOnnxToTorch/DefaultDomainQtoZ.cpp    | 37 +++++++++++++++++--
 2 files changed, 58 insertions(+), 4 deletions(-)

diff --git a/include/torch-mlir/Conversion/TorchOnnxToTorch/Patterns.h b/include/torch-mlir/Conversion/TorchOnnxToTorch/Patterns.h
index 1cf4df932f69..f71deaff2efa 100644
--- a/include/torch-mlir/Conversion/TorchOnnxToTorch/Patterns.h
+++ b/include/torch-mlir/Conversion/TorchOnnxToTorch/Patterns.h
@@ -338,6 +338,31 @@ struct OpBinder {
     return failure();
   }
 
+  ParseResult f32FloatArrayAttr(llvm::SmallVector<float> &values,
+                                StringRef nameSuffix,
+                                ArrayRef<float> defaults) {
+    SmallString<64> name("torch.onnx."); // key is "torch.onnx." + suffix
+    name.append(nameSuffix);
+    auto attr = op->getAttr(name);
+    if (!attr) { // attribute absent: fall back to the supplied defaults
+      values.append(defaults.begin(), defaults.end());
+      return success();
+    }
+    if (auto arrayAttr = dyn_cast<ArrayAttr>(attr)) {
+      for (auto element : arrayAttr) {
+        auto floatAttr = dyn_cast<FloatAttr>(element);
+        if (!floatAttr) // reject arrays holding non-float elements
+          return failure();
+        FloatType t = cast<FloatType>(floatAttr.getType());
+        if (t.getWidth() != 32) // only 32-bit wide float types accepted
+          return failure();
+        values.push_back(floatAttr.getValue().convertToFloat());
+      }
+      return success(); // earlier failures may leave values part-filled
+    }
+    return failure(); // attribute present but not an ArrayAttr
+  }
+
   ParseResult stringArrayAttr(llvm::SmallVector<std::string> &values,
                               StringRef nameSuffix) {
     SmallString<64> name("torch.onnx.");
diff --git a/lib/Conversion/TorchOnnxToTorch/DefaultDomainQtoZ.cpp b/lib/Conversion/TorchOnnxToTorch/DefaultDomainQtoZ.cpp
index ea5156a0c878..95413b080343 100644
--- a/lib/Conversion/TorchOnnxToTorch/DefaultDomainQtoZ.cpp
+++ b/lib/Conversion/TorchOnnxToTorch/DefaultDomainQtoZ.cpp
@@ -4339,6 +4339,7 @@ void mlir::torch::onnx_c::populateDefaultDomainQtoZ(
         llvm::SmallVector<int64_t> ngram_counts;
         llvm::SmallVector<int64_t> ngram_indexes;
         llvm::SmallVector<int64_t> pool_int64s;
+        llvm::SmallVector<float> weights;
         std::string mode;
         int64_t min_gram_length;
         int64_t max_gram_length;
@@ -4356,9 +4357,10 @@ void mlir::torch::onnx_c::populateDefaultDomainQtoZ(
             binder.tensorOperand(input) || binder.tensorResultType(resultType))
           return failure();
 
-        if (mode != "TF")
-          return rewriter.notifyMatchFailure(binder.op,
-                                             "TF mode supported only");
+        llvm::SmallVector<float> defaultWeights(ngram_indexes.size(), 1.0f);
+        if (binder.f32FloatArrayAttr(weights, "weights", defaultWeights))
+          return failure();
+
         if (pool_int64s.size() == 0)
           return rewriter.notifyMatchFailure(
               binder.op, "pool_int64s empty, only integers supported");
@@ -4584,9 +4586,36 @@ void mlir::torch::onnx_c::populateDefaultDomainQtoZ(
                     binder.getLoc(), loopConditionTrue, ValueRange({count}));
               }
               count = skipLoop.getResult(0);
-              // insert count "tf" into output
               Value countFloat = rewriter.create<Torch::AtenFloatScalarOp>(
                   binder.getLoc(), count);
+              if (mode == "IDF" || mode == "TFIDF") {
+                // both IDF and TFIDF modes use weights
+                float weight = weights[ngram_i];
+                Value constWeight = rewriter.create<Torch::ConstantFloatOp>(
+                    binder.getLoc(), rewriter.getF64FloatAttr(weight));
+
+                // TFIDF: the raw count itself is scaled by the weight.
+                Value multiplier = countFloat;
+                if (mode == "IDF") {
+                  // All the counts larger than 1 would be truncated to 1
+                  // and the i-th element in weights would be used to scale
+                  // (by multiplication) the count of the i-th n-gram in pool.
+
+                  Value intCount = rewriter.create<Torch::AtenIntScalarOp>(
+                      binder.getLoc(), count);
+                  // compare intCount > 0
+                  Value gtZeroCount = rewriter.create<Torch::AtenGtIntOp>(
+                      binder.getLoc(), intCount, zero);
+                  gtZeroCount = rewriter.create<Torch::AtenIntBoolOp>(
+                      binder.getLoc(), gtZeroCount);
+                  Value gtZeroCountFloat =
+                      rewriter.create<Torch::AtenFloatScalarOp>(binder.getLoc(),
+                                                                gtZeroCount);
+                  multiplier = gtZeroCountFloat;
+                }
+                countFloat = rewriter.create<Torch::AtenMulFloatOp>(
+                    binder.getLoc(), multiplier, constWeight);
+              }
               Value dataList = rewriter.create<Torch::PrimListConstructOp>(
                   binder.getLoc(),
                   rewriter.getType<Torch::ListType>(

From f8e4a9a3c2d1946ca0cc09026e6f4b1668e3d91a Mon Sep 17 00:00:00 2001
From: Sambhav Jain <sambhav.jain@getcruise.com>
Date: Wed, 2 Oct 2024 11:52:20 -0700
Subject: [PATCH 4/5] [Release] Fix binary name for downstream compatibility
 (#3752)

As of Sep 14, the torch-mlir binary
[wheels](https://github.com/llvm/torch-mlir-release/releases/tag/dev-wheels)
got renamed to `torch-mlir-core` from `torch-mlir`:
![image](https://github.com/user-attachments/assets/152e4977-71ef-4f57-8757-6dc75f72b670)

This was an unintended side effect of the recent change to
`TORCH_MLIR_ENABLE_ONLY_MLIR_PYTHON_BINDINGS=True`
(https://github.com/llvm/torch-mlir/pull/3711), which skips setting
`NAME = "torch-mlir"` in
[setup.py](https://github.com/llvm/torch-mlir/blob/main/setup.py#L226-L232).

To avoid having multiple downstreams fix their pip deps, this change
allows using the same `torch-mlir` name for binaries, and reserves a
separate `torch-mlir-ext` name for the (less popular) binaries with
extensions enabled.
---
 .../python_deploy/build_linux_packages.sh     | 32 ++++++++++---------
 .../python_deploy/build_macos_packages.sh     | 14 ++++----
 setup.py                                      |  4 +--
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/build_tools/python_deploy/build_linux_packages.sh b/build_tools/python_deploy/build_linux_packages.sh
index 4f80d3167d74..aa687bab447c 100755
--- a/build_tools/python_deploy/build_linux_packages.sh
+++ b/build_tools/python_deploy/build_linux_packages.sh
@@ -50,7 +50,7 @@ TM_PYTHON_VERSIONS="${TM_PYTHON_VERSIONS:-cp38-cp38 cp310-cp310 cp311-cp311}"
 # Location to store Release wheels
 TM_OUTPUT_DIR="${TM_OUTPUT_DIR:-${this_dir}/wheelhouse}"
 # What "packages to build"
-TM_PACKAGES="${TM_PACKAGES:-torch-mlir torch-mlir-core}"
+TM_PACKAGES="${TM_PACKAGES:-torch-mlir torch-mlir-ext}"
 # Use pre-built Pytorch
 TM_USE_PYTORCH_BINARY="${TM_USE_PYTORCH_BINARY:-ON}"
 # Skip running tests if you want quick iteration
@@ -83,12 +83,12 @@ function run_on_host() {
   fi
   mkdir -p "${TM_OUTPUT_DIR}"
   case "$package" in
-    torch-mlir)
+    torch-mlir-ext)
       TM_CURRENT_DOCKER_IMAGE=${TM_RELEASE_DOCKER_IMAGE}
       export USERID=0
       export GROUPID=0
       ;;
-    torch-mlir-core)
+    torch-mlir)
       TM_CURRENT_DOCKER_IMAGE=${TM_RELEASE_DOCKER_IMAGE}
       export USERID=0
       export GROUPID=0
@@ -158,22 +158,22 @@ function run_in_docker() {
       export PATH=$python_dir/bin:$orig_path
       echo ":::: Python version $(python3 --version)"
       case "$package" in
-        torch-mlir)
-          clean_wheels torch_mlir "$python_version"
-          build_torch_mlir "$TM_TORCH_VERSION"
+        torch-mlir-ext)
+          clean_wheels torch_mlir_ext "$python_version"
+          build_torch_mlir_ext "$TM_TORCH_VERSION"
 
           # Disable audit wheel until we can fix ODR torch issues.  See
           # https://github.com/llvm/torch-mlir/issues/1709
           #
-          #run_audit_wheel torch_mlir "$python_version"
+          #run_audit_wheel torch_mlir_ext "$python_version"
 
-          clean_build torch_mlir "$python_version"
+          clean_build torch_mlir_ext "$python_version"
           ;;
-        torch-mlir-core)
-          clean_wheels torch_mlir_core "$python_version"
-          build_torch_mlir_core
-          run_audit_wheel torch_mlir_core "$python_version"
-          clean_build torch_mlir_core "$python_version"
+        torch-mlir)
+          clean_wheels torch_mlir "$python_version"
+          build_torch_mlir
+          run_audit_wheel torch_mlir "$python_version"
+          clean_build torch_mlir "$python_version"
           ;;
         out-of-tree)
           setup_venv "$python_version" "$TM_TORCH_VERSION"
@@ -431,7 +431,7 @@ function clean_build() {
   rm -rf /main_checkout/torch-mlir/build /main_checkout/torch-mlir/llvm-build /main_checkout/torch-mlir/docker_venv  /main_checkout/torch-mlir/libtorch
 }
 
-function build_torch_mlir() {
+function build_torch_mlir_ext() {
   # Disable LTC build for releases
   export TORCH_MLIR_ENABLE_LTC=0
   local torch_version="$1"
@@ -470,7 +470,9 @@ function run_audit_wheel() {
   rm "$generic_wheel"
 }
 
-function build_torch_mlir_core() {
+function build_torch_mlir() {
+  # Disable LTC build for releases
+  export TORCH_MLIR_ENABLE_LTC=0
   python -m pip install --no-cache-dir -r /main_checkout/torch-mlir/build-requirements.txt
   CMAKE_GENERATOR=Ninja \
   TORCH_MLIR_PYTHON_PACKAGE_VERSION=${TORCH_MLIR_PYTHON_PACKAGE_VERSION} \
diff --git a/build_tools/python_deploy/build_macos_packages.sh b/build_tools/python_deploy/build_macos_packages.sh
index c6fb3a4d209a..5b4b2031cdc5 100755
--- a/build_tools/python_deploy/build_macos_packages.sh
+++ b/build_tools/python_deploy/build_macos_packages.sh
@@ -56,16 +56,16 @@ function run() {
       export PATH=$python_dir/bin:$orig_path
       echo ":::: Python version $(python3 --version)"
       case "$package" in
+        torch-mlir-ext)
+          clean_wheels torch_mlir_ext "$python_version"
+          build_torch_mlir_ext torch_mlir_ext "$python_version"
+          run_audit_wheel torch_mlir_ext "$python_version"
+          ;;
         torch-mlir)
           clean_wheels torch_mlir "$python_version"
           build_torch_mlir torch_mlir "$python_version"
           run_audit_wheel torch_mlir "$python_version"
           ;;
-        torch-mlir-core)
-          clean_wheels torch_mlir_core "$python_version"
-          build_torch_mlir_core torch_mlir_core "$python_version"
-          run_audit_wheel torch_mlir_core "$python_version"
-          ;;
         *)
           echo "Unrecognized package '$package'"
           exit 1
@@ -75,7 +75,7 @@ function run() {
   done
 }
 
-function build_torch_mlir() {
+function build_torch_mlir_ext() {
   local wheel_basename="$1"
   local python_version="$2"
   rm -rf "$output_dir"/build_venv
@@ -93,7 +93,7 @@ function build_torch_mlir() {
   rm -rf "$output_dir"/build_venv
 }
 
-function build_torch_mlir_core() {
+function build_torch_mlir() {
   local wheel_basename="$1"
   local python_version="$2"
   rm -rf "$output_dir"/build_venv
diff --git a/setup.py b/setup.py
index 71491affb988..d62f08073b58 100644
--- a/setup.py
+++ b/setup.py
@@ -223,13 +223,13 @@ def build_extension(self, ext):
 EXT_MODULES = [
     CMakeExtension("torch_mlir._mlir_libs._torchMlir"),
 ]
-NAME = "torch-mlir-core"
+NAME = "torch-mlir"
 
 # If building PyTorch extensions, customize.
 if not TORCH_MLIR_ENABLE_ONLY_MLIR_PYTHON_BINDINGS:
     import torch
 
-    NAME = "torch-mlir"
+    NAME = "torch-mlir-ext"
     INSTALL_REQUIRES.extend(
         [
             f"torch=={torch.__version__}".split("+", 1)[0],

From f0b7ca72f5c8e2694e6b7a6d4d162216d1f40b9c Mon Sep 17 00:00:00 2001
From: Kyle Wang <ec1wng@gmail.com>
Date: Wed, 2 Oct 2024 14:00:19 -0700
Subject: [PATCH 5/5] Fixed GRU quality issues exposed by e2e tests (#3753)

Issue: https://github.com/nod-ai/SHARK-ModelDev/issues/856

Related tests:
![Screenshot 2024-10-01
175305](https://github.com/user-attachments/assets/0dc0901b-058f-427c-a596-9e806fd38836)
---
 .../OnnxRecurrentLayerOpExpanders.cpp         | 84 +++++++++----------
 1 file changed, 42 insertions(+), 42 deletions(-)

diff --git a/lib/Conversion/TorchOnnxToTorch/OnnxRecurrentLayerOpExpanders.cpp b/lib/Conversion/TorchOnnxToTorch/OnnxRecurrentLayerOpExpanders.cpp
index b18cd09f030a..e7ab690e0ff3 100644
--- a/lib/Conversion/TorchOnnxToTorch/OnnxRecurrentLayerOpExpanders.cpp
+++ b/lib/Conversion/TorchOnnxToTorch/OnnxRecurrentLayerOpExpanders.cpp
@@ -1072,11 +1072,10 @@ LogicalResult OnnxGruExpander(OpBinder binder,
   Value cstNone = b.create<ConstantNoneOp>();
   Value cstZero = b.create<ConstantIntOp>(intType, b.getI64IntegerAttr(0));
   Value cstOne = b.create<ConstantIntOp>(intType, b.getI64IntegerAttr(1));
-  Value cstTwo = b.create<ConstantIntOp>(intType, b.getI64IntegerAttr(2));
 
   // Binding arguments
   ValueTensorType yTy, Y_hType;
-  if (binder.tensorResultTypeAtIndex(yTy, 0) ||
+  if (binder.tensorResultTypeAtIndex(yTy, 0) &&
       binder.tensorResultTypeAtIndex(Y_hType, 1)) {
     return rewriter.notifyMatchFailure(binder.op,
                                        "At least one output must be present");
@@ -1132,6 +1131,7 @@ LogicalResult OnnxGruExpander(OpBinder binder,
   // Validations
   auto XShape = xTy.getSizes();
   int64_t batch_size = (layout == 0) ? XShape[1] : XShape[0];
+  int64_t seq_len = (layout == 0) ? XShape[0] : XShape[1];
   int64_t input_size = XShape[2];
 
   std::ostringstream oss;
@@ -1173,6 +1173,10 @@ LogicalResult OnnxGruExpander(OpBinder binder,
     Value cstDtype = getDtypeIntValueForType(rewriter, loc, xTy.getDtype());
     initial_h =
         b.create<AtenZerosOp>(hTy, hShape, cstDtype, cstNone, cstNone, cstNone);
+  } else {
+    if (layout == 1) {
+      initial_h = StaticTranspose(b, initial_h, 0, 1);
+    }
   }
 
   if (binder.tensorOperandAtIndex(sequence_lens, 4))
@@ -1192,10 +1196,10 @@ LogicalResult OnnxGruExpander(OpBinder binder,
   // fill in B
   Value cstXDtype = getDtypeIntValueForType(rewriter, loc, xTy.getDtype());
   if (B == nullptr) {
-    SmallVector<int64_t> BShape = {num_directions, 2 * hidden_size};
+    SmallVector<int64_t> BShape = {num_directions, 6 * hidden_size};
     SmallVector<Value> BShapeListContents = {
         b.create<ConstantIntOp>(intType, b.getI64IntegerAttr(num_directions)),
-        b.create<ConstantIntOp>(intType, b.getI64IntegerAttr(2 * hidden_size))};
+        b.create<ConstantIntOp>(intType, b.getI64IntegerAttr(6 * hidden_size))};
     Value BShapeList = b.create<PrimListConstructOp>(
         b.getType<ListType>(intType), BShapeListContents);
     auto BType = b.getType<ValueTensorType>(BShape, wTy.getDtype());
@@ -1256,51 +1260,47 @@ LogicalResult OnnxGruExpander(OpBinder binder,
                       B_slices[4], B_slices[5]);
 
   // Process inputs based on layout
-  Value X_processed, initial_h_processed;
-  ValueTensorType yTy_processed, Y_hType_processed;
-
-  if (layout == 0) {
-    X_processed = X;
-    initial_h_processed = initial_h_forward;
-    yTy_processed = yTy;
-    Y_hType_processed = Y_hType;
-  } else {
-    X_processed = b.create<AtenTransposeIntOp>(X.getType(), X, cstZero, cstOne);
-    initial_h_processed = b.create<AtenTransposeIntOp>(
-        initial_h.getType(), initial_h_forward, cstZero, cstOne);
-
-    auto yTySizes = yTy.getSizes();
-    auto Y_hTypeSizes = Y_hType.getSizes();
-
-    yTy_processed = b.getType<ValueTensorType>(
-        llvm::SmallVector<int64_t>{yTySizes[1], yTySizes[0], yTySizes[2],
-                                   yTySizes[3]},
-        yTy.getDtype());
-
-    Y_hType_processed = b.getType<ValueTensorType>(
-        llvm::SmallVector<int64_t>{Y_hTypeSizes[1], Y_hTypeSizes[0],
-                                   Y_hTypeSizes[2]},
-        Y_hType.getDtype());
+  if (layout == 1) {
+    X = StaticTranspose(b, X, 0, 1);
   }
 
   // Weights and biases ready. Calling GRU layer to insert the actual ops.
-  GruLayerOutput gruLayerOutput =
-      gru_layer(b, X_processed, initial_h_processed, weights, activations,
-                linear_before_reset);
+  GruLayerOutput gruLayerOutput = gru_layer(b, X, initial_h_forward, weights,
+                                            activations, linear_before_reset);
 
   // Process outputs based on layout
-  Value Y_final, Y_h_final;
-  if (layout == 0) {
-    Y_final = b.create<AtenUnsqueezeOp>(yTy, gruLayerOutput.Y, cstOne);
-    Y_h_final = b.create<AtenUnsqueezeOp>(Y_hType, gruLayerOutput.Y_h, cstZero);
+  Value Y_final;
+  if (binder.tensorResultTypeAtIndex(yTy, 0)) {
+    Y_final = cstNone;
   } else {
-    auto Y_transposed = b.create<AtenTransposeIntOp>(
-        gruLayerOutput.Y.getType(), gruLayerOutput.Y, cstZero, cstOne);
-    Y_final = b.create<AtenUnsqueezeOp>(yTy, Y_transposed, cstTwo);
+    if (layout == 0) {
+      Y_final = b.create<AtenUnsqueezeOp>(yTy, gruLayerOutput.Y, cstOne);
+    } else {
+      Type yTy_original = b.getType<ValueTensorType>(
+          llvm::SmallVector<int64_t>{seq_len, 1, batch_size, hidden_size},
+          yTy.getDtype());
+      Y_final =
+          b.create<AtenUnsqueezeOp>(yTy_original, gruLayerOutput.Y, cstOne);
+      Y_final = StaticTranspose(b, Y_final, 1, 2);
+      Y_final = StaticTranspose(b, Y_final, 0, 1);
+    }
+  }
 
-    auto Y_h_transposed = b.create<AtenTransposeIntOp>(
-        gruLayerOutput.Y_h.getType(), gruLayerOutput.Y_h, cstZero, cstOne);
-    Y_h_final = b.create<AtenUnsqueezeOp>(Y_hType, Y_h_transposed, cstZero);
+  Value Y_h_final;
+  if (binder.tensorResultTypeAtIndex(Y_hType, 1)) {
+    Y_h_final = cstNone;
+  } else {
+    if (layout == 0) {
+      Y_h_final =
+          b.create<AtenUnsqueezeOp>(Y_hType, gruLayerOutput.Y_h, cstZero);
+    } else {
+      Type y_hTy_original = b.getType<ValueTensorType>(
+          llvm::SmallVector<int64_t>{1, batch_size, hidden_size},
+          Y_hType.getDtype());
+      Y_h_final = b.create<AtenUnsqueezeOp>(y_hTy_original, gruLayerOutput.Y_h,
+                                            cstZero);
+      Y_h_final = StaticTranspose(b, Y_h_final, 0, 1);
+    }
   }
 
   rewriter.replaceOp(binder.op, mlir::ValueRange{Y_final, Y_h_final});