Index: mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
===================================================================
--- mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -843,7 +843,8 @@
                        DefaultValuedAttr<BoolArrayAttr, "{}">:$use_full_tile_buffers,
                        UnitAttr:$use_full_tiles_by_default,
                        UnitAttr:$use_alloca,
-                       OptionalAttr<DeviceMappingArrayAttr>:$mapping,
+                       ConfinedAttr<OptionalAttr<DeviceMappingArrayAttr>, [ArrayMaxCount<1>]>:$mapping,
+                       OptionalAttr<DenseI64ArrayAttr>:$copy_permutation,
                        OptionalAttr<I64Attr>:$alignment);
   let results = (outs PDL_Operation:$transformed);
 
Index: mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
===================================================================
--- mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -177,8 +177,8 @@
 /// dimension. If that is not possible, contains the dynamic size of the
 /// subview. The call back should return the buffer to use.
 using AllocBufferCallbackFn = std::function<std::optional<Value>(
-    OpBuilder &b, memref::SubViewOp subView,
-    ArrayRef<Value> boundingSubViewSize, DataLayout &layout)>;
+    OpBuilder &b, Operation *subViewOp, ArrayRef<Value> boundingSubViewSize,
+    DataLayout &layout)>;
 
 /// Callback function type used to deallocate the buffers used to hold the
 /// promoted subview.
@@ -256,6 +256,18 @@
     copyOutFn = copyOut;
     return *this;
   }
+
+  /// Permutation of the dimensions of the promoted views; empty means that
+  /// the views are promoted without being transposed. The values are copied
+  /// into the options so that they never refer to caller-owned storage that
+  /// may be gone by the time the options are used.
+  SmallVector<int64_t> copyPermutation;
+  LinalgPromotionOptions &
+  setCopyPermutation(ArrayRef<int64_t> permutationArrayRef) {
+    copyPermutation.assign(permutationArrayRef.begin(),
+                           permutationArrayRef.end());
+    return *this;
+  }
 };
 
 /// Split Reduction options.
@@ -507,16 +514,21 @@
 /// Create a new buffer using the `allocationFn` provided. The size of this
 /// buffer is the smallest constant bounding size along each dimension that
 /// can be computed for the size of the result of `subView`. Returns the
-/// allocated buffer as `fullLocalView` and the view that matches the size of
-/// the result of subview operation as `partialLocalView`.
+/// allocated buffer as `fullLocalView`, the view that matches the size of
+/// the result of subview operation as `partialLocalView`, and the value the
+/// promoted buffer stands for as `referenceOp`: the original subview, or its
+/// transposition when a non-identity `permutation` is given. An empty
+/// `permutation` is treated as the identity.
 struct PromotionInfo {
   Value fullLocalView;
   Value partialLocalView;
+  Value referenceOp;
 };
 FailureOr<PromotionInfo>
 promoteSubviewAsNewBuffer(OpBuilder &b, Location loc, memref::SubViewOp subView,
                           const AllocBufferCallbackFn &allocationFn,
-                          DataLayout &layout);
+                          DataLayout &layout,
+                          ArrayRef<int64_t> permutation = {});
 
 /// Promote the `subViews` into a new buffer allocated at the insertion point
 /// `b`. Promotion occurs in 3 steps:
@@ -531,8 +541,7 @@
                                     const LinalgPromotionOptions &options);
 
 /// Allocate the subview in the GPU workgroup memory.
-Optional<Value> allocateWorkgroupMemory(OpBuilder &builder,
-                                        memref::SubViewOp subview,
+Optional<Value> allocateWorkgroupMemory(OpBuilder &builder, Operation *subview,
                                         ArrayRef<Value> sizeBounds,
                                         DataLayout &);
 
@@ -544,8 +553,7 @@
 LogicalResult copyToWorkgroupMemory(OpBuilder &b, Value src, Value dst);
 
 /// Allocate the subview in the GPU private memory.
-Optional<Value> allocateGPUPrivateMemory(OpBuilder &builder,
-                                         memref::SubViewOp subview,
+Optional<Value> allocateGPUPrivateMemory(OpBuilder &builder, Operation *subview,
                                          ArrayRef<Value> sizeBounds,
                                          DataLayout &);
 
Index: mlir/include/mlir/IR/OpBase.td
===================================================================
--- mlir/include/mlir/IR/OpBase.td
+++ mlir/include/mlir/IR/OpBase.td
@@ -1694,6 +1694,10 @@
     CPred<"$_self.cast<::mlir::ArrayAttr>().size() >= " # n>,
     "with at least " # n # " elements">;
 
+class ArrayMaxCount<int n> : AttrConstraint<
+    CPred<"$_self.cast<::mlir::ArrayAttr>().size() <= " # n>,
+    "with at most " # n # " elements">;
+
 class ArrayCount<int n> : AttrConstraint<
     CPred<"$_self.cast<::mlir::ArrayAttr>().size() == " #n>,
     "with exactly " # n # " elements">;
Index: mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
===================================================================
--- mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -1859,6 +1859,9 @@
     } else {
       return emitDefaultDefiniteFailure(target);
     }
+    if (getCopyPermutation().has_value()) {
+      promotionOptions.setCopyPermutation(*getCopyPermutation());
+    }
   }
 
   if (failed(promoteSubviewsPrecondition(target, promotionOptions)))
@@ -1869,7 +1872,7 @@
   FailureOr<LinalgOp> res = promoteSubViews(rewriter, target, promotionOptions);
   if (failed(res))
     return emitDefaultDefiniteFailure(target);
-  results.push_back(target);
+  results.push_back(*res);
   return DiagnosedSilenceableFailure::success();
 }
 
Index: mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
===================================================================
--- mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
+++ mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
@@ -80,11 +80,17 @@
 /// memref<..xi8> and return a view to get a memref type of shape
 /// boundingSubViewSize.
 static std::optional<Value> defaultAllocBufferCallBack(
-    const LinalgPromotionOptions &options, OpBuilder &builder,
-    memref::SubViewOp subView, ArrayRef<Value> boundingSubViewSize,
-    std::optional<unsigned> alignment, DataLayout &layout) {
-  ShapedType viewType = subView.getType();
-  ImplicitLocOpBuilder b(subView.getLoc(), builder);
+    const LinalgPromotionOptions &options, OpBuilder &builder, Operation *op,
+    ArrayRef<Value> boundingSubViewSize, std::optional<unsigned> alignment,
+    DataLayout &layout) {
+  auto viewType = llvm::TypeSwitch<Operation *, ShapedType>(op)
+                      .Case<memref::SubViewOp, memref::TransposeOp>(
+                          [](auto casted) { return casted.getType(); })
+                      .Default([](Operation *) { return nullptr; });
+  if (!viewType)
+    return std::nullopt;
+
+  ImplicitLocOpBuilder b(op->getLoc(), builder);
   auto zero = b.createOrFold<arith::ConstantIndexOp>(0);
   auto one = b.createOrFold<arith::ConstantIndexOp>(1);
 
@@ -138,6 +144,7 @@
 
   /// Alignment of promoted buffer.
   std::optional<unsigned> alignment;
+  ArrayRef<int64_t> copyPermutation;
 };
 } // namespace
 
@@ -165,7 +172,7 @@
   if (options.allocationFn) {
     allocationFn = *options.allocationFn;
   } else {
-    allocationFn = [&](OpBuilder &b, memref::SubViewOp subViewOp,
+    allocationFn = [&](OpBuilder &b, Operation *subViewOp,
                        ArrayRef<Value> boundingSubViewSize,
                        DataLayout &layout) -> std::optional<Value> {
       return defaultAllocBufferCallBack(options, b, subViewOp,
@@ -190,6 +197,7 @@
   };
   copyInFn = (options.copyInFn ? *(options.copyInFn) : defaultCopyCallBack);
   copyOutFn = (options.copyOutFn ? *(options.copyOutFn) : defaultCopyCallBack);
+  copyPermutation = options.copyPermutation;
 }
 
 // Performs promotion of a `subView` into a local buffer of the size of the
@@ -197,12 +205,15 @@
 // than the actual size of the `subView` at the boundaries.
 // This is related to the full/partial tile problem.
 // Returns a PromotionInfo containing a `buffer`, `fullLocalView` and
-// `partialLocalView` such that:
+// `partialLocalView` and `referenceOp` such that:
 //   * `buffer` is always the size of the full tile.
 //   * `fullLocalView` is a dense contiguous view into that buffer.
 //   * `partialLocalView` is a dense non-contiguous slice of `fullLocalView`
 //     that corresponds to the size of `subView` and accounting for boundary
 //     effects.
+//   * `referenceOp` is either the original `subView` or the
+//     memref::TransposeOp created from it.
+//
 // The point of the full tile buffer is that constant static tile sizes are
 // folded and result in a buffer type with statically known size and alignment
 // properties.
@@ -211,13 +222,31 @@
 // by a partial `copy` op.
 FailureOr<PromotionInfo> mlir::linalg::promoteSubviewAsNewBuffer(
     OpBuilder &b, Location loc, memref::SubViewOp subView,
-    const AllocBufferCallbackFn &allocationFn, DataLayout &layout) {
+    const AllocBufferCallbackFn &allocationFn, DataLayout &layout,
+    ArrayRef<int64_t> permutation) {
   auto viewType = subView.getType();
   auto rank = viewType.getRank();
-  SmallVector<Value, 4> fullSizes;
+  SmallVector<Value> fullSizes;
   SmallVector<OpFoldResult> partialSizes;
   fullSizes.reserve(rank);
   partialSizes.reserve(rank);
+  // An empty permutation is treated as the identity.
+  AffineMap permutationMap =
+      AffineMap::getMultiDimIdentityMap(rank, b.getContext());
+  if (!permutation.empty()) {
+    // `getPermutationMap` expects a valid permutation, so report anything
+    // that is not a permutation of [0, rank) as a regular promotion failure.
+    if (static_cast<int64_t>(permutation.size()) != rank)
+      return failure();
+    llvm::SmallBitVector seenDims(rank);
+    for (int64_t dim : permutation) {
+      if (dim < 0 || dim >= rank || seenDims.test(dim))
+        return failure();
+      seenDims.set(dim);
+    }
+    permutationMap = AffineMap::getPermutationMap(
+        llvm::to_vector_of<unsigned>(permutation), b.getContext());
+  }
   llvm::SmallBitVector droppedDims = subView.getDroppedDims();
   int64_t resultDimIdx = 0;
   for (const auto &en : llvm::enumerate(subView.getOrCreateRanges(b, loc))) {
@@ -227,16 +247,12 @@
     // Try to extract a tight constant. If the size is known statically, no need
     // to look for the bound.
     LLVM_DEBUG(llvm::dbgs() << "Extract tightest: " << rangeValue.size << "\n");
-    Value size;
-    if (auto attr = rangeValue.size.dyn_cast<Attribute>()) {
-      size = getValueOrCreateConstantIndexOp(b, loc, rangeValue.size);
-    } else {
-      Value materializedSize =
-          getValueOrCreateConstantIndexOp(b, loc, rangeValue.size);
-      FailureOr<int64_t> upperBound =
-          getConstantUpperBoundForIndex(materializedSize);
+    Value size = getValueOrCreateConstantIndexOp(b, loc, rangeValue.size);
+    auto attr = rangeValue.size.dyn_cast<Attribute>();
+    if (!attr) {
+      FailureOr<int64_t> upperBound = getConstantUpperBoundForIndex(size);
       size = failed(upperBound)
-                 ? materializedSize
+                 ? size
                  : b.create<arith::ConstantIndexOp>(loc, *upperBound);
     }
     LLVM_DEBUG(llvm::dbgs() << "Extracted tightest: " << size << "\n");
@@ -244,18 +260,29 @@
     partialSizes.push_back(
         b.createOrFold<memref::DimOp>(loc, subView, resultDimIdx++));
   }
-  SmallVector<int64_t, 4> dynSizes(fullSizes.size(), ShapedType::kDynamic);
   // If a callback is not specified, then use the default implementation for
   // allocating the promoted buffer.
-  std::optional<Value> fullLocalView =
-      allocationFn(b, subView, fullSizes, layout);
+  std::optional<Value> fullLocalView;
+  Value referenceOp = subView;
+  if (permutationMap.isIdentity()) {
+    fullLocalView = allocationFn(b, subView, fullSizes, layout);
+  } else {
+    auto transposeOp = b.create<memref::TransposeOp>(
+        loc, subView, AffineMapAttr::get(permutationMap));
+    referenceOp = transposeOp;
+    fullLocalView = allocationFn(
+        b, transposeOp,
+        applyPermutationMap(permutationMap, ArrayRef(fullSizes)), layout);
+  }
+
   if (!fullLocalView)
     return failure();
   SmallVector<OpFoldResult, 4> zeros(fullSizes.size(), b.getIndexAttr(0));
   SmallVector<OpFoldResult, 4> ones(fullSizes.size(), b.getIndexAttr(1));
   auto partialLocalView = b.createOrFold<memref::SubViewOp>(
-      loc, *fullLocalView, zeros, partialSizes, ones);
-  return PromotionInfo{*fullLocalView, partialLocalView};
+      loc, *fullLocalView, zeros,
+      applyPermutationMap(permutationMap, ArrayRef(partialSizes)), ones);
+  return PromotionInfo{*fullLocalView, partialLocalView, referenceOp};
 }
 
 static FailureOr<MapVector<int64_t, PromotionInfo>>
@@ -269,8 +296,9 @@
   for (auto v : options.subViews) {
     memref::SubViewOp subView =
         cast<memref::SubViewOp>(v.second.getDefiningOp());
-    auto promotionInfo = promoteSubviewAsNewBuffer(
-        b, b.getLoc(), subView, options.allocationFn, layout);
+    auto promotionInfo =
+        promoteSubviewAsNewBuffer(b, b.getLoc(), subView, options.allocationFn,
+                                  layout, options.copyPermutation);
     if (failed(promotionInfo))
       return failure();
     promotionInfoMap[v.first] = *promotionInfo;
@@ -306,9 +334,8 @@
     auto info = promotionInfoMap.find(v.first);
     if (info == promotionInfoMap.end())
       continue;
-    if (failed(options.copyInFn(
-            b, cast<memref::SubViewOp>(v.second.getDefiningOp()),
-            info->second.partialLocalView)))
+    if (failed(options.copyInFn(b, info->second.referenceOp,
+                                info->second.partialLocalView)))
       return failure();
   }
   return promotionInfoMap;
@@ -332,6 +359,7 @@
   opViews.reserve(op->getNumOperands());
   SmallVector<std::pair<Value, Value>, 8> writebackViews;
   writebackViews.reserve(promotedBuffersAndViews->size());
+  SmallVector<AffineMap> indexingMaps = op.getIndexingMapsArray();
   for (OpOperand &opOperand : op->getOpOperands()) {
     int64_t operandNumber = opOperand.getOperandNumber();
     if (options.subViews.count(operandNumber) != 0) {
@@ -343,13 +371,41 @@
             (*promotedBuffersAndViews)[operandNumber].partialLocalView);
       if (operandNumber >= op.getNumDpsInputs())
         writebackViews.emplace_back(std::make_pair(
-            opOperand.get(),
+            (*promotedBuffersAndViews)[operandNumber].referenceOp,
             (*promotedBuffersAndViews)[operandNumber].partialLocalView));
+
+      // 2.1 Get the identity affine map.
+      AffineMap permutationMap = AffineMap::getMultiDimIdentityMap(
+          opOperand.get().getType().cast<MemRefType>().getRank(),
+          b.getContext());
+
+      // 2.2 Apply the given copy permutation to the original indexing map.
+      if (options.copyPermutation.size() > 0) {
+        permutationMap = AffineMap::getPermutationMap(
+            llvm::to_vector_of<unsigned>(options.copyPermutation),
+            b.getContext());
+      }
+      AffineMap transposedMap =
+          permutationMap.compose(op.getMatchingIndexingMap(&opOperand));
+      indexingMaps[op.getIndexingMapIndex(&opOperand)] = transposedMap;
     } else {
       opViews.push_back(opOperand.get());
     }
   }
   op->setOperands(0, opViews.size(), opViews);
+  linalg::GenericOp transposedGenericOp;
+  // 2.3 If the copy permutation is given replace the current Linalg op with a
+  // linalg.generic.
+  if (options.copyPermutation.size() > 0) {
+    ValueRange operandsRef(op->getOperands());
+    transposedGenericOp = b.create<linalg::GenericOp>(
+        /*location=*/op->getLoc(),
+        /*inputs=*/operandsRef.take_front(op.getNumDpsInputs()),
+        /*outputs=*/operandsRef.drop_front(op.getNumDpsInputs()),
+        /*indexingMaps=*/indexingMaps,
+        /*iteratorTypes=*/op.getIteratorTypesArray());
+    transposedGenericOp.getRegion().takeBody(op->getRegion(0));
+  }
 
   OpBuilder::InsertionGuard guard(b);
   b.setInsertionPointAfter(op);
@@ -363,6 +419,14 @@
   // 4. Dealloc all local buffers.
   for (const auto &pi : *promotedBuffersAndViews)
     (void)options.deallocationFn(b, pi.second.fullLocalView);
+
+  // 5. If the copy permutation is given replace the current Linalg op with
+  // created linalg.generic.
+  if (options.copyPermutation.size() > 0) {
+    IRRewriter rewriter(b);
+    rewriter.replaceOp(op, transposedGenericOp->getResults());
+    return cast<LinalgOp>(*transposedGenericOp);
+  }
   return op;
 }
 
@@ -403,12 +467,23 @@
 /// Allocate the given subview to a memory address space in GPU by creating a
 /// allocation operation and setting the memref type address space to desired
 /// address space.
-static Optional<Value> allocateSubviewGPUMemoryInAddressSpace(
-    OpBuilder &builder, memref::SubViewOp subview, ArrayRef<Value> sizeBounds,
-    gpu::AddressSpace addressSpace) {
+static Optional<Value>
+allocateSubviewGPUMemoryInAddressSpace(OpBuilder &builder, Operation *op,
+                                       ArrayRef<Value> sizeBounds,
+                                       gpu::AddressSpace addressSpace) {
   OpBuilder::InsertionGuard guard(builder);
+  // Only the result type of a memref.subview or memref.transpose is needed.
+  MemRefType subview;
+  if (auto subViewOp = dyn_cast<memref::SubViewOp>(op))
+    subview = subViewOp.getType();
+  else if (auto transposeOp = dyn_cast<memref::TransposeOp>(op))
+    subview = transposeOp.getType();
+  else
+    return std::nullopt;
+
+  // The parent function does not depend on which of the two ops was matched.
+  func::FuncOp funcOp = op->getParentOfType<func::FuncOp>();
 
-  func::FuncOp funcOp = subview->getParentOfType<func::FuncOp>();
   if (!funcOp)
     return std::nullopt;
 
@@ -424,7 +499,7 @@
 
   builder.setInsertionPoint(&funcOp.front(), funcOp.front().begin());
   auto type = MemRefType::get(
-      shape, subview.getType().getElementType(), MemRefLayoutAttrInterface{},
+      shape, subview.getElementType(), MemRefLayoutAttrInterface{},
       gpu::AddressSpaceAttr::get(builder.getContext(), addressSpace));
   Value buffer;
   if (addressSpace == gpu::GPUDialect::getWorkgroupAddressSpace()) {
@@ -438,9 +513,10 @@
 }
 
 /// Allocate the subview in the GPU workgroup memory.
-Optional<Value> mlir::linalg::allocateWorkgroupMemory(
-    OpBuilder &builder, memref::SubViewOp subview, ArrayRef<Value> sizeBounds,
-    DataLayout &) {
+Optional<Value>
+mlir::linalg::allocateWorkgroupMemory(OpBuilder &builder, Operation *subview,
+                                      ArrayRef<Value> sizeBounds,
+                                      DataLayout &) {
   return allocateSubviewGPUMemoryInAddressSpace(
       builder, subview, sizeBounds,
       gpu::GPUDialect::getWorkgroupAddressSpace());
@@ -463,9 +539,10 @@
 }
 
 /// Allocate the subview in the GPU private memory.
-Optional<Value> mlir::linalg::allocateGPUPrivateMemory(
-    OpBuilder &builder, memref::SubViewOp subview, ArrayRef<Value> sizeBounds,
-    DataLayout &) {
+Optional<Value>
+mlir::linalg::allocateGPUPrivateMemory(OpBuilder &builder, Operation *subview,
+                                       ArrayRef<Value> sizeBounds,
+                                       DataLayout &) {
   return allocateSubviewGPUMemoryInAddressSpace(
       builder, subview, sizeBounds, gpu::GPUDialect::getPrivateAddressSpace());
 }
Index: mlir/test/Dialect/Linalg/promote.mlir
===================================================================
--- mlir/test/Dialect/Linalg/promote.mlir
+++ mlir/test/Dialect/Linalg/promote.mlir
@@ -275,3 +275,52 @@
   %0 = transform.structured.match interface{LinalgOp} in %arg1 : (!pdl.operation) -> !pdl.operation
   %1 = transform.structured.promote %0
 }
+
+// -----
+func.func @gemm_transposed(%a : memref<?x?xf32>, %b : memref<?x?xf32>, %c : memref<?x?xf32>)
+{
+   linalg.matmul ins(%a, %b: memref<?x?xf32>, memref<?x?xf32>)
+               outs(%c: memref<?x?xf32>)
+   return
+}
+
+// CHECK-LABEL: func @gemm_transposed
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: memref<?x?xf32>
+// CHECK: %[[alloc_A:.*]] = memref.alloc() : memref<16x4xf32, #gpu.address_space<workgroup>>
+// CHECK: %[[alloc_B:.*]] = memref.alloc() : memref<8x4xf32, #gpu.address_space<workgroup>>
+// CHECK-DAG: %[[C16:.*]] = arith.constant 16
+// CHECK-DAG: %[[C0:.*]] = arith.constant 0
+// CHECK-DAG: %[[C1:.*]] = arith.constant 1
+// CHECK:   scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:     scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:       scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} {
+// CHECK:         %[[subview_A:.*]] = memref.subview {{.*}}
+// CHECK:         %[[subview_B:.*]] = memref.subview {{.*}}
+// CHECK:         %[[subview_C:.*]] = memref.subview {{.*}}
+
+// CHECK:         %[[transposed_B:.*]] = memref.transpose %[[subview_B]] (d0, d1) -> (d1, d0) : memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[1, ?], offset: ?>>
+// CHECK:         %[[shared_B:.*]] = memref.subview %[[alloc_B]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<8x4xf32, #gpu.address_space<workgroup>> to memref<?x?xf32, strided<[4, 1]>, #gpu.address_space<workgroup>>
+
+// CHECK-NEXT:    gpu.barrier
+// CHECK-NEXT:    memref.copy %[[transposed_B]], %[[shared_B]] :  memref<?x?xf32, strided<[1, ?], offset: ?>> to memref<?x?xf32, strided<[4, 1]>, #gpu.address_space<workgroup>>
+// CHECK-NEXT:    gpu.barrier
+
+// CHECK:         %[[shared_A:.*]] = memref.subview %[[alloc_A]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref<16x4xf32, #gpu.address_space<workgroup>> to memref<?x?xf32, strided<[4, 1]>, #gpu.address_space<workgroup>>
+
+
+// CHECK-NEXT:    gpu.barrier
+// CHECK-NEXT:    memref.copy %[[subview_A]], %[[shared_A]] :  memref<?x?xf32, strided<[?, 1], offset: ?>> to memref<?x?xf32, strided<[4, 1]>, #gpu.address_space<workgroup>>
+// CHECK-NEXT:    gpu.barrier
+
+// CHECK:         linalg.generic {{.+}} ins(%[[shared_A]], %[[shared_B]]{{.*}} outs(%[[subview_C]]
+
+
+transform.sequence failures(propagate) {
+^bb0(%arg1: !pdl.operation):
+  %0 = transform.structured.match ops{["linalg.matmul"]} in %arg1 : (!pdl.operation) -> !pdl.operation
+  %1, %loops:3 = transform.structured.tile %0 [16, 8, 4] : (!pdl.operation) -> (!pdl.operation, !pdl.operation, !pdl.operation, !pdl.operation)
+  %2 = transform.structured.promote %1 { operands_to_promote = [1], mapping = [#gpu.memory_space<workgroup>], copy_permutation = array<i64: 1, 0> }
+  %3 = transform.structured.promote %2 { operands_to_promote = [0], mapping = [#gpu.memory_space<workgroup>]}
+}