diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgOps.td @@ -18,7 +18,6 @@ include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/LoopLikeInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" -include "mlir/Interfaces/TilingInterface.td" include "mlir/Interfaces/ViewLikeInterface.td" // Base class for Linalg dialect ops that do not correspond to library calls. @@ -130,207 +129,6 @@ let hasCanonicalizer = 1; } -def Linalg_PadTensorOp : Linalg_Op<"pad_tensor", - [AttrSizedOperandSegments, NoSideEffect, - DeclareOpInterfaceMethods, - DeclareOpInterfaceMethods]> { - let summary = "tensor pad operation"; - let description = [{ - `linalg.pad_tensor` is an operation that pads the `source` tensor - with given `low` and `high` padding config. - - The PadTensor operation supports the following arguments: - - * source: the "base" tensor on which to pad. - * low: A list contains the padding along the start of each - dimension, i.e `low`. - * high: A list contains the padding along the end of each - dimension, i.e. `high`. - * nofold: indicates that the operation should not be folded when source and - result types are equal. - - The result tensor dimensions are `low` + `dim` + `high` along that - dimension. The number of elements of `low` and `high` must match - the rank of the input tensor. They can be either a constant or a - dynamic value. - - The region of the `pad_tensor` operation returns the value to use - for the padding. The arguments of the region represent the index - of the source being accessed. There should be as many arguments as - the rank of the `source` tensor. The value `yield`-ed by the - region is used as the value of the view at the given position. - - If `nofold` is set, the padding operation will not be folded away even - if the source type and the padded type have the same static shape. This can - be used, e.g., for packing or promotion to faster memory. - - Example 1: - - ```mlir - %pad_value = ... : f32 - %0 = linalg.pad_tensor %0 low[1, 2] high[2, 3] { - ^bb0(%arg0 : index, %arg1 : index): - linalg.yield %pad_value : f32 - } : tensor to tensor - ``` - - Example 2: - - ```mlir - %pad_value = ... : f32 - %0 = linalg.pad_tensor %arg0 low[2, %arg1, 3, 3] high[3, 3, %arg1, 2] { - ^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index): - linalg.yield %pad_value : f32 - } : tensor<1x2x2x?xf32> to tensor<6x?x?x?xf32> - ``` - - Example 3: - - ```mlir - %pad_value = ... : f32 - %0 = linalg.pad_tensor %arg0 low[0, 0] high[%ub0, %ub1] { - ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad_value : f32 - } : tensor<2x3xf32> to tensor - ``` - - Example 4: - - ```mlir - // Force a padded value to be always exist with `nofold`. - %pad_value = ... : f32 - %0 = linalg.pad_tensor %arg0 nofold low[0, 0] high[0, 0] { - ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad_value : f32 - } : tensor<2x3xf32> to tensor<2x3xf32> - ``` - }]; - - let arguments = (ins - AnyTensor:$source, - Variadic:$low, - Variadic:$high, - I64ArrayAttr:$static_low, - I64ArrayAttr:$static_high, - UnitAttr:$nofold); - - let regions = (region SizedRegion<1>:$region); - - let results = (outs AnyTensor:$result); - - // TODO: Remove custom when AllTypesMatch supports opt. operands. - let assemblyFormat = [{ - $source - (`nofold` $nofold^)? 
- `low` `` custom($low, $static_low) - `high` `` custom($high, $static_high) - $region attr-dict `:` type($source) `to` type($result) - }]; - - let extraClassDeclaration = [{ - static StringRef getStaticLowAttrName() { - return "static_low"; - } - - static StringRef getStaticHighAttrName() { - return "static_high"; - } - - RankedTensorType getSourceType() { - return source().getType().cast(); - } - RankedTensorType getResultType() { - return getResult().getType().cast(); - } - - // Infer the shape of the result tensor given the type of the source tensor - // and paddings. Known result dimensions that cannot necessarily be inferred - // from low/high padding sizes can be optionally specified. Those will be - // considered when computing the result type. - static RankedTensorType inferResultType( - RankedTensorType sourceType, - ArrayRef staticLow, - ArrayRef staticHigh, - ArrayRef resultShape = {}); - - // Return a PadTensorOp that pads `source` to `type` size where the static - // sizes are assumed to be greater than the dynamic sizes. The op performs - // "high" padding (i.e. it adds trailing padding values until the desired - // size is met). - static linalg::PadTensorOp createPadHighOp( - Type type, Value source, Value pad, bool nofold, Location loc, - OpBuilder & builder); - - // Return a PadTensorOp that pads `source to `type` size with `pad` value. - // I.e., a block will be created and the `pad` value will be yielded - // directly. If the type passed is nullptr, it is inferred. - static linalg::PadTensorOp createPadScalarOp( - Type type, Value source, Value pad, ArrayRef low, - ArrayRef high, bool nofold, Location loc, - OpBuilder & builder); - - // Return the pad value if it is a constant. Return null value otherwise. - Value getConstantPaddingValue(); - - // Return a vector of all the static or dynamic values (low/high padding) of - // the op. - inline SmallVector getMixedPadImpl(ArrayAttr staticAttrs, - ValueRange values) { - SmallVector res; - unsigned numDynamic = 0; - unsigned count = staticAttrs.size(); - for (unsigned idx = 0; idx < count; ++idx) { - if (ShapedType::isDynamic(staticAttrs[idx].cast().getInt())) - res.push_back(values[numDynamic++]); - else - res.push_back(staticAttrs[idx]); - } - return res; - } - SmallVector getMixedLowPad() { - return getMixedPadImpl(static_low(), low()); - } - SmallVector getMixedHighPad() { - return getMixedPadImpl(static_high(), high()); - } - // Return true if low padding is guaranteed to be 0. - bool hasZeroLowPad() { - return llvm::all_of(getMixedLowPad(), [](OpFoldResult ofr) { - return getConstantIntValue(ofr) == static_cast(0); - }); - } - // Return true if high padding is guaranteed to be 0. - bool hasZeroHighPad() { - return llvm::all_of(getMixedHighPad(), [](OpFoldResult ofr) { - return getConstantIntValue(ofr) == static_cast(0); - }); - } - }]; - - let builders = [ - // Build a PadTensorOp with mixed static and dynamic entries. - OpBuilder<(ins "Value":$source, "ArrayRef":$staticLow, - "ArrayRef":$staticHigh, "ValueRange":$low, "ValueRange":$high, - CArg<"bool", "false">:$nofold, - CArg<"ArrayRef", "{}">:$attrs)>, - // Build a PadTensorOp with all dynamic entries. - OpBuilder<(ins "Value":$source, "ValueRange":$low, "ValueRange":$high, - CArg<"bool", "false">:$nofold, - CArg<"ArrayRef", "{}">:$attrs)>, - // Build a PadTensorOp with mixed static and dynamic entries and custom - // result type. If the type passed is nullptr, it is inferred. 
- OpBuilder<(ins "Type":$resultType, "Value":$source, - "ArrayRef":$low, "ArrayRef":$high, - CArg<"bool", "false">:$nofold, - CArg<"ArrayRef", "{}">:$attrs)>, - ]; - - let hasCanonicalizer = 1; - let hasFolder = 1; -} - def Linalg_YieldOp : Linalg_Op<"yield", [NoSideEffect, ReturnLike, Terminator]>, Arguments<(ins Variadic:$values)> { let summary = "Linalg yield operation"; diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h b/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/HoistPadding.h @@ -1,4 +1,5 @@ -//===- HoistPadding.h - Hoisting transformation for PadTensorOp -*- C++ -*-===// +//===- HoistPadding.h - Hoisting transformation for tensor::PadOp -*- C++ +//-*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -14,8 +15,11 @@ namespace mlir { class Value; +namespace tensor { +class PadOp; +} // namespace tensor + namespace linalg { -class PadTensorOp; /// Mechanically hoist padding operations on tensors by `numLoops` into a new, /// generally larger tensor. This achieves packing of multiple padding ops into @@ -59,8 +63,8 @@ /// } /// } /// ``` -FailureOr hoistPaddingOnTensors(PadTensorOp opToHoist, int numLoops, - PadTensorOp &hoistedOp); +FailureOr hoistPaddingOnTensors(tensor::PadOp opToHoist, int numLoops, + tensor::PadOp &hoistedOp); } // namespace linalg } // namespace mlir diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -1132,18 +1132,18 @@ // Op-specific patterns. //===----------------------------------------------------------------------===// -/// PadTensorOp is not canonicalized away yet, so we provide a transformation to -/// `linalg.generic`. -struct PadTensorOpTransformationPattern : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +/// tensor::PadOp is not canonicalized away yet, so we provide a transformation +/// to `linalg.generic`. +struct PadOpTransformationPattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(PadTensorOp padOp, + LogicalResult matchAndRewrite(tensor::PadOp padOp, PatternRewriter &rewriter) const override; }; /// Pad the operands of `opToPad` to a static bounding box. Use `paddingFunc` /// and `nofoldFunc` to set the padding value and the nofold attribute of the -/// introduced PadTensorOps, respectively. Update `paddedOp` to the cloned +/// introduced tensor::PadOps, respectively. Update `paddedOp` to the cloned /// statically shaped operation and return the extracted dynamically shaped /// results. If padding fails, return failure. FailureOr> @@ -1153,23 +1153,23 @@ LinalgOp &paddedOp); using OptimizeCopyFn = - std::function; + std::function; -/// Rewrite a PadTensorOp into a sequence of InitTensorOp, FillOp and +/// Rewrite a tensor::PadOp into a sequence of InitTensorOp, FillOp and /// InsertSliceOp. For now, only constant padding values are supported. /// `OptimizeCopyFn` can be used to customize copying step optimization. 
-struct GeneralizePadTensorOpPattern : public OpRewritePattern<PadTensorOp> {
-  GeneralizePadTensorOpPattern(MLIRContext *context,
-                               OptimizeCopyFn optimizeCopyFn = nullptr,
-                               PatternBenefit benefit = 1)
-      : OpRewritePattern<PadTensorOp>(context, benefit),
+struct GeneralizePadOpPattern : public OpRewritePattern<tensor::PadOp> {
+  GeneralizePadOpPattern(MLIRContext *context,
+                         OptimizeCopyFn optimizeCopyFn = nullptr,
+                         PatternBenefit benefit = 1)
+      : OpRewritePattern<tensor::PadOp>(context, benefit),
         optimizeCopyFn(std::move(optimizeCopyFn)) {}
-  LogicalResult matchAndRewrite(PadTensorOp padOp,
+  LogicalResult matchAndRewrite(tensor::PadOp padOp,
                                 PatternRewriter &rewriter) const override;
 protected:
   OptimizeCopyFn optimizeCopyFn;
-  Value createFillOrGenerateOp(PatternRewriter &rewriter, PadTensorOp padOp,
+  Value createFillOrGenerateOp(PatternRewriter &rewriter, tensor::PadOp padOp,
                                Value dest,
                                const SmallVector<Value> &dynSizes) const;
 };
@@ -1179,9 +1179,9 @@
 /// are used to encode a certain ordering of pattern application. To avoid
 /// scattering magic constants throughout the code base, the patterns must be
 /// added with this function. `baseBenefit` can be used to offset the benefit
-/// of all PadTensorOp vectorization patterns by a certain value.
-void populatePadTensorOpVectorizationPatterns(RewritePatternSet &patterns,
-                                              PatternBenefit baseBenefit = 1);
+/// of all tensor::PadOp vectorization patterns by a certain value.
+void populatePadOpVectorizationPatterns(RewritePatternSet &patterns,
+                                        PatternBenefit baseBenefit = 1);
 /// Match and rewrite for the pattern:
 /// ```
diff --git a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
--- a/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h
@@ -107,12 +107,12 @@
     OpBuilder &b, Location loc, Value source, ArrayRef<OpFoldResult> offsets,
     ArrayRef<OpFoldResult> sizes, ArrayRef<OpFoldResult> strides);
-/// Create a PadTensorOp that pads `source` to the size of the statically sized
-/// `type` whose static sizes are assumed to be greater than the dynamic
+/// Create a tensor::PadOp that pads `source` to the size of the statically
+/// sized `type` whose static sizes are assumed to be greater than the dynamic
 /// `source` size. The padding introduces trailing `pad` values until the target
 /// size is met. If `source` is defined by one or more LinalgOps that have been
 /// padded with the same value and sizes, return their padded result instead of
-/// creating a PadTensorOp.
+/// creating a tensor::PadOp.
///
/// Example:
/// ```
diff --git a/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h b/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h
--- a/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h
+++ b/mlir/include/mlir/Dialect/Tensor/IR/Tensor.h
@@ -19,6 +19,7 @@
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Interfaces/TilingInterface.h"
 #include "mlir/Interfaces/ViewLikeInterface.h"
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -14,6 +14,7 @@
 include "mlir/Interfaces/ControlFlowInterfaces.td"
 include "mlir/Interfaces/InferTypeOpInterface.td"
 include "mlir/Interfaces/SideEffectInterfaces.td"
+include "mlir/Interfaces/TilingInterface.td"
 include "mlir/Interfaces/ViewLikeInterface.td"
 class Tensor_Op<string mnemonic, list<OpTrait> traits = []>
@@ -777,6 +778,190 @@
   let extraClassDeclaration = commonExtraClassDeclaration;
 }
+//===----------------------------------------------------------------------===//
+// PadOp
+//===----------------------------------------------------------------------===//
+
+def Tensor_PadOp : Tensor_Op<"pad", [AttrSizedOperandSegments, NoSideEffect]> {
+  let summary = "tensor pad operation";
+  let description = [{
+    `tensor.pad` is an operation that pads the `source` tensor
+    with given `low` and `high` padding config.
+
+    The `tensor.pad` operation supports the following arguments:
+
+    * source: the "base" tensor on which to pad.
+    * low: A list containing the padding along the start of each
+           dimension, i.e. `low`.
+    * high: A list containing the padding along the end of each
+            dimension, i.e. `high`.
+    * nofold: indicates that the operation should not be folded when source and
+      result types are equal.
+
+    The result tensor dimensions are `low` + `dim` + `high` along each
+    dimension. The number of elements of `low` and `high` must match
+    the rank of the input tensor. They can be either a constant or a
+    dynamic value.
+
+    The region of the `tensor.pad` operation returns the value to use
+    for the padding. The arguments of the region represent the index
+    of the source being accessed. There should be as many arguments as
+    the rank of the `source` tensor. The value `yield`-ed by the
+    region is used as the value of the view at the given position.
+
+    If `nofold` is set, the padding operation will not be folded away even
+    if the source type and the padded type have the same static shape. This can
+    be used, e.g., for packing or promotion to faster memory.
+
+    Example 1:
+
+    ```mlir
+      %pad_value = ... : f32
+      %1 = tensor.pad %0 low[1, 2] high[2, 3] {
+      ^bb0(%arg0 : index, %arg1 : index):
+        tensor.yield %pad_value : f32
+      } : tensor<?x?xf32> to tensor<?x?xf32>
+    ```
+
+    Example 2:
+
+    ```mlir
+      %pad_value = ... : f32
+      %0 = tensor.pad %arg0 low[2, %arg1, 3, 3] high[3, 3, %arg1, 2] {
+      ^bb0(%arg2: index, %arg3: index, %arg4: index, %arg5: index):
+        tensor.yield %pad_value : f32
+      } : tensor<1x2x2x?xf32> to tensor<6x?x?x?xf32>
+    ```
+
+    Example 3:
+
+    ```mlir
+      %pad_value = ... : f32
+      %0 = tensor.pad %arg0 low[0, 0] high[%ub0, %ub1] {
+      ^bb0(%arg1: index, %arg2: index):
+        tensor.yield %pad_value : f32
+      } : tensor<2x3xf32> to tensor<?x?xf32>
+    ```
+
+    Example 4:
+
+    ```mlir
+      // Force the padded value to always exist with `nofold`.
+      %pad_value = ...
: f32 + %0 = tensor.pad %arg0 nofold low[0, 0] high[0, 0] { + ^bb0(%arg1: index, %arg2: index): + tensor.yield %pad_value : f32 + } : tensor<2x3xf32> to tensor<2x3xf32> + ``` + }]; + + let arguments = (ins + AnyTensor:$source, + Variadic:$low, + Variadic:$high, + I64ArrayAttr:$static_low, + I64ArrayAttr:$static_high, + UnitAttr:$nofold); + + let regions = (region SizedRegion<1>:$region); + + let results = (outs AnyTensor:$result); + + // TODO: Remove custom when AllTypesMatch supports opt. operands. + let assemblyFormat = [{ + $source + (`nofold` $nofold^)? + `low` `` custom($low, $static_low) + `high` `` custom($high, $static_high) + $region attr-dict `:` type($source) `to` type($result) + }]; + + let extraClassDeclaration = [{ + static StringRef getStaticLowAttrName() { + return "static_low"; + } + + static StringRef getStaticHighAttrName() { + return "static_high"; + } + + RankedTensorType getSourceType() { + return source().getType().cast(); + } + RankedTensorType getResultType() { + return getResult().getType().cast(); + } + + // Infer the shape of the result tensor given the type of the source tensor + // and paddings. Known result dimensions that cannot necessarily be inferred + // from low/high padding sizes can be optionally specified. Those will be + // considered when computing the result type. + static RankedTensorType inferResultType( + RankedTensorType sourceType, + ArrayRef staticLow, + ArrayRef staticHigh, + ArrayRef resultShape = {}); + + // Return the pad value if it is a constant. Return null value otherwise. + Value getConstantPaddingValue(); + + // Return a vector of all the static or dynamic values (low/high padding) of + // the op. + inline SmallVector getMixedPadImpl(ArrayAttr staticAttrs, + ValueRange values) { + SmallVector res; + unsigned numDynamic = 0; + unsigned count = staticAttrs.size(); + for (unsigned idx = 0; idx < count; ++idx) { + if (ShapedType::isDynamic(staticAttrs[idx].cast().getInt())) + res.push_back(values[numDynamic++]); + else + res.push_back(staticAttrs[idx]); + } + return res; + } + SmallVector getMixedLowPad() { + return getMixedPadImpl(static_low(), low()); + } + SmallVector getMixedHighPad() { + return getMixedPadImpl(static_high(), high()); + } + // Return true if low padding is guaranteed to be 0. + bool hasZeroLowPad() { + return llvm::all_of(getMixedLowPad(), [](OpFoldResult ofr) { + return getConstantIntValue(ofr) == static_cast(0); + }); + } + // Return true if high padding is guaranteed to be 0. + bool hasZeroHighPad() { + return llvm::all_of(getMixedHighPad(), [](OpFoldResult ofr) { + return getConstantIntValue(ofr) == static_cast(0); + }); + } + }]; + + let builders = [ + // Build a PadOp with mixed static and dynamic entries. + OpBuilder<(ins "Value":$source, "ArrayRef":$staticLow, + "ArrayRef":$staticHigh, "ValueRange":$low, "ValueRange":$high, + CArg<"bool", "false">:$nofold, + CArg<"ArrayRef", "{}">:$attrs)>, + // Build a PadOp with all dynamic entries. + OpBuilder<(ins "Value":$source, "ValueRange":$low, "ValueRange":$high, + CArg<"bool", "false">:$nofold, + CArg<"ArrayRef", "{}">:$attrs)>, + // Build a PadOp with mixed static and dynamic entries and custom + // result type. If the type passed is nullptr, it is inferred. 
+ OpBuilder<(ins "Type":$resultType, "Value":$source, + "ArrayRef":$low, "ArrayRef":$high, + CArg<"bool", "false">:$nofold, + CArg<"ArrayRef", "{}">:$attrs)>, + ]; + + let hasCanonicalizer = 1; + let hasFolder = 1; +} + //===----------------------------------------------------------------------===// // YieldOp @@ -784,16 +969,17 @@ def Tensor_YieldOp : Tensor_Op<"yield", [NoSideEffect, ReturnLike, Terminator, - HasParent<"::mlir::tensor::GenerateOp">]> { + HasParent<"::mlir::tensor::GenerateOp, ::mlir::tensor::PadOp">]> { let summary = "Yield a value from a region"; let description = [{ This operation is used to yield a single value from a within a region. It is used to create dynamically sized tensors - (see `tensor.generate` op). + (see `tensor.generate` and `tensor.pad` ops). }]; let arguments = (ins AnyType:$value); let assemblyFormat = "$value attr-dict `:` type($value)"; + // Dummy builder to appease code in templated ensureTerminator that // GenerateOp's auto-generated parser calls. let builders = [OpBuilder<(ins), [{ /* nothing to do */ }]>]; diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h b/mlir/include/mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h @@ -0,0 +1,36 @@ +//===- TensorTilingOpInterfaceImpl.h - ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements Tiling interface for TensorOps with ExternalModel. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_TENSOR_IR_TENSORTILINGINTERFACEIMPL_H_ +#define MLIR_DIALECT_TENSOR_IR_TENSORTILINGINTERFACEIMPL_H_ + +#include "mlir/IR/Dialect.h" + +namespace mlir { +namespace tensor { + +/// Registers external models for Tiling interface for tensor ops. +/// Currently, it registers: +/// +/// * TilingInterface for `tensor.pad`. +/// +/// Unfortunately, a "normal" internal registration is not possible at the +/// moment, because of the dependency of the interface implementation for these +/// ops on `affine.apply` and Affine dialect already depends on TensorOps. In +/// order to break the cyclic dependency (TensorOps->AffineOps->TensorOps) the +/// implementation is moved to a separate library. +void registerTilingOpInterfaceExternalModels(mlir::DialectRegistry ®istry); + +} // namespace tensor +} // namespace mlir + +#endif // MLIR_DIALECT_TENSOR_IR_TENSORTILINGINTERFACEIMPL_H_ diff --git a/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Dialect/Tensor/Utils/Utils.h @@ -0,0 +1,34 @@ +//===- Utils.h - Utilities to support the Tensor dialect -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_TENSOR_UTILS_UTILS_H_ +#define MLIR_DIALECT_TENSOR_UTILS_UTILS_H_ + +#include "mlir/Dialect/Tensor/IR/Tensor.h" + +namespace mlir { +namespace tensor { + +// Return a PadOp that pads `source` to `type` size where the static +// sizes are assumed to be greater than the dynamic sizes. The op performs +// "high" padding (i.e. it adds trailing padding values until the desired +// size is met). +PadOp createPadHighOp(Type type, Value source, Value pad, bool nofold, + Location loc, OpBuilder &builder); + +// Return a PadOp that pads `source to `type` size with `pad` value. +// I.e., a block will be created and the `pad` value will be yielded +// directly. If the type passed is nullptr, it is inferred. +PadOp createPadScalarOp(Type type, Value source, Value pad, + ArrayRef low, ArrayRef high, + bool nofold, Location loc, OpBuilder &builder); + +} // namespace tensor +} // namespace mlir + +#endif // MLIR_DIALECT_TENSOR_UTILS_UTILS_H_ diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h --- a/mlir/include/mlir/InitAllDialects.h +++ b/mlir/include/mlir/InitAllDialects.h @@ -43,6 +43,7 @@ #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.h" +#include "mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/Dialect/Vector/VectorOps.h" #include "mlir/Dialect/X86Vector/X86VectorDialect.h" @@ -86,6 +87,7 @@ x86vector::X86VectorDialect>(); // clang-format on tensor::registerInferTypeOpInterfaceExternalModels(registry); + tensor::registerTilingOpInterfaceExternalModels(registry); } /// Append all the MLIR dialects to the registry contained in the given context. 
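Editorial note, not part of the patch: since `tensor.pad` only gains TilingInterface through the external model registered above, a downstream tool that does not go through `registerAllDialects()` must attach the model itself. Below is a minimal, hypothetical C++ sketch of such a setup; the include paths and `TensorDialect`/`MLIRContext` usage assume the headers and library introduced by this change are linked in.

```c++
// Illustrative sketch only -- not part of this patch.
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/MLIRContext.h"

static void setupRegistry(mlir::DialectRegistry &registry) {
  registry.insert<mlir::tensor::TensorDialect>();
  // Attach the external TilingInterface model for tensor.pad. Without this
  // call, dyn_cast<TilingInterface>(padOp.getOperation()) returns null.
  mlir::tensor::registerTilingOpInterfaceExternalModels(registry);
}

int main() {
  mlir::DialectRegistry registry;
  setupRegistry(registry);
  mlir::MLIRContext context(registry); // Dialects still load lazily on use.
  return 0;
}
```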
diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp @@ -17,6 +17,7 @@ #include "mlir/Dialect/SCF/SCF.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tensor/Utils/Utils.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/Dialect/Tosa/Utils/CoversionUtils.h" #include "mlir/Dialect/Utils/ReshapeOpsUtils.h" @@ -1929,7 +1930,7 @@ highValues.push_back(highVal); } - auto newPadOp = linalg::PadTensorOp::createPadScalarOp( + auto newPadOp = tensor::createPadScalarOp( padOp.getType(), input, padConstant, lowValues, highValues, /*nofold=*/false, loc, rewriter); diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp --- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp +++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalgNamed.cpp @@ -17,6 +17,7 @@ #include "mlir/Dialect/SCF/SCF.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tensor/Utils/Utils.h" #include "mlir/Dialect/Tosa/IR/TosaOps.h" #include "mlir/Dialect/Tosa/Utils/CoversionUtils.h" #include "mlir/Dialect/Utils/ReshapeOpsUtils.h" @@ -55,9 +56,9 @@ Value padValue = rewriter.create(loc, padAttr); - return linalg::PadTensorOp::createPadScalarOp( - RankedTensorType::get(paddedShape, inputETy), input, padValue, - lowIndices, highIndices, /*nofold=*/false, loc, rewriter) + return tensor::createPadScalarOp(RankedTensorType::get(paddedShape, inputETy), + input, padValue, lowIndices, highIndices, + /*nofold=*/false, loc, rewriter) .result(); } diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -1074,561 +1074,6 @@ return success(); } -//===----------------------------------------------------------------------===// -// PadTensorOp -//===----------------------------------------------------------------------===// - -// TODO: Replace custom directive with AllTypesMatch as soon as it -// supports optional types. -void printInferType(OpAsmPrinter &printer, Operation *op, Value optOperand, - Type typeToInfer, Type typeToInferFrom) {} - -ParseResult parseInferType(OpAsmParser &parser, - Optional optOperand, - Type &typeToInfer, Type typeToInferFrom) { - if (optOperand) - typeToInfer = typeToInferFrom; - return success(); -} - -static LogicalResult verify(PadTensorOp op) { - auto sourceType = op.source().getType().cast(); - auto resultType = op.result().getType().cast(); - auto expectedType = PadTensorOp::inferResultType( - sourceType, extractFromI64ArrayAttr(op.static_low()), - extractFromI64ArrayAttr(op.static_high())); - for (int i = 0, e = sourceType.getRank(); i < e; ++i) { - if (resultType.getDimSize(i) == expectedType.getDimSize(i)) - continue; - if (expectedType.isDynamicDim(i)) - continue; - return op.emitError("specified type ") - << resultType << " does not match the inferred type " - << expectedType; - } - - auto ®ion = op.region(); - unsigned rank = resultType.getRank(); - Block &block = region.front(); - if (block.getNumArguments() != rank) - return op.emitError("expected the block to have ") << rank << " arguments"; - - // Note: the number and type of yield values are checked in the YieldOp. 
- for (const auto &en : llvm::enumerate(block.getArgumentTypes())) { - if (!en.value().isIndex()) - return op.emitOpError("expected block argument ") - << (en.index() + 1) << " to be an index"; - } - - return success(); -} - -RankedTensorType PadTensorOp::inferResultType(RankedTensorType sourceType, - ArrayRef staticLow, - ArrayRef staticHigh, - ArrayRef resultShape) { - unsigned rank = sourceType.getRank(); - assert(staticLow.size() == rank && "unexpected staticLow size mismatch"); - assert(staticHigh.size() == rank && "unexpected staticHigh size mismatch"); - assert((resultShape.empty() || resultShape.size() == rank) && - "unexpected resultShape size mismatch"); - - SmallVector inferredShape; - for (auto i : llvm::seq(0, rank)) { - if (sourceType.isDynamicDim(i) || - staticLow[i] == ShapedType::kDynamicSize || - staticHigh[i] == ShapedType::kDynamicSize) { - inferredShape.push_back(resultShape.empty() ? ShapedType::kDynamicSize - : resultShape[i]); - } else { - int64_t size = sourceType.getDimSize(i) + staticLow[i] + staticHigh[i]; - assert((resultShape.empty() || size == resultShape[i] || - resultShape[i] == ShapedType::kDynamicSize) && - "mismatch between inferred shape and result shape"); - inferredShape.push_back(size); - } - } - - return RankedTensorType::get(inferredShape, sourceType.getElementType()); -} - -void PadTensorOp::build(OpBuilder &b, OperationState &result, Value source, - ArrayRef staticLow, - ArrayRef staticHigh, ValueRange low, - ValueRange high, bool nofold, - ArrayRef attrs) { - auto sourceType = source.getType().cast(); - auto resultType = inferResultType(sourceType, staticLow, staticHigh); - build(b, result, resultType, source, low, high, b.getI64ArrayAttr(staticLow), - b.getI64ArrayAttr(staticHigh), nofold ? b.getUnitAttr() : UnitAttr()); - result.addAttributes(attrs); -} - -void PadTensorOp::build(OpBuilder &b, OperationState &result, Value source, - ValueRange low, ValueRange high, bool nofold, - ArrayRef attrs) { - auto sourceType = source.getType().cast(); - unsigned rank = sourceType.getRank(); - SmallVector staticVector(rank, ShapedType::kDynamicSize); - build(b, result, source, staticVector, staticVector, low, high, nofold, - attrs); -} - -void PadTensorOp::build(OpBuilder &b, OperationState &result, Type resultType, - Value source, ArrayRef low, - ArrayRef high, bool nofold, - ArrayRef attrs) { - assert(resultType.isa()); - auto sourceType = source.getType().cast(); - SmallVector dynamicLow, dynamicHigh; - SmallVector staticLow, staticHigh; - // staticLow and staticHigh have full information of the padding config. - // This will grow staticLow and staticHigh with 1 value. If the config is - // dynamic (ie not a constant), dynamicLow and dynamicHigh will grow with 1 - // value as well. - dispatchIndexOpFoldResults(low, dynamicLow, staticLow, - ShapedType::kDynamicSize); - dispatchIndexOpFoldResults(high, dynamicHigh, staticHigh, - ShapedType::kDynamicSize); - if (!resultType) { - resultType = - PadTensorOp::inferResultType(sourceType, staticLow, staticHigh); - } - build(b, result, resultType, source, dynamicLow, dynamicHigh, - b.getI64ArrayAttr(staticLow), b.getI64ArrayAttr(staticHigh), - nofold ? 
b.getUnitAttr() : UnitAttr()); - result.addAttributes(attrs); -} - -PadTensorOp PadTensorOp::createPadScalarOp(Type type, Value source, Value pad, - ArrayRef low, - ArrayRef high, - bool nofold, Location loc, - OpBuilder &builder) { - auto padTensorOp = - builder.create(loc, type, source, low, high, nofold); - int rank = padTensorOp.getResultType().getRank(); - SmallVector blockArgTypes; - blockArgTypes.assign(rank, builder.getIndexType()); - auto ®ion = padTensorOp.region(); - // `builder.createBlock` changes the insertion point within the block. Create - // a guard to reset the insertion point of the builder after it is destroyed. - OpBuilder::InsertionGuard guard(builder); - builder.createBlock(®ion, region.end(), blockArgTypes); - builder.create(loc, pad); - return padTensorOp; -} - -PadTensorOp PadTensorOp::createPadHighOp(Type type, Value source, Value pad, - bool nofold, Location loc, - OpBuilder &b) { - SmallVector low, high; - auto rankedTensorType = type.cast(); - assert(rankedTensorType.hasStaticShape()); - for (const auto &en : enumerate(rankedTensorType.getShape())) { - AffineExpr d0; - bindDims(b.getContext(), d0); - auto dimOp = b.createOrFold(loc, source, en.index()); - Value paddingWidth = - makeComposedAffineApply(b, loc, en.value() - d0, {dimOp}); - high.push_back(paddingWidth); - low.push_back(b.createOrFold(loc, 0)); - } - return PadTensorOp::createPadScalarOp(type, source, pad, low, high, nofold, - loc, b); -} - -LogicalResult PadTensorOp::reifyResultShapes( - OpBuilder &b, ReifiedRankedShapedTypeDims &reifiedReturnShapes) { - Location loc = getLoc(); - auto lowPad = getMixedLowPad(); - auto highPad = getMixedHighPad(); - SmallVector shapes; - for (auto dim : llvm::seq(0, getSourceType().getRank())) { - // Shape along each dimension is source dim + low pad + high pad. - SmallVector mapOperands; - mapOperands.push_back(b.createOrFold(loc, source(), dim)); - AffineExpr expr = b.getAffineDimExpr(0); - unsigned numSymbols = 0; - auto addOpFoldResult = [&](OpFoldResult valueOrAttr) { - if (Value v = valueOrAttr.dyn_cast()) { - expr = expr + b.getAffineSymbolExpr(numSymbols++); - mapOperands.push_back(v); - return; - } - int64_t staticValue = - valueOrAttr.get().cast().getInt(); - expr = expr + staticValue; - }; - addOpFoldResult(lowPad[dim]); - addOpFoldResult(highPad[dim]); - shapes.push_back(applyMapToValues( - b, loc, AffineMap::get(1, numSymbols, expr), mapOperands)[0]); - } - reifiedReturnShapes.emplace_back(std::move(shapes)); - return success(); -} - -//===----------------------------------------------------------------------===// -// Methods related to PadTensor tiling. -//===----------------------------------------------------------------------===// - -SmallVector PadTensorOp::getDestinationOperands(OpBuilder &b) { - ReifiedRankedShapedTypeDims reifiedShapes; - (void)reifyResultShapes(b, reifiedShapes); - SmallVector mixedSizes = getAsOpFoldResult(reifiedShapes[0]); - Value initTensor = b.create(getLoc(), mixedSizes, - getResultType().getElementType()); - return {initTensor}; -} - -SmallVector PadTensorOp::getLoopIteratorTypes() { - SmallVector iteratorTypes(getResultType().getRank(), - getParallelIteratorTypeName()); - return iteratorTypes; -} - -SmallVector PadTensorOp::getIterationDomain(OpBuilder &b) { - ReifiedRankedShapedTypeDims reifiedShapes; - (void)reifyResultShapes(b, reifiedShapes); - Value zero = b.create(getLoc(), 0); - Value one = b.create(getLoc(), 1); - // Initialize all the ranges to {zero, one, one}. All the `ub`s are - // overwritten. 
- SmallVector loopRanges(reifiedShapes[0].size(), {zero, one, one}); - for (const auto &ub : enumerate(reifiedShapes[0])) - loopRanges[ub.index()].size = ub.value(); - return loopRanges; -} - -SmallVector PadTensorOp::getTiledImplementation( - OpBuilder &b, ValueRange dest, ArrayRef offsets, - ArrayRef sizes, bool /*tileDestOperands*/) { - // Only constant padding value supported. - Value padValue = getConstantPaddingValue(); - if (!padValue) - return {}; - - // Helper variables and functions for various arithmetic operations. These are - // used extensively for computing new offset/length and padding values. - Location loc = getLoc(); - AffineExpr dim0, dim1; - bindDims(b.getContext(), dim0, dim1); - // Add two integers. - auto addMap = AffineMap::get(2, 0, {dim0 + dim1}); - auto add = [&](Value v1, Value v2) { - return b.createOrFold(loc, addMap, ValueRange{v1, v2}); - }; - // Subtract two integers. - auto subMap = AffineMap::get(2, 0, {dim0 - dim1}); - auto sub = [&](Value v1, Value v2) { - return b.createOrFold(loc, subMap, ValueRange{v1, v2}); - }; - // Take the minimum of two integers. - auto idMap = AffineMap::getMultiDimIdentityMap(2, b.getContext()); - auto min = [&](Value v1, Value v2) { - return b.createOrFold(loc, idMap, ValueRange{v1, v2}); - }; - // Take the maximum of two integers. - auto max = [&](Value v1, Value v2) { - return b.createOrFold(loc, idMap, ValueRange{v1, v2}); - }; - // Zero index-typed integer. - auto zero = b.create(loc, 0); - - // Helper function for filling static/dynamic low/high padding indices vectors - // of PadTensorOp. - auto appendIndex = [&](Value val, SmallVector &dynIndices, - SmallVector &staticIndices) { - if (auto constInt = getConstantIntValue(val)) { - staticIndices.push_back(*constInt); - } else { - staticIndices.push_back(ShapedType::kDynamicSize); - dynIndices.push_back(val); - } - }; - - // Compute new offsets, lengths, low padding, high padding. - SmallVector newOffsets, newLengths, newStrides; - SmallVector newLows, newHighs; - SmallVector staticNewLows, staticNewHighs; - // Set to true if the original data source is not read at all. - bool hasZeroLen = false; - // Same as hasZeroLen, but for dynamic dimension sizes. This condition - // is true if the original data source turns out to be unused at runtime. - Value dynHasZeroLenCond; - - int64_t rank = getSourceType().getRank(); - for (unsigned dim = 0; dim < rank; ++dim) { - auto low = getValueOrCreateConstantIndexOp(b, loc, getMixedLowPad()[dim]); - bool hasLowPad = getConstantIntValue(low) != static_cast(0); - auto high = getValueOrCreateConstantIndexOp(b, loc, getMixedHighPad()[dim]); - bool hasHighPad = getConstantIntValue(high) != static_cast(0); - auto offset = getValueOrCreateConstantIndexOp(b, loc, offsets[dim]); - auto length = getValueOrCreateConstantIndexOp(b, loc, sizes[dim]); - auto srcSize = b.createOrFold(loc, source(), dim); - - // The new amount of low padding is `low - offset`. Except for the case - // where none of the low padding is read. In that case, the new amount of - // low padding is zero. - // - // Optimization: If low = 0, then newLow = 0. - Value newLow = hasLowPad ? max(zero, sub(low, offset)) : zero; - appendIndex(newLow, newLows, staticNewLows); - - // Start reading the data from position `offset - low`. Since the original - // read may have started in the low padding zone, this value could be - // negative. Therefore, start reading from: - // - // max(offset - low, 0) - // - // The original read could also have started in the high padding zone. 
- // In that case, set the offset to the end of source tensor. The new - // ExtractSliceOp length will be zero in that case. (Effectively reading no - // data from the source.) - // - // Optimization: If low = 0, then the formula can be simplified. - Value newOffset = hasLowPad ? min(max(sub(offset, low), zero), srcSize) - : min(offset, srcSize); - newOffsets.push_back(getAsOpFoldResult(newOffset)); - - // The original ExtractSliceOp was reading until position `offset + length`. - // Therefore, the corresponding position within the source tensor is: - // - // offset + length - low - // - // In case the original ExtractSliceOp stopped reading within the low - // padding zone, this value can be negative. In that case, the end position - // of the read should be zero. (Similar to newOffset.) - // - // The original read could also have stopped in the high padding zone. - // In that case, set the end positition of the read should be the end of the - // source tensor. (Similar to newOffset.) - // - // endLoc = min(max(offset - low + length, 0), srcSize) - // - // The new ExtractSliceOp length is `endLoc - newOffset`. - // - // Optimization: If low = 0, then the formula can be simplified. - Value endLoc = hasLowPad - ? min(max(add(sub(offset, low), length), zero), srcSize) - : min(add(offset, length), srcSize); - Value newLength = sub(endLoc, newOffset); - newLengths.push_back(getAsOpFoldResult(newLength)); - - // Check if newLength is zero. In that case, no SubTensorOp should be - // executed. - if (auto newLengthInt = getConstantIntValue(newLength)) { - hasZeroLen |= *newLengthInt == 0; - } else { - Value check = b.create(loc, arith::CmpIPredicate::eq, - newLength, zero); - dynHasZeroLenCond = - dynHasZeroLenCond - ? b.create(loc, check, dynHasZeroLenCond) - : check; - } - - // The amount of high padding is simply the number of elements remaining, - // so that the result has the same length as the original ExtractSliceOp. - // As an optimization, if the original high padding is zero, then the new - // high padding must also be zero. - Value newHigh = hasHighPad ? sub(sub(length, newLength), newLow) : zero; - appendIndex(newHigh, newHighs, staticNewHighs); - - // Only unit stride supported. - newStrides.push_back(b.getIndexAttr(1)); - } - - // The shape of the result can be obtained from the sizes passed in. - SmallVector dynDims; - SmallVector shape; - dispatchIndexOpFoldResults(sizes, dynDims, shape, ShapedType::kDynamicSize); - RankedTensorType resultType = - RankedTensorType::get(shape, getResultType().getElementType()); - - // Insert cast to ensure that types match. (May be folded away.) - auto castResult = [&](Value val) -> Operation * { - auto castOp = b.create(loc, resultType, val); - return castOp; - }; - - // In cases where the original data source is unused: Emit a GenerateOp and - // do not generate a SliceOp. (The result shape of the SliceOp would - // have a dimension of size 0, the semantics of which is unclear.) - auto createGenerateOp = [&]() { - // Create GenerateOp. - auto generateOp = b.create( - loc, resultType, dynDims, - [&](OpBuilder &builder, Location gLoc, ValueRange indices) { - builder.create(gLoc, padValue); - }); - return castResult(generateOp); - }; - - // Emit a SliceOp and a PadTensorOp. Should not be used in cases where - // the result shape of the new SliceOp has a zero dimension. - auto createPadTensorOfSubTensor = [&]() { - // Create pad_tensor(subtensor(x)). 
- auto newSliceOp = b.create( - loc, source(), newOffsets, newLengths, newStrides); - auto newPadTensorOp = b.create( - loc, newSliceOp, staticNewLows, staticNewHighs, newLows, newHighs); - - // Copy region to new PadTensorOp. - BlockAndValueMapping bvm; - region().cloneInto(&newPadTensorOp.getRegion(), bvm); - - // Cast result and return. - return castResult(newPadTensorOp); - }; - - // Rewrite subtensor(pad_tensor(x)) into a GenerateOp it is statically known - // that the original data source x is not used. - if (hasZeroLen) { - return {createGenerateOp()}; - } - - // If there are dynamic dimensions: Generate an scf.if check to avoid creating - // SliceOps with result dimensions of size 0 at runtime. - if (dynHasZeroLenCond) { - auto result = b.create( - loc, resultType, dynHasZeroLenCond, - /*thenBuilder=*/ - [&](OpBuilder &b, Location loc) { - b.create(loc, createGenerateOp()->getResult(0)); - }, - /*elseBuilder=*/ - [&](OpBuilder &b, Location loc) { - b.create(loc, - createPadTensorOfSubTensor()->getResult(0)); - }); - return {result}; - } - return {createPadTensorOfSubTensor()}; -} - -namespace { -// Folds linalg.pad_tensor when padding is static zeros and the attribute -// doesn't request otherwise. -struct FoldStaticZeroPadding : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(PadTensorOp padTensorOp, - PatternRewriter &rewriter) const override { - if (!padTensorOp.hasZeroLowPad() || !padTensorOp.hasZeroHighPad()) - return failure(); - if (padTensorOp.nofold()) - return failure(); - rewriter.replaceOpWithNewOp( - padTensorOp, padTensorOp.result().getType(), padTensorOp.source()); - return success(); - } -}; - -// Fold CastOp into PadTensorOp when adding static information. -struct FoldSourceTensorCast : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(PadTensorOp padTensorOp, - PatternRewriter &rewriter) const override { - auto castOp = padTensorOp.source().getDefiningOp(); - if (!tensor::canFoldIntoConsumerOp(castOp)) - return failure(); - - auto newResultType = PadTensorOp::inferResultType( - castOp.source().getType().cast(), - extractFromI64ArrayAttr(padTensorOp.static_low()), - extractFromI64ArrayAttr(padTensorOp.static_high()), - padTensorOp.getResultType().getShape()); - - if (newResultType == padTensorOp.getResultType()) { - rewriter.updateRootInPlace(padTensorOp, [&]() { - padTensorOp.sourceMutable().assign(castOp.source()); - }); - } else { - auto newOp = rewriter.create( - padTensorOp->getLoc(), newResultType, padTensorOp.source(), - padTensorOp.low(), padTensorOp.high(), padTensorOp.static_low(), - padTensorOp.static_high(), padTensorOp.nofold()); - BlockAndValueMapping mapper; - padTensorOp.getRegion().cloneInto(&newOp.getRegion(), mapper); - - rewriter.replaceOpWithNewOp( - padTensorOp, padTensorOp.getResultType(), newOp); - } - return success(); - } -}; - -// Fold CastOp using the result of PadTensorOp back into the latter if it adds -// static information. 
-struct FoldTargetTensorCast : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(PadTensorOp padTensorOp, - PatternRewriter &rewriter) const override { - if (!padTensorOp.result().hasOneUse()) - return failure(); - auto tensorCastOp = - dyn_cast(*padTensorOp->getUsers().begin()); - if (!tensorCastOp) - return failure(); - if (!tensor::preservesStaticInformation(padTensorOp.result().getType(), - tensorCastOp.dest().getType())) - return failure(); - - auto replacementOp = rewriter.create( - padTensorOp.getLoc(), tensorCastOp.dest().getType(), - padTensorOp.source(), padTensorOp.low(), padTensorOp.high(), - padTensorOp.static_low(), padTensorOp.static_high(), - padTensorOp.nofold()); - replacementOp.region().takeBody(padTensorOp.region()); - - rewriter.replaceOp(padTensorOp, replacementOp.result()); - rewriter.replaceOp(tensorCastOp, replacementOp.result()); - return success(); - } -}; -} // namespace - -void PadTensorOp::getCanonicalizationPatterns(RewritePatternSet &results, - MLIRContext *context) { - results.add(context); - results.add(context); -} - -/// Return the padding value of the PadTensorOp if it constant. In this context, -/// "constant" means an actual constant or "defined outside of the block". -/// -/// Values are considered constant in three cases: -/// - A ConstantLike value. -/// - A basic block argument from a different block. -/// - A value defined outside of the block. -/// -/// If the padding value is not constant, an empty Value is returned. -Value PadTensorOp::getConstantPaddingValue() { - auto yieldOp = dyn_cast(getRegion().front().getTerminator()); - if (!yieldOp || yieldOp.values().size() != 1) - return {}; - Value padValue = yieldOp.values().front(); - // Check if yield value is a constant. - if (matchPattern(padValue, m_Constant())) - return padValue; - // Check if yield value is defined inside the PadTensorOp block. - if (padValue.getParentBlock() == &getRegion().front()) - return {}; - // Else: Yield value defined outside of the PadTensorOp block. - return padValue; -} - -OpFoldResult PadTensorOp::fold(ArrayRef) { - if (getResultType().hasStaticShape() && getResultType() == getSourceType() && - !nofold()) - return source(); - return {}; -} - //===----------------------------------------------------------------------===// // YieldOp //===----------------------------------------------------------------------===// @@ -1682,16 +1127,6 @@ if (auto linalgOp = dyn_cast(parentOp)) return verifyYield(op, cast(parentOp)); - if (auto padTensorOp = dyn_cast(parentOp)) { - if (op.getNumOperands() != 1) - return op.emitOpError("expected single yield operand (got ") - << op->getNumOperands() << ")"; - if (op.getOperand(0).getType() != - padTensorOp.getType().cast().getElementType()) - return op.emitOpError("expected yield type to match shape element type"); - return success(); - } - if (auto tiledLoopOp = dyn_cast(parentOp)) { // Check if output args with tensor types match results types. 
SmallVector tensorOuts; diff --git a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Bufferize.cpp @@ -320,7 +320,7 @@ target.addLegalDialect(); - target.addIllegalOp(); @@ -363,5 +363,5 @@ VectorTransferWriteOpConverter >(typeConverter, patterns.getContext()); // clang-format on - patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp @@ -66,7 +66,7 @@ static void applyEnablingTransformations(ModuleOp moduleOp) { RewritePatternSet patterns(moduleOp.getContext()); - patterns.add(moduleOp.getContext()); + patterns.add(moduleOp.getContext()); (void)applyPatternsAndFoldGreedily(moduleOp, std::move(patterns)); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/HoistPadding.cpp @@ -1,4 +1,5 @@ -//===- HoistPadding.cpp - Hoisting transformation for PadTensorOp ---------===// +//===- HoistPadding.cpp - Hoisting transformation for tensor::PadOp +//---------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -37,7 +38,7 @@ using namespace mlir; using namespace mlir::linalg; -/// Analysis class to support PadTensorOp hoisting across multiple enclosing +/// Analysis class to support tensor::PadOp hoisting across multiple enclosing /// loops. The failure conditions are: /// 1. Pad op has a use that is not an input of a LinalgOp. /// 2. Pad op does not have a constant padding value. @@ -53,7 +54,7 @@ /// 8. There is no enclosing scf::ForOp that indexes the padded data. /// Other cases succeed and will trigger hoisting of the pad op. struct HoistingAnalysis { - HoistingAnalysis(PadTensorOp padTensorOp, int numLoops); + HoistingAnalysis(tensor::PadOp padTensorOp, int numLoops); bool isValid() { return valid; } @@ -98,7 +99,7 @@ /// ``` /// dropNonIndexDependencies(%padded_slice, %slice) /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice. - LogicalResult dropNonIndexDependencies(PadTensorOp padTensorOp, + LogicalResult dropNonIndexDependencies(tensor::PadOp padTensorOp, tensor::ExtractSliceOp sliceOp); /// Encodes whether the analysis is valid and hoisting can proceed. @@ -107,7 +108,7 @@ /// Return true if all uses of `padTensorOp` are an input tensor of some /// LinalgOp. -static bool isOnlyUsedAsInputOfLinalgOp(PadTensorOp padTensorOp) { +static bool isOnlyUsedAsInputOfLinalgOp(tensor::PadOp padTensorOp) { for (OpOperand &use : padTensorOp.result().getUses()) { auto linalgUser = dyn_cast(use.getOwner()); if (!linalgUser || !linalgUser.isInputTensor(&use)) { @@ -126,7 +127,7 @@ /// Multi-loops such as scf.parallel or linalg.tiled_loop are not modeled atm. /// Control-flow and other containing ops with regions are not modeled atm. 
static void -getAtMostNEnclosingLoops(PadTensorOp padTensorOp, int nLevels, +getAtMostNEnclosingLoops(tensor::PadOp padTensorOp, int nLevels, SmallVector &reverseEnclosingLoops) { AsmState state(padTensorOp->getParentOfType()); (void)state; @@ -143,7 +144,7 @@ } } -HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int numLoops) { +HoistingAnalysis::HoistingAnalysis(tensor::PadOp padTensorOp, int numLoops) { valid = false; // Bail on any use that isn't an input of a Linalg op. @@ -232,7 +233,7 @@ } LogicalResult -HoistingAnalysis::dropNonIndexDependencies(PadTensorOp padTensorOp, +HoistingAnalysis::dropNonIndexDependencies(tensor::PadOp padTensorOp, tensor::ExtractSliceOp sliceOp) { // Set of all values used for index computation. SetVector indexEdges; @@ -373,9 +374,9 @@ ValueRange{ivVal, lbVal, stepVal}); } -FailureOr mlir::linalg::hoistPaddingOnTensors(PadTensorOp opToHoist, +FailureOr mlir::linalg::hoistPaddingOnTensors(tensor::PadOp opToHoist, int numLoops, - PadTensorOp &hoistedOp) { + tensor::PadOp &hoistedOp) { LLVM_DEBUG(DBGS() << "Try to hoist " << *(opToHoist) << " by " << numLoops << " loops\n"); HoistingAnalysis analysis(opToHoist, numLoops); @@ -399,7 +400,7 @@ // Create the packed tensor into which we amortize // padding. SmallVector packedShape(nPackedLoops, ShapedType::kDynamicSize); - // TODO: go grab dims when necessary, for now PadTensorOp returns a static + // TODO: go grab dims when necessary, for now tensor::PadOp returns a static // tensor. llvm::append_range(packedShape, paddedTensorType.getShape()); auto packedTensorType = @@ -463,7 +464,7 @@ // sizes = [1 .. 1, paddedShape]. SmallVector sizes(nPackedLoops, b.getIndexAttr(1)); for (int64_t sz : paddedTensorType.getShape()) { - // TODO: go grab dims when necessary, for now PadTensorOp returns a static + // TODO: go grab dims when necessary, for now tensor::PadOp returns a static // tensor. assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes"); sizes.push_back(b.getIndexAttr(sz)); @@ -506,6 +507,7 @@ loc, opToHoist.getResultType(), packedTensor, offsets, sizes, strides); // Make the newly cloned `opToHoist` available to the caller. - hoistedOp = cast(bvm.lookup(opToHoist.result()).getDefiningOp()); + hoistedOp = + cast(bvm.lookup(opToHoist.result()).getDefiningOp()); return newResult; } diff --git a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/LinalgStrategyPasses.cpp @@ -100,7 +100,7 @@ filter); else tilingPattern.add(ctx, options, filter); - if (anchorOpName == linalg::PadTensorOp::getOperationName()) + if (anchorOpName == tensor::PadOp::getOperationName()) populatePadTensorTilingPatterns(tilingPattern, options); (void)applyPatternsAndFoldGreedily(funcOp, std::move(tilingPattern)); } @@ -302,12 +302,12 @@ std::move(vectorizationPatterns)); // Apply the pad tensor op vectorization separately to avoid running the - // GenericPadTensorOpVectorizationPattern too early. + // GenericPadOpVectorizationPattern too early. // TODO: Improve once we have better infrastructure to control pattern // application. 
if (vectorizePadding) { RewritePatternSet patterns(funcOp.getContext()); - linalg::populatePadTensorOpVectorizationPatterns(patterns); + linalg::populatePadOpVectorizationPatterns(patterns); (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns)); } } diff --git a/mlir/lib/Dialect/Linalg/Transforms/PadOpInterchange.cpp b/mlir/lib/Dialect/Linalg/Transforms/PadOpInterchange.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/PadOpInterchange.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/PadOpInterchange.cpp @@ -38,9 +38,9 @@ /// ``` /// /// if the `linalg.generic` has all parallel iterator types. -struct FusePadTensorOp : OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(PadTensorOp padOp, +struct FusePadOp : OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(tensor::PadOp padOp, PatternRewriter &rewriter) const override { // Only works on padding op that sets the padded value to a constant. Value padValue = padOp.getConstantPaddingValue(); @@ -61,7 +61,10 @@ padOp, "only supported for ops with all parallel iterator types"); } ReifiedRankedShapedTypeDims resultShape; - if (failed(padOp.reifyResultShapes(rewriter, resultShape)) || + ReifyRankedShapedTypeOpInterface reifyShapedTypeInterface = + dyn_cast(padOp.getOperation()); + if (failed(reifyShapedTypeInterface.reifyResultShapes(rewriter, + resultShape)) || resultShape.size() != 1) { return rewriter.notifyMatchFailure( padOp, "failed to get shape of pad op result"); @@ -118,5 +121,5 @@ void mlir::linalg::populateFusePadTensorWithProducerLinalgOpPatterns( RewritePatternSet &patterns) { - patterns.add(patterns.getContext()); + patterns.add(patterns.getContext()); } diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -338,18 +338,18 @@ return failure(); } -/// Generate a loop nest around a given PadTensorOp (for tiling). `newPadOp` -/// and `loopNest` are output parameters that return the new (tiled) PadTensorOp -/// and the loop nest. -static LogicalResult tilePadTensorOp(RewriterBase &builder, PadTensorOp op, - PadTensorOp &newPadOp, LoopNest &loopNest, - const LinalgTilingOptions &options) { +/// Generate a loop nest around a given tensor::PadOp (for tiling). `newPadOp` +/// and `loopNest` are output parameters that return the new (tiled) +/// tensor::PadOp and the loop nest. +static LogicalResult tilePadOp(RewriterBase &builder, tensor::PadOp op, + tensor::PadOp &newPadOp, LoopNest &loopNest, + const LinalgTilingOptions &options) { Location loc = op.getLoc(); OpBuilder::InsertionGuard g(builder); builder.setInsertionPoint(op); - // Clone PadTensorOp so that the existing op can be replaced more easily. - newPadOp = cast(builder.clone(*op.getOperation())); + // Clone tensor::PadOp so that the existing op can be replaced more easily. + newPadOp = cast(builder.clone(*op.getOperation())); // Get rank and tile sizes. int64_t rank = op.getResultType().getRank(); SmallVector tileSizes = @@ -358,7 +358,9 @@ Value zero = builder.create(loc, 0); tileSizes.append(rank - tileSizes.size(), zero); // Compute lower and upper bounds of the loop nest. 
- SmallVector ranges = op.getIterationDomain(builder); + TilingInterface tilingInterface = + dyn_cast(op.getOperation()); + SmallVector ranges = tilingInterface.getIterationDomain(builder); SmallVector lbs, dims, allDims, steps; for (int64_t i = 0; i < rank; ++i) { allDims.push_back(ranges[i].size); @@ -369,7 +371,8 @@ } } // Generate loop nest: One loop per dimension. - SmallVector destOperand = op.getDestinationOperands(builder); + SmallVector destOperand = + tilingInterface.getDestinationOperands(builder); loopNest = mlir::scf::buildLoopNest( builder, loc, lbs, /*ubs=*/dims, steps, ValueRange(destOperand), [&](OpBuilder &b, Location loc, ValueRange localIvs, @@ -379,8 +382,8 @@ computeTileOffsets(b, loc, localIvs, tileSizes); SmallVector sizes = computeTileSizes(b, loc, localIvs, tileSizes, allDims); - // Create ExtractSliceOp: Extract a tile from the PadTensorOp. - // Note: The PadTensorOp is located outside of the loop nest. It is + // Create ExtractSliceOp: Extract a tile from the tensor::PadOp. + // Note: The tensor::PadOp is located outside of the loop nest. It is // later moved inside by ExtractSliceOfPadTensorSwapPattern. auto map = AffineMap::getMultiDimIdentityMap(rank, b.getContext()); Value tiledOutput = @@ -399,21 +402,21 @@ } namespace { -struct PadTensorOpTilingPattern : public OpRewritePattern { - PadTensorOpTilingPattern(MLIRContext *ctx, LinalgTilingOptions opt) - : OpRewritePattern(ctx), options(std::move(opt)) {} +struct PadOpTilingPattern : public OpRewritePattern { + PadOpTilingPattern(MLIRContext *ctx, LinalgTilingOptions opt) + : OpRewritePattern(ctx), options(std::move(opt)) {} - LogicalResult matchAndRewrite(PadTensorOp op, + LogicalResult matchAndRewrite(tensor::PadOp op, PatternRewriter &rewriter) const override { if (op->hasAttr(LinalgTransforms::kLinalgTransformMarker)) return failure(); - PadTensorOp newPadOp; + tensor::PadOp newPadOp; LoopNest loopNest; - if (failed(tilePadTensorOp(rewriter, op, newPadOp, loopNest, options))) + if (failed(tilePadOp(rewriter, op, newPadOp, loopNest, options))) return failure(); newPadOp->setAttr(LinalgTransforms::kLinalgTransformMarker, rewriter.getUnitAttr()); - // Replace all uses of the original PadTensorOp. + // Replace all uses of the original tensor::PadOp. rewriter.replaceOp(op, loopNest.getResults()[0]); return success(); } @@ -470,7 +473,7 @@ tensor::InsertSliceOp::getCanonicalizationPatterns(patterns, ctx); InitTensorOp::getCanonicalizationPatterns(patterns, ctx); - PadTensorOp::getCanonicalizationPatterns(patterns, ctx); + tensor::PadOp::getCanonicalizationPatterns(patterns, ctx); ctx->getLoadedDialect()->getCanonicalizationPatterns(patterns); CanonicalizationPatternList< @@ -489,13 +492,13 @@ #define GET_OP_LIST #include "mlir/Dialect/Linalg/IR/LinalgStructuredOps.cpp.inc" >::insert(patterns, options, f); - patterns.add(ctx, options); + patterns.add(ctx, options); } void mlir::linalg::populatePadTensorTilingPatterns( RewritePatternSet &patterns, const LinalgTilingOptions &options) { auto *ctx = patterns.getContext(); - patterns.add(ctx, options); + patterns.add(ctx, options); } static void applyExtractSliceOfPadTensorSwapPattern(FuncOp funcOp) { diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -160,9 +160,9 @@ /// Helper function that tries to pad `opOperand`. 
Exit early for scalar /// operands, if `paddingFunc` returns failure, or if `opOperand` is not defined /// by an ExtractSliceOp. Otherwise, try to pad the operand even if it already -/// has a static shape. Set `result` to the result of the created PadTensorOp or -/// and return success if the operand either has been padded to a static shape -/// or already had a static shape and failure otherwise. +/// has a static shape. Set `result` to the result of the created tensor::PadOp +/// and return success if the operand either has been padded to a static +/// shape or already had a static shape and failure otherwise. static LogicalResult padOperandToSmallestStaticBoundingBox( OpBuilder &b, linalg::LinalgOp opToPad, OpOperand *opOperand, const PaddingValueComputationFunction &paddingFunc, @@ -528,10 +528,10 @@ // Hoist the padding. for (const auto &en : enumerate(depths)) { OpOperand &opOperand = paddedOp->getOpOperand(en.index()); - auto padTensorOp = opOperand.get().getDefiningOp(); + auto padTensorOp = opOperand.get().getDefiningOp(); if (!padTensorOp || en.value() == 0) continue; - PadTensorOp hoistedOp; + tensor::PadOp hoistedOp; FailureOr newResult = hoistPaddingOnTensors(padTensorOp, en.value(), hoistedOp); if (failed(newResult)) @@ -749,10 +749,11 @@ return SmallVector(nParallelLoops, getParallelIteratorTypeName()); } -/// Rewrite a PadTensorOp into a sequence of InitTensorOp, FillOp (to +/// Rewrite a tensor::PadOp into a sequence of InitTensorOp, FillOp (to /// initialize with pad_val) and GenericOp (to copy contents). -LogicalResult PadTensorOpTransformationPattern::matchAndRewrite( - linalg::PadTensorOp padOp, PatternRewriter &rewriter) const { +LogicalResult +PadOpTransformationPattern::matchAndRewrite(tensor::PadOp padOp, + PatternRewriter &rewriter) const { auto inputShapedType = padOp.source().getType().cast(); auto resultShapedType = padOp.result().getType().cast(); @@ -767,9 +768,8 @@ // 1. A BBarg from a different block. // 2. A value defined outside of the current block. Block &block = padOp.region().front(); - auto yieldOp = cast(block.getTerminator()); - assert(yieldOp.getNumOperands() == 1 && "expected single operand yield"); - Value padValue = yieldOp.values().front(); + auto yieldOp = cast(block.getTerminator()); + Value padValue = yieldOp.value(); Operation *definingOp = padValue.getDefiningOp(); if (definingOp && definingOp->getBlock() == &block) return failure(); @@ -812,8 +812,8 @@ /// Filling `dest` using FillOp constant padding value if possible. /// Otherwise, generate a tensor::GenerateOp. -Value GeneralizePadTensorOpPattern::createFillOrGenerateOp( - PatternRewriter &rewriter, PadTensorOp padOp, Value dest, +Value GeneralizePadOpPattern::createFillOrGenerateOp( + PatternRewriter &rewriter, tensor::PadOp padOp, Value dest, const SmallVector &dynSizes) const { auto padValue = padOp.getConstantPaddingValue(); if (padValue) @@ -825,20 +825,12 @@ // Copy region to new op. BlockAndValueMapping bvm; padOp.region().cloneInto(&generateOp.getRegion(), bvm); - // Rewrite linalg::YieldOp to tensor::YieldOp.
- OpBuilder::InsertionGuard guard(rewriter); - auto yieldOp = - dyn_cast(generateOp.getRegion().front().getTerminator()); - assert(yieldOp && "malformed PadTensorOp: expected YieldOp terminator"); - assert(yieldOp.values().size() == 1); - rewriter.setInsertionPoint(yieldOp); - rewriter.replaceOpWithNewOp(yieldOp, yieldOp.values()[0]); return generateOp; } LogicalResult -GeneralizePadTensorOpPattern::matchAndRewrite(PadTensorOp padOp, - PatternRewriter &rewriter) const { +GeneralizePadOpPattern::matchAndRewrite(tensor::PadOp padOp, + PatternRewriter &rewriter) const { // Given an OpFoldResult, return an index-typed value. auto getIdxValue = [&](OpFoldResult ofr) { if (auto val = ofr.dyn_cast()) @@ -877,10 +869,10 @@ if (optimizeCopyFn && optimizeCopyFn(rewriter, padOp, fill).succeeded()) return success(); - // PadTensorOps cannot be optimized. Generate a InsertSliceOp instead + // tensor::PadOps cannot be optimized. Generate a InsertSliceOp instead // for copying the PadOp source. auto sourceType = padOp.getSourceType(); - // Compute size of source of PadTensorOp. + // Compute size of source of tensor::PadOp. SmallVector srcSizes; for (unsigned dim = 0; dim < sourceType.getRank(); ++dim) { if (sourceType.isDynamicDim(dim)) { @@ -901,15 +893,17 @@ LogicalResult ExtractSliceOfPadTensorSwapPattern::matchAndRewrite( tensor::ExtractSliceOp sliceOp, PatternRewriter &rewriter) const { - auto padOp = sliceOp.source().getDefiningOp(); + auto padOp = sliceOp.source().getDefiningOp(); if (!padOp) return failure(); // Only unit stride supported. if (!sliceOp.hasUnitStride()) return failure(); + TilingInterface tilingInterface = + dyn_cast(padOp.getOperation()); Operation *tiledPadOp = - padOp + tilingInterface .getTiledImplementation( rewriter, /*dest=*/ValueRange{}, sliceOp.getMixedOffsets(), sliceOp.getMixedSizes(), /*tileDestOperands=*/false) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -682,20 +682,19 @@ return result; } -/// Rewrite a PadTensorOp into a sequence of InitTensorOp, FillOp and +/// Rewrite a tensor::PadOp into a sequence of InitTensorOp, FillOp and /// InsertSliceOp. For now, only constant padding values are supported. /// If there is enough static type information, TransferReadOps and /// TransferWriteOps may be generated instead of InsertSliceOps. -struct GenericPadTensorOpVectorizationPattern - : public GeneralizePadTensorOpPattern { - GenericPadTensorOpVectorizationPattern(MLIRContext *context, - PatternBenefit benefit = 1) - : GeneralizePadTensorOpPattern(context, tryVectorizeCopy, benefit) {} - /// Vectorize the copying of a PadTensorOp's source. This is possible if +struct GenericPadOpVectorizationPattern : public GeneralizePadOpPattern { + GenericPadOpVectorizationPattern(MLIRContext *context, + PatternBenefit benefit = 1) + : GeneralizePadOpPattern(context, tryVectorizeCopy, benefit) {} + /// Vectorize the copying of a tensor::PadOp's source. This is possible if /// each dimension size is statically know in the source type or the result /// type (or both). 
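+ /// Illustrative sketch (assumed shapes, not from this patch): padding a
+ /// tensor<?x?xf32> source to a tensor<17x5xf32> result can be copied by a
+ /// vector.transfer_read of the source that uses the constant pad value for
+ /// the out-of-bounds lanes, followed by a vector.transfer_write into the
+ /// filled destination tensor.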
static LogicalResult tryVectorizeCopy(PatternRewriter &rewriter, - PadTensorOp padOp, Value dest) { + tensor::PadOp padOp, Value dest) { auto sourceType = padOp.getSourceType(); auto resultType = padOp.getResultType(); @@ -767,13 +766,13 @@ } }; -/// Base pattern for rewriting PadTensorOps whose result is consumed by a +/// Base pattern for rewriting tensor::PadOps whose result is consumed by a /// given operation type OpTy. template -struct VectorizePadTensorOpUserPattern : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +struct VectorizePadOpUserPattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(PadTensorOp padOp, + LogicalResult matchAndRewrite(tensor::PadOp padOp, PatternRewriter &rewriter) const final { bool changed = false; // Insert users in vector, because some users may be replaced/removed. @@ -785,10 +784,10 @@ protected: virtual LogicalResult rewriteUser(PatternRewriter &rewriter, - PadTensorOp padOp, OpTy op) const = 0; + tensor::PadOp padOp, OpTy op) const = 0; }; -/// Rewrite use of PadTensorOp result in TransferReadOp. E.g.: +/// Rewrite use of tensor::PadOp result in TransferReadOp. E.g.: /// ``` /// %0 = linalg.pad_tensor %src ... : tensor to tensor<17x5xf32> /// %r = vector.transfer_read %0[%c0, %c0], %cst @@ -807,12 +806,12 @@ /// - `xferOp` has no out-of-bounds dims or mask. /// - Low padding is static 0. /// - Single, scalar padding value. -struct PadTensorOpVectorizationWithTransferReadPattern - : public VectorizePadTensorOpUserPattern { - using VectorizePadTensorOpUserPattern< - vector::TransferReadOp>::VectorizePadTensorOpUserPattern; +struct PadOpVectorizationWithTransferReadPattern + : public VectorizePadOpUserPattern { + using VectorizePadOpUserPattern< + vector::TransferReadOp>::VectorizePadOpUserPattern; - LogicalResult rewriteUser(PatternRewriter &rewriter, PadTensorOp padOp, + LogicalResult rewriteUser(PatternRewriter &rewriter, tensor::PadOp padOp, vector::TransferReadOp xferOp) const override { // Low padding must be static 0. if (!padOp.hasZeroLowPad()) @@ -837,7 +836,7 @@ } }; -/// Rewrite use of PadTensorOp result in TransferWriteOp. +/// Rewrite use of tensor::PadOp result in TransferWriteOp. /// This pattern rewrites TransferWriteOps that write to a padded tensor /// value, where the same amount of padding is immediately removed again after /// the write. In such cases, the TransferWriteOp can write to the non-padded @@ -869,12 +868,12 @@ /// ExtractSliceOp trims the same amount of padding that was added /// beforehand. /// - Single, scalar padding value. -struct PadTensorOpVectorizationWithTransferWritePattern - : public VectorizePadTensorOpUserPattern { - using VectorizePadTensorOpUserPattern< - vector::TransferWriteOp>::VectorizePadTensorOpUserPattern; +struct PadOpVectorizationWithTransferWritePattern + : public VectorizePadOpUserPattern { + using VectorizePadOpUserPattern< + vector::TransferWriteOp>::VectorizePadOpUserPattern; - LogicalResult rewriteUser(PatternRewriter &rewriter, PadTensorOp padOp, + LogicalResult rewriteUser(PatternRewriter &rewriter, tensor::PadOp padOp, vector::TransferWriteOp xferOp) const override { // TODO: support 0-d corner case. if (xferOp.getTransferRank() == 0) @@ -925,7 +924,7 @@ /// sizes may turn out to be equal at runtime. 
bool hasSameTensorSize(Value beforePadding, tensor::ExtractSliceOp afterTrimming) const { - // If the input to PadTensorOp is a CastOp, try with with both CastOp + // If the input to tensor::PadOp is a CastOp, try with with both CastOp // result and CastOp operand. if (auto castOp = beforePadding.getDefiningOp()) if (hasSameTensorSize(castOp.source(), afterTrimming)) @@ -1000,7 +999,7 @@ } }; -/// Rewrite use of PadTensorOp result in InsertSliceOp. E.g.: +/// Rewrite use of tensor::PadOp result in InsertSliceOp. E.g.: /// ``` /// %0 = linalg.pad_tensor %src ... : tensor to tensor<17x5xf32> /// %r = tensor.insert_slice %0 @@ -1023,12 +1022,12 @@ /// - Only unit strides in `insertOp`. /// - Single, scalar padding value. /// - `padOp` result not used as destination. -struct PadTensorOpVectorizationWithInsertSlicePattern - : public VectorizePadTensorOpUserPattern { - using VectorizePadTensorOpUserPattern< - tensor::InsertSliceOp>::VectorizePadTensorOpUserPattern; +struct PadOpVectorizationWithInsertSlicePattern + : public VectorizePadOpUserPattern { + using VectorizePadOpUserPattern< + tensor::InsertSliceOp>::VectorizePadOpUserPattern; - LogicalResult rewriteUser(PatternRewriter &rewriter, PadTensorOp padOp, + LogicalResult rewriteUser(PatternRewriter &rewriter, tensor::PadOp padOp, tensor::InsertSliceOp insertOp) const override { // Low padding must be static 0. if (!padOp.hasZeroLowPad()) @@ -1087,14 +1086,14 @@ } }; -void mlir::linalg::populatePadTensorOpVectorizationPatterns( +void mlir::linalg::populatePadOpVectorizationPatterns( RewritePatternSet &patterns, PatternBenefit baseBenefit) { - patterns.add(patterns.getContext(), - baseBenefit); + patterns.add(patterns.getContext(), + baseBenefit); // Try these specialized patterns first before resorting to the generic one. - patterns.add( + patterns.add( patterns.getContext(), baseBenefit.getBenefit() + 1); } diff --git a/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt b/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt --- a/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt +++ b/mlir/lib/Dialect/Linalg/Utils/CMakeLists.txt @@ -12,5 +12,6 @@ MLIRSCF MLIRPass MLIRStandard + MLIRTensorUtils MLIRTransformUtils ) diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp --- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp @@ -23,6 +23,7 @@ #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/Dialect/StandardOps/Utils/Utils.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Dialect/Tensor/Utils/Utils.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineExprVisitor.h" @@ -328,7 +329,7 @@ // Exit if `source` is not defined by an ExtractSliceOp. auto sliceOp = source.getDefiningOp(); if (!sliceOp) - return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b); + return tensor::createPadHighOp(type, source, pad, nofold, loc, b); // Search the `source` use-def chain for padded LinalgOps. Value current = sliceOp.source(); @@ -339,22 +340,22 @@ OpResult opResult = current.cast(); current = linalgOp.getOutputOperand(opResult.getResultNumber())->get(); } - auto padTensorOp = current ? current.getDefiningOp() : nullptr; + auto padTensorOp = current ? current.getDefiningOp() : nullptr; - // Exit if the search fails to match a PadTensorOp at the end of the matched + // Exit if the search fails to match a tensor::PadOp at the end of the matched // LinalgOp sequence. 
if (!padTensorOp) - return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b); + return tensor::createPadHighOp(type, source, pad, nofold, loc, b); // Exit if the padded result type does not match. if (sliceOp.source().getType() != type) - return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b); + return tensor::createPadHighOp(type, source, pad, nofold, loc, b); // Exit if the LinalgOps are not high padded. if (llvm::any_of(padTensorOp.getMixedLowPad(), [](OpFoldResult ofr) { return getConstantIntValue(ofr) != static_cast(0); })) - return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b); + return tensor::createPadHighOp(type, source, pad, nofold, loc, b); // Exit if `padTensorOpSliceOp`, which defines the slice used by // `padTensorOp`, is rank-reducing. @@ -362,7 +363,7 @@ padTensorOp.source().getDefiningOp(); if (!padTensorOpSliceOp || sliceOp.getMixedSizes().size() != padTensorOpSliceOp.getMixedSizes().size()) - return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b); + return tensor::createPadHighOp(type, source, pad, nofold, loc, b); // Exit if the sizes of the dynamic sizes of `sliceOp` do not match the size // of the slice padded by `padTensorOp`. @@ -372,7 +373,7 @@ return !isEqualConstantIntOrValue(std::get<0>(it), std::get<1>(it)); })) - return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b); + return tensor::createPadHighOp(type, source, pad, nofold, loc, b); // Exit if the padding values do not match. Attribute padTensorOpPadAttr, padAttr; @@ -380,7 +381,7 @@ if (!padTensorOpPad || !matchPattern(padTensorOpPad, m_Constant(&padTensorOpPadAttr)) || !matchPattern(pad, m_Constant(&padAttr)) || padTensorOpPadAttr != padAttr) - return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b); + return tensor::createPadHighOp(type, source, pad, nofold, loc, b); // Return the padded result if the padding values and sizes match. 
return sliceOp.source(); diff --git a/mlir/lib/Dialect/Tensor/CMakeLists.txt b/mlir/lib/Dialect/Tensor/CMakeLists.txt --- a/mlir/lib/Dialect/Tensor/CMakeLists.txt +++ b/mlir/lib/Dialect/Tensor/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(IR) add_subdirectory(Transforms) +add_subdirectory(Utils) diff --git a/mlir/lib/Dialect/Tensor/IR/CMakeLists.txt b/mlir/lib/Dialect/Tensor/IR/CMakeLists.txt --- a/mlir/lib/Dialect/Tensor/IR/CMakeLists.txt +++ b/mlir/lib/Dialect/Tensor/IR/CMakeLists.txt @@ -2,6 +2,7 @@ TensorDialect.cpp TensorInferTypeOpInterfaceImpl.cpp TensorOps.cpp + TensorTilingInterfaceImpl.cpp ) add_mlir_dialect_library(MLIRTensor @@ -43,3 +44,20 @@ MLIRSupport MLIRTensor ) + +add_mlir_dialect_library(MLIRTensorTilingInterfaceImpl + TensorTilingInterfaceImpl.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Tensor + + LINK_LIBS PUBLIC + MLIRAffine + MLIRIR + MLIRLinalg + MLIRSCF + MLIRStandard + MLIRSupport + MLIRTensor + MLIRTilingInterface + ) diff --git a/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp --- a/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorInferTypeOpInterfaceImpl.cpp @@ -161,6 +161,48 @@ } }; +namespace { + +struct ReifyPadOp + : public ReifyRankedShapedTypeOpInterface::ExternalModel { + LogicalResult + reifyResultShapes(Operation *op, OpBuilder &b, + ReifiedRankedShapedTypeDims &reifiedReturnShapes) const { + auto padOp = cast(op); + Location loc = padOp.getLoc(); + auto lowPad = padOp.getMixedLowPad(); + auto highPad = padOp.getMixedHighPad(); + SmallVector shapes; + for (auto dim : llvm::seq(0, padOp.getSourceType().getRank())) { + // Shape along each dimension is source dim + low pad + high pad. + SmallVector mapOperands; + mapOperands.push_back( + b.createOrFold(loc, padOp.source(), dim)); + AffineExpr expr = b.getAffineDimExpr(0); + unsigned numSymbols = 0; + auto addOpFoldResult = [&](OpFoldResult valueOrAttr) { + if (Value v = valueOrAttr.dyn_cast()) { + expr = expr + b.getAffineSymbolExpr(numSymbols++); + mapOperands.push_back(v); + return; + } + int64_t staticValue = + valueOrAttr.get().cast().getInt(); + expr = expr + staticValue; + }; + addOpFoldResult(lowPad[dim]); + addOpFoldResult(highPad[dim]); + shapes.push_back(applyMapToValues( + b, loc, AffineMap::get(1, numSymbols, expr), mapOperands)[0]); + } + reifiedReturnShapes.emplace_back(std::move(shapes)); + return success(); + } +}; + +} // namespace + void mlir::tensor::registerInferTypeOpInterfaceExternalModels( DialectRegistry ®istry) { registry @@ -169,4 +211,5 @@ registry .addOpInterface>(); + registry.addOpInterface(); } diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -476,6 +476,7 @@ // Ensure that the region yields an element of the right type. auto yieldOp = llvm::cast(op.body().getBlocks().front().getTerminator()); + if (yieldOp.value().getType() != resultTy.getElementType()) return op.emitOpError( "body must be terminated with a `yield` operation of the tensor " @@ -1481,6 +1482,258 @@ sizes, strides); } +//===----------------------------------------------------------------------===// +// PadOp +//===----------------------------------------------------------------------===// + +// TODO: Replace custom directive with AllTypesMatch as soon as it +// supports optional types. 
+void printInferType(OpAsmPrinter &printer, Operation *op, Value optOperand, + Type typeToInfer, Type typeToInferFrom) {} + +ParseResult parseInferType(OpAsmParser &parser, + Optional optOperand, + Type &typeToInfer, Type typeToInferFrom) { + if (optOperand) + typeToInfer = typeToInferFrom; + return success(); +} + +static LogicalResult verify(PadOp op) { + auto sourceType = op.source().getType().cast(); + auto resultType = op.result().getType().cast(); + auto expectedType = PadOp::inferResultType( + sourceType, extractFromI64ArrayAttr(op.static_low()), + extractFromI64ArrayAttr(op.static_high())); + for (int i = 0, e = sourceType.getRank(); i < e; ++i) { + if (resultType.getDimSize(i) == expectedType.getDimSize(i)) + continue; + if (expectedType.isDynamicDim(i)) + continue; + return op.emitError("specified type ") + << resultType << " does not match the inferred type " + << expectedType; + } + + auto ®ion = op.region(); + unsigned rank = resultType.getRank(); + Block &block = region.front(); + if (block.getNumArguments() != rank) + return op.emitError("expected the block to have ") << rank << " arguments"; + + // Note: the number and type of yield values are checked in the YieldOp. + for (const auto &en : llvm::enumerate(block.getArgumentTypes())) { + if (!en.value().isIndex()) + return op.emitOpError("expected block argument ") + << (en.index() + 1) << " to be an index"; + } + + // Ensure that the region yields an element of the right type. + auto yieldOp = llvm::cast(block.getTerminator()); + if (yieldOp.value().getType() != + op.getType().cast().getElementType()) + return op.emitOpError("expected yield type to match shape element type"); + + return success(); +} + +RankedTensorType PadOp::inferResultType(RankedTensorType sourceType, + ArrayRef staticLow, + ArrayRef staticHigh, + ArrayRef resultShape) { + unsigned rank = sourceType.getRank(); + assert(staticLow.size() == rank && "unexpected staticLow size mismatch"); + assert(staticHigh.size() == rank && "unexpected staticHigh size mismatch"); + assert((resultShape.empty() || resultShape.size() == rank) && + "unexpected resultShape size mismatch"); + + SmallVector inferredShape; + for (auto i : llvm::seq(0, rank)) { + if (sourceType.isDynamicDim(i) || + staticLow[i] == ShapedType::kDynamicSize || + staticHigh[i] == ShapedType::kDynamicSize) { + inferredShape.push_back(resultShape.empty() ? ShapedType::kDynamicSize + : resultShape[i]); + } else { + int64_t size = sourceType.getDimSize(i) + staticLow[i] + staticHigh[i]; + assert((resultShape.empty() || size == resultShape[i] || + resultShape[i] == ShapedType::kDynamicSize) && + "mismatch between inferred shape and result shape"); + inferredShape.push_back(size); + } + } + + return RankedTensorType::get(inferredShape, sourceType.getElementType()); +} + +void PadOp::build(OpBuilder &b, OperationState &result, Value source, + ArrayRef staticLow, ArrayRef staticHigh, + ValueRange low, ValueRange high, bool nofold, + ArrayRef attrs) { + auto sourceType = source.getType().cast(); + auto resultType = inferResultType(sourceType, staticLow, staticHigh); + build(b, result, resultType, source, low, high, b.getI64ArrayAttr(staticLow), + b.getI64ArrayAttr(staticHigh), nofold ? 
b.getUnitAttr() : UnitAttr()); + result.addAttributes(attrs); +} + +void PadOp::build(OpBuilder &b, OperationState &result, Value source, + ValueRange low, ValueRange high, bool nofold, + ArrayRef attrs) { + auto sourceType = source.getType().cast(); + unsigned rank = sourceType.getRank(); + SmallVector staticVector(rank, ShapedType::kDynamicSize); + build(b, result, source, staticVector, staticVector, low, high, nofold, + attrs); +} + +void PadOp::build(OpBuilder &b, OperationState &result, Type resultType, + Value source, ArrayRef low, + ArrayRef high, bool nofold, + ArrayRef attrs) { + assert(resultType.isa()); + auto sourceType = source.getType().cast(); + SmallVector dynamicLow, dynamicHigh; + SmallVector staticLow, staticHigh; + // staticLow and staticHigh have full information of the padding config. + // This will grow staticLow and staticHigh with 1 value. If the config is + // dynamic (ie not a constant), dynamicLow and dynamicHigh will grow with 1 + // value as well. + dispatchIndexOpFoldResults(low, dynamicLow, staticLow, + ShapedType::kDynamicSize); + dispatchIndexOpFoldResults(high, dynamicHigh, staticHigh, + ShapedType::kDynamicSize); + if (!resultType) { + resultType = PadOp::inferResultType(sourceType, staticLow, staticHigh); + } + build(b, result, resultType, source, dynamicLow, dynamicHigh, + b.getI64ArrayAttr(staticLow), b.getI64ArrayAttr(staticHigh), + nofold ? b.getUnitAttr() : UnitAttr()); + result.addAttributes(attrs); +} + +namespace { +// Folds tensor.pad when padding is static zeros and the attribute +// doesn't request otherwise. +struct FoldStaticZeroPadding : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(PadOp padTensorOp, + PatternRewriter &rewriter) const override { + if (!padTensorOp.hasZeroLowPad() || !padTensorOp.hasZeroHighPad()) + return failure(); + if (padTensorOp.nofold()) + return failure(); + rewriter.replaceOpWithNewOp( + padTensorOp, padTensorOp.result().getType(), padTensorOp.source()); + return success(); + } +}; + +// Fold CastOp into PadOp when adding static information. +struct FoldSourceTensorCast : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(PadOp padTensorOp, + PatternRewriter &rewriter) const override { + auto castOp = padTensorOp.source().getDefiningOp(); + if (!tensor::canFoldIntoConsumerOp(castOp)) + return failure(); + + auto newResultType = PadOp::inferResultType( + castOp.source().getType().cast(), + extractFromI64ArrayAttr(padTensorOp.static_low()), + extractFromI64ArrayAttr(padTensorOp.static_high()), + padTensorOp.getResultType().getShape()); + + if (newResultType == padTensorOp.getResultType()) { + rewriter.updateRootInPlace(padTensorOp, [&]() { + padTensorOp.sourceMutable().assign(castOp.source()); + }); + } else { + auto newOp = rewriter.create( + padTensorOp->getLoc(), newResultType, padTensorOp.source(), + padTensorOp.low(), padTensorOp.high(), padTensorOp.static_low(), + padTensorOp.static_high(), padTensorOp.nofold()); + BlockAndValueMapping mapper; + padTensorOp.getRegion().cloneInto(&newOp.getRegion(), mapper); + + rewriter.replaceOpWithNewOp( + padTensorOp, padTensorOp.getResultType(), newOp); + } + return success(); + } +}; + +// Fold CastOp using the result of PadOp back into the latter if it adds +// static information. 
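+// Illustrative sketch (assumed types, mirroring the cast_of_pad_more_static
+// test):
+//   %0 = tensor.pad %src low[%l, %l] high[0, 0] {...}
+//          : tensor<?x?xf32> to tensor<?x?xf32>
+//   %1 = tensor.cast %0 : tensor<?x?xf32> to tensor<32x32xf32>
+// is rewritten into a single tensor.pad whose result type is tensor<32x32xf32>.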
+struct FoldTargetTensorCast : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(PadOp padTensorOp, + PatternRewriter &rewriter) const override { + if (!padTensorOp.result().hasOneUse()) + return failure(); + auto tensorCastOp = + dyn_cast(*padTensorOp->getUsers().begin()); + if (!tensorCastOp) + return failure(); + if (!tensor::preservesStaticInformation(padTensorOp.result().getType(), + tensorCastOp.dest().getType())) + return failure(); + + auto replacementOp = rewriter.create( + padTensorOp.getLoc(), tensorCastOp.dest().getType(), + padTensorOp.source(), padTensorOp.low(), padTensorOp.high(), + padTensorOp.static_low(), padTensorOp.static_high(), + padTensorOp.nofold()); + replacementOp.region().takeBody(padTensorOp.region()); + + rewriter.replaceOp(padTensorOp, replacementOp.result()); + rewriter.replaceOp(tensorCastOp, replacementOp.result()); + return success(); + } +}; +} // namespace + +void PadOp::getCanonicalizationPatterns(RewritePatternSet &results, + MLIRContext *context) { + results + .add( + context); +} + +/// Return the padding value of the PadOp if it constant. In this context, +/// "constant" means an actual constant or "defined outside of the block". +/// +/// Values are considered constant in three cases: +/// - A ConstantLike value. +/// - A basic block argument from a different block. +/// - A value defined outside of the block. +/// +/// If the padding value is not constant, an empty Value is returned. +Value PadOp::getConstantPaddingValue() { + auto yieldOp = dyn_cast(getRegion().front().getTerminator()); + if (!yieldOp) + return {}; + Value padValue = yieldOp.value(); + // Check if yield value is a constant. + if (matchPattern(padValue, m_Constant())) + return padValue; + // Check if yield value is defined inside the PadOp block. + if (padValue.getParentBlock() == &getRegion().front()) + return {}; + // Else: Yield value defined outside of the PadOp block. + return padValue; +} + +OpFoldResult PadOp::fold(ArrayRef) { + if (getResultType().hasStaticShape() && getResultType() == getSourceType() && + !nofold()) + return source(); + return {}; +} + //===----------------------------------------------------------------------===// // TableGen'd op method definitions //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp @@ -0,0 +1,279 @@ +//===- TensorTilingInterface.cpp - Tiling Interface models *- C++ ------*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h" +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/SCF/SCF.h" +#include "mlir/Dialect/StandardOps/Utils/Utils.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/Interfaces/TilingInterface.h" + +using namespace mlir; +using namespace mlir::tensor; + +namespace { + +struct PadOpTiling : public TilingInterface::ExternalModel { + + SmallVector getDestinationOperands(Operation *op, OpBuilder &b) const { + ReifiedRankedShapedTypeDims reifiedShapes; + ReifyRankedShapedTypeOpInterface reifyShapedTypeInterface = + dyn_cast(op); + (void)reifyShapedTypeInterface.reifyResultShapes(b, reifiedShapes); + + auto padOp = cast(op); + SmallVector mixedSizes = getAsOpFoldResult(reifiedShapes[0]); + Value initTensor = b.create( + op->getLoc(), mixedSizes, padOp.getResultType().getElementType()); + return {initTensor}; + } + + SmallVector getLoopIteratorTypes(Operation *op) const { + auto padOp = cast(op); + SmallVector iteratorTypes(padOp.getResultType().getRank(), + getParallelIteratorTypeName()); + return iteratorTypes; + } + + SmallVector getIterationDomain(Operation *op, OpBuilder &b) const { + ReifiedRankedShapedTypeDims reifiedShapes; + ReifyRankedShapedTypeOpInterface reifyShapedTypeInterface = + dyn_cast(op); + (void)reifyShapedTypeInterface.reifyResultShapes(b, reifiedShapes); + + Location loc = op->getLoc(); + Value zero = b.create(loc, 0); + Value one = b.create(loc, 1); + // Initialize all the ranges to {zero, one, one}. All the `ub`s are + // overwritten. + SmallVector loopRanges(reifiedShapes[0].size(), {zero, one, one}); + for (const auto &ub : enumerate(reifiedShapes[0])) + loopRanges[ub.index()].size = ub.value(); + return loopRanges; + } + + SmallVector + getTiledImplementation(Operation *op, OpBuilder &b, ValueRange dest, + ArrayRef offsets, + ArrayRef sizes, + bool /*tileDestOperands*/) const { + auto padOp = cast(op); + // Only constant padding value supported. + Value padValue = padOp.getConstantPaddingValue(); + if (!padValue) + return {}; + + // Helper variables and functions for various arithmetic operations. These + // are used extensively for computing new offset/length and padding values. + Location loc = op->getLoc(); + AffineExpr dim0, dim1; + bindDims(b.getContext(), dim0, dim1); + // Add two integers. + auto addMap = AffineMap::get(2, 0, {dim0 + dim1}); + auto add = [&](Value v1, Value v2) { + return b.createOrFold(loc, addMap, ValueRange{v1, v2}); + }; + // Subtract two integers. + auto subMap = AffineMap::get(2, 0, {dim0 - dim1}); + auto sub = [&](Value v1, Value v2) { + return b.createOrFold(loc, subMap, ValueRange{v1, v2}); + }; + // Take the minimum of two integers. + auto idMap = AffineMap::getMultiDimIdentityMap(2, b.getContext()); + auto min = [&](Value v1, Value v2) { + return b.createOrFold(loc, idMap, ValueRange{v1, v2}); + }; + // Take the maximum of two integers. + auto max = [&](Value v1, Value v2) { + return b.createOrFold(loc, idMap, ValueRange{v1, v2}); + }; + // Zero index-typed integer. + auto zero = b.create(loc, 0); + + // Helper function for filling static/dynamic low/high padding indices + // vectors of PadOp. 
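+    // If `val` is a constant it is recorded as a static index; otherwise the
+    // static entry is ShapedType::kDynamicSize and `val` goes into the dynamic
+    // index vector, matching PadOp's mixed static/dynamic operand encoding.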
+ auto appendIndex = [&](Value val, SmallVector &dynIndices, + SmallVector &staticIndices) { + if (auto constInt = getConstantIntValue(val)) { + staticIndices.push_back(*constInt); + } else { + staticIndices.push_back(ShapedType::kDynamicSize); + dynIndices.push_back(val); + } + }; + + // Compute new offsets, lengths, low padding, high padding. + SmallVector newOffsets, newLengths, newStrides; + SmallVector newLows, newHighs; + SmallVector staticNewLows, staticNewHighs; + // Set to true if the original data source is not read at all. + bool hasZeroLen = false; + // Same as hasZeroLen, but for dynamic dimension sizes. This condition + // is true if the original data source turns out to be unused at runtime. + Value dynHasZeroLenCond; + + int64_t rank = padOp.getSourceType().getRank(); + for (unsigned dim = 0; dim < rank; ++dim) { + auto low = + getValueOrCreateConstantIndexOp(b, loc, padOp.getMixedLowPad()[dim]); + bool hasLowPad = getConstantIntValue(low) != static_cast(0); + auto high = + getValueOrCreateConstantIndexOp(b, loc, padOp.getMixedHighPad()[dim]); + bool hasHighPad = getConstantIntValue(high) != static_cast(0); + auto offset = getValueOrCreateConstantIndexOp(b, loc, offsets[dim]); + auto length = getValueOrCreateConstantIndexOp(b, loc, sizes[dim]); + auto srcSize = b.createOrFold(loc, padOp.source(), dim); + + // The new amount of low padding is `low - offset`. Except for the case + // where none of the low padding is read. In that case, the new amount of + // low padding is zero. + // + // Optimization: If low = 0, then newLow = 0. + Value newLow = hasLowPad ? max(zero, sub(low, offset)) : zero; + appendIndex(newLow, newLows, staticNewLows); + + // Start reading the data from position `offset - low`. Since the original + // read may have started in the low padding zone, this value could be + // negative. Therefore, start reading from: + // + // max(offset - low, 0) + // + // The original read could also have started in the high padding zone. + // In that case, set the offset to the end of the source tensor. The new + // ExtractSliceOp length will be zero in that case. (Effectively reading + // no data from the source.) + // + // Optimization: If low = 0, then the formula can be simplified. + Value newOffset = hasLowPad ? min(max(sub(offset, low), zero), srcSize) + : min(offset, srcSize); + newOffsets.push_back(getAsOpFoldResult(newOffset)); + + // The original ExtractSliceOp was reading until position `offset + + // length`. Therefore, the corresponding position within the source tensor + // is: + // + // offset + length - low + // + // In case the original ExtractSliceOp stopped reading within the low + // padding zone, this value can be negative. In that case, the end + // position of the read should be zero. (Similar to newOffset.) + // + // The original read could also have stopped in the high padding zone. + // In that case, the end position of the read should be the end of + // the source tensor. (Similar to newOffset.) + // + // endLoc = min(max(offset - low + length, 0), srcSize) + // + // The new ExtractSliceOp length is `endLoc - newOffset`. + // + // Optimization: If low = 0, then the formula can be simplified. + Value endLoc = + hasLowPad ? min(max(add(sub(offset, low), length), zero), srcSize) + : min(add(offset, length), srcSize); + Value newLength = sub(endLoc, newOffset); + newLengths.push_back(getAsOpFoldResult(newLength)); + + // Check if newLength is zero. In that case, no SubTensorOp should be + // executed.
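+    // Worked example with assumed values (not from this code): for low = 2,
+    // srcSize = 10, offset = 1, length = 4 the formulas above give
+    // newLow = max(0, 2 - 1) = 1, newOffset = min(max(1 - 2, 0), 10) = 0,
+    // endLoc = min(max(1 - 2 + 4, 0), 10) = 3, newLength = 3 - 0 = 3; the
+    // high padding computed below is then (4 - 3) - 1 = 0.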
+ if (auto newLengthInt = getConstantIntValue(newLength)) { + hasZeroLen |= *newLengthInt == 0; + } else { + Value check = b.create(loc, arith::CmpIPredicate::eq, + newLength, zero); + dynHasZeroLenCond = + dynHasZeroLenCond + ? b.create(loc, check, dynHasZeroLenCond) + : check; + } + + // The amount of high padding is simply the number of elements remaining, + // so that the result has the same length as the original ExtractSliceOp. + // As an optimization, if the original high padding is zero, then the new + // high padding must also be zero. + Value newHigh = hasHighPad ? sub(sub(length, newLength), newLow) : zero; + appendIndex(newHigh, newHighs, staticNewHighs); + + // Only unit stride supported. + newStrides.push_back(b.getIndexAttr(1)); + } + + // The shape of the result can be obtained from the sizes passed in. + SmallVector dynDims; + SmallVector shape; + dispatchIndexOpFoldResults(sizes, dynDims, shape, ShapedType::kDynamicSize); + RankedTensorType resultType = + RankedTensorType::get(shape, padOp.getResultType().getElementType()); + + // Insert cast to ensure that types match. (May be folded away.) + auto castResult = [&](Value val) -> Operation * { + auto castOp = b.create(loc, resultType, val); + return castOp; + }; + + // In cases where the original data source is unused: Emit a GenerateOp and + // do not generate a SliceOp. (The result shape of the SliceOp would + // have a dimension of size 0, the semantics of which is unclear.) + auto createGenerateOp = [&]() { + // Create GenerateOp. + auto generateOp = b.create( + loc, resultType, dynDims, + [&](OpBuilder &builder, Location gLoc, ValueRange indices) { + builder.create(gLoc, padValue); + }); + return castResult(generateOp); + }; + + // Emit a SliceOp and a PadOp. Should not be used in cases where + // the result shape of the new SliceOp has a zero dimension. + auto createPadTensorOfSubTensor = [&]() { + // Create pad_tensor(subtensor(x)). + auto newSliceOp = b.create( + loc, padOp.source(), newOffsets, newLengths, newStrides); + auto newPadOp = b.create(loc, newSliceOp, staticNewLows, + staticNewHighs, newLows, newHighs); + + // Copy region to new PadOp. + BlockAndValueMapping bvm; + padOp.region().cloneInto(&newPadOp.getRegion(), bvm); + + // Cast result and return. + return castResult(newPadOp); + }; + + // Rewrite subtensor(pad_tensor(x)) into a GenerateOp if it is statically known + // that the original data source x is not used. + if (hasZeroLen) + return {createGenerateOp()}; + + // If there are dynamic dimensions: Generate an scf.if check to avoid + // creating SliceOps with result dimensions of size 0 at runtime.
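+    // The then-branch yields the GenerateOp result and the else-branch yields
+    // the padded slice of the source; both are cast to the common static
+    // result type beforehand.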
+ if (dynHasZeroLenCond) { + auto result = b.create( + loc, resultType, dynHasZeroLenCond, + /*thenBuilder=*/ + [&](OpBuilder &b, Location loc) { + b.create(loc, createGenerateOp()->getResult(0)); + }, + /*elseBuilder=*/ + [&](OpBuilder &b, Location loc) { + b.create(loc, + createPadTensorOfSubTensor()->getResult(0)); + }); + return {result}; + } + return {createPadTensorOfSubTensor()}; + } +}; + +} // namespace + +void mlir::tensor::registerTilingOpInterfaceExternalModels( + DialectRegistry ®istry) { + registry.addOpInterface(); +} diff --git a/mlir/lib/Dialect/Tensor/Utils/CMakeLists.txt b/mlir/lib/Dialect/Tensor/Utils/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/Tensor/Utils/CMakeLists.txt @@ -0,0 +1,12 @@ +add_mlir_dialect_library(MLIRTensorUtils + Utils.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Tensor + + LINK_LIBS PUBLIC + MLIRAffine + MLIRArithmetic + MLIRIR + MLIRTensor +) diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp @@ -0,0 +1,53 @@ +//===- Utils.cpp - Utilities to support the Tensor dialect ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements utilities for the Tensor dialect. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Tensor/Utils/Utils.h" + +#include "mlir/Dialect/Affine/IR/AffineOps.h" +#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h" + +using namespace mlir; +using namespace mlir::tensor; + +PadOp mlir::tensor::createPadScalarOp(Type type, Value source, Value pad, + ArrayRef low, + ArrayRef high, bool nofold, + Location loc, OpBuilder &builder) { + auto padTensorOp = + builder.create(loc, type, source, low, high, nofold); + int rank = padTensorOp.getResultType().getRank(); + SmallVector blockArgTypes(rank, builder.getIndexType()); + auto ®ion = padTensorOp.region(); + // `builder.createBlock` changes the insertion point within the block. Create + // a guard to reset the insertion point of the builder after it is destroyed. 
+ OpBuilder::InsertionGuard guard(builder); + builder.createBlock(®ion, region.end(), blockArgTypes); + builder.create(loc, pad); + return padTensorOp; +} + +PadOp mlir::tensor::createPadHighOp(Type type, Value source, Value pad, + bool nofold, Location loc, OpBuilder &b) { + SmallVector low, high; + auto rankedTensorType = type.cast(); + assert(rankedTensorType.hasStaticShape()); + for (const auto &en : enumerate(rankedTensorType.getShape())) { + AffineExpr d0; + bindDims(b.getContext(), d0); + auto dimOp = b.createOrFold(loc, source, en.index()); + Value paddingWidth = + makeComposedAffineApply(b, loc, en.value() - d0, {dimOp}); + high.push_back(paddingWidth); + low.push_back(b.createOrFold(loc, 0)); + } + return createPadScalarOp(type, source, pad, low, high, nofold, loc, b); +} diff --git a/mlir/lib/Interfaces/TilingInterface.cpp b/mlir/lib/Interfaces/TilingInterface.cpp --- a/mlir/lib/Interfaces/TilingInterface.cpp +++ b/mlir/lib/Interfaces/TilingInterface.cpp @@ -11,7 +11,6 @@ //===----------------------------------------------------------------------===// #include "mlir/Interfaces/TilingInterface.h" -#include "mlir/Dialect/Tensor/IR/Tensor.h" using namespace mlir; diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg-named.mlir @@ -153,8 +153,8 @@ // CHECK-LABEL: @max_pool_padded func @max_pool_padded(%arg0: tensor<1x6x34x62xf32>) -> () { // CHECK-DAG: [[CONST:%.+]] = arith.constant -3.40282347E+38 : f32 - // CHECK-DAG: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 0, 0, 0] high[0, 0, 1, 0] - // CHECK-DAG: linalg.yield [[CONST]] + // CHECK-DAG: [[PAD:%.+]] = tensor.pad %arg0 low[0, 0, 0, 0] high[0, 0, 1, 0] + // CHECK-DAG: tensor.yield [[CONST]] // CHECK-DAG: [[INITVAL:%.+]] = arith.constant -3.40282347E+38 : f32 // CHECK-DAG: [[INIT:%.+]] = linalg.init_tensor [1, 4, 33, 62] // CHECK-DAG: [[FILL:%.+]] = linalg.fill([[INITVAL]], [[INIT]]) @@ -206,7 +206,7 @@ func @avg_pool(%arg0: tensor<1x6x34x62xf32>) -> (tensor<1x5x33x62xf32>) { // Initial piece computes the sum of the pooling region, with appropriate padding. 
// CHECK: [[CONST:%.+]] = arith.constant 0 - // CHECK: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: [[PAD:%.+]] = tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] // CHECK: [[CONST:%.+]] = arith.constant 0 // CHECK: [[POOLINIT:%.+]] = linalg.init_tensor [1, 5, 33, 62] // CHECK: [[FILL:%.+]] = linalg.fill([[CONST]], [[POOLINIT]]) @@ -268,7 +268,7 @@ // The calculations remain the same as above, only testing for dyn behavior // CHECK: %[[C0:.+]] = arith.constant 0 // CHECK: %[[BATCH:.+]] = tensor.dim %arg0, %[[C0]] - // CHECK: %[[PAD:.+]] = linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: %[[PAD:.+]] = tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] // CHECK: %[[POOLINIT:.+]] = linalg.init_tensor [%[[BATCH]], 5, 33, 62] // CHECK: %[[FILL:.+]] = linalg.fill // CHECK: %[[KERNEL:.+]] = linalg.init_tensor [4, 4] @@ -386,8 +386,8 @@ // CHECK-LABEL: @conv2d_padded_f32 func @conv2d_padded_f32(%input: tensor<1x47x40x28xf32>, %weights: tensor<28x3x3x28xf32>, %bias: tensor<28xf32>) -> () { // CHECK: %[[C0:.+]] = arith.constant 0 - // CHECK: linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] - // CHECK: linalg.yield %[[C0]] + // CHECK: tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: tensor.yield %[[C0]] // CHECK: linalg.conv_2d_nhwc_hwcf %0 = "tosa.conv2d"(%input, %weights, %bias) {pad = [1, 1, 1, 1], stride = [1, 1], dilation = [2, 1]} : (tensor<1x47x40x28xf32>, tensor<28x3x3x28xf32>, tensor<28xf32>) -> (tensor<1x45x40x28xf32>) return @@ -398,8 +398,8 @@ // CHECK-LABEL: @conv2d_quant func @conv2d_quant(%arg0 : tensor<1x12x12x1xi8>, %arg1 : tensor<1024x3x3x1xi8>, %arg2 : tensor<1024xi32>) -> () { // CHECK: %[[C22:.+]] = arith.constant -22 - // CHECK: linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] - // CHECK: linalg.yield %[[C22]] + // CHECK: tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: tensor.yield %[[C22]] // CHECK: linalg.conv_2d_nhwc_hwcf_q %0 = "tosa.conv2d"(%arg0, %arg1, %arg2) {dilation = [1, 1], pad = [1, 1, 1, 1], quantization_info = {input_zp = -22 : i32, weight_zp = 42 : i32}, stride = [1, 1]} : (tensor<1x12x12x1xi8>, tensor<1024x3x3x1xi8>, tensor<1024xi32>) -> tensor<1x12x12x1024xi32> return @@ -481,8 +481,8 @@ // CHECK-LABEL: @depthwise_conv_quant func @depthwise_conv_quant(%arg0 : tensor<1x12x12x4xi8>, %arg1 : tensor<3x3x4x128xi8>, %arg2 : tensor<512xi32>) -> () { // CHECK: [[PADV:%.+]] = arith.constant -128 - // CHECK: [[PAD:%.+]] = linalg.pad_tensor %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] - // CHECK: linalg.yield [[PADV]] + // CHECK: [[PAD:%.+]] = tensor.pad %arg0 low[0, 1, 1, 0] high[0, 1, 1, 0] + // CHECK: tensor.yield [[PADV]] // CHECK: [[INIT:%.+]] = linalg.init_tensor [1, 12, 12, 4, 128] // CHECK: [[CST0:%.+]] = arith.constant 0 diff --git a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir --- a/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir +++ b/mlir/test/Conversion/TosaToLinalg/tosa-to-linalg.mlir @@ -1158,9 +1158,9 @@ // CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index // CHECK-DAG: [[CST:%.+]] = arith.constant 0.000000e+00 : f32 - // CHECK: linalg.pad_tensor %arg0 low{{\[}}%{{.*}}, [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]] { + // CHECK: tensor.pad %arg0 low{{\[}}%{{.*}}, [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]] { // CHECK: ^bb0(%arg1: index, %arg2: index): - // CHECK: linalg.yield [[CST]] + // CHECK: tensor.yield [[CST]] // CHECK: } : 
tensor<1x2xf32> to tensor<4x9xf32> %1 = "tosa.pad"(%arg0, %0) : (tensor<1x2xf32>, tensor<2x2xi32>) -> (tensor<4x9xf32>) return %1 : tensor<4x9xf32> @@ -1169,8 +1169,8 @@ func @pad_int(%arg0 : tensor<1x2xi32>) -> (tensor<4x9xi32>) { %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32> // CHECK: [[CST:%.+]] = arith.constant 0 : i32 - // CHECK: linalg.pad_tensor - // CHECK: linalg.yield [[CST]] + // CHECK: tensor.pad + // CHECK: tensor.yield [[CST]] %1 = "tosa.pad"(%arg0, %0) : (tensor<1x2xi32>, tensor<2x2xi32>) -> (tensor<4x9xi32>) return %1 : tensor<4x9xi32> } @@ -1178,8 +1178,8 @@ func @pad_quant(%arg0 : tensor<1x2xi32>) -> (tensor<4x9xi32>) { %0 = arith.constant dense<[[1, 2], [3, 4]]> : tensor<2x2xi32> // CHECK: [[CST:%.+]] = arith.constant 42 : i32 - // CHECK: linalg.pad_tensor - // CHECK: linalg.yield [[CST]] + // CHECK: tensor.pad + // CHECK: tensor.yield [[CST]] %1 = "tosa.pad"(%arg0, %0) { quantization_info = { input_zp = 42 : i32}} : (tensor<1x2xi32>, tensor<2x2xi32>) -> (tensor<4x9xi32>) return %1 : tensor<4x9xi32> } @@ -1194,9 +1194,9 @@ // CHECK-DAG: [[INDEX3:%.+]] = arith.constant 3 : index // CHECK-DAG: [[INDEX4:%.+]] = arith.constant 4 : index // CHECK-DAG: [[CST:%.+]] = arith.constant 4.200000e+01 : f32 - // CHECK: linalg.pad_tensor %arg0 low{{\[}}%{{.*}}, [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]] { + // CHECK: tensor.pad %arg0 low{{\[}}%{{.*}}, [[INDEX3]]] high{{\[}}[[INDEX2]], [[INDEX4]]] { // CHECK: ^bb0(%arg1: index, %arg2: index): - // CHECK: linalg.yield [[CST]] + // CHECK: tensor.yield [[CST]] // CHECK: } : tensor<1x2xf32> to tensor<4x9xf32> %1 = arith.constant dense<42.0> : tensor %2 = "tosa.pad"(%arg0, %0, %1) : (tensor<1x2xf32>, tensor<2x2xi32>, tensor) -> (tensor<4x9xf32>) diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir --- a/mlir/test/Dialect/Linalg/bufferize.mlir +++ b/mlir/test/Dialect/Linalg/bufferize.mlir @@ -277,9 +277,9 @@ func @pad_tensor_dynamic_shape(%arg0: tensor<4x?x2x?xf32>, %arg1: index) -> tensor<4x?x?x?xf32> { %c0 = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - %out = linalg.pad_tensor %arg0 low[%c0, %c0, %arg1, %c0] high[%c0, %c0, %c0, %arg1] { + %out = tensor.pad %arg0 low[%c0, %c0, %arg1, %c0] high[%c0, %c0, %c0, %arg1] { ^bb0(%gen_arg1: index, %gen_arg2: index, %gen_arg3: index, %gen_arg4: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor<4x?x2x?xf32> to tensor<4x?x?x?xf32> return %out : tensor<4x?x?x?xf32> } diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir --- a/mlir/test/Dialect/Linalg/canonicalize.mlir +++ b/mlir/test/Dialect/Linalg/canonicalize.mlir @@ -282,7 +282,7 @@ // CHECK-NOT: linalg.fill // CHECK-NOT: linalg.matmul // CHECK-NOT: linalg.generic -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK: return func @dead_linalg_tensor(%arg0 : tensor<7x7xi32>, %arg1 : tensor<7x7xf32>, %arg2: tensor, %high : index) { @@ -296,146 +296,15 @@ ^bb(%3: i32) : linalg.yield %3 : i32 } -> tensor<7x7xi32> - %3 = linalg.pad_tensor %arg2 low[%c0, %c0] high[%high, %high] { - ^bb0(%arg9: index, %arg10: index): - linalg.yield %cst : f32 + %3 = tensor.pad %arg2 low[%c0, %c0] high[%high, %high] { + ^bb0(%arg9: index, %arg10: index): + tensor.yield %cst : f32 } : tensor to tensor<2x4xf32> return } // ----- -// CHECK-LABEL: func @pad_tensor_same_static_shape( -// CHECK-SAME: %[[ARG0:.*]]: tensor<5x6xf32> -// CHECK-NOT: linalg.pad_tensor -// CHECK: return %[[ARG0]] -func 
@pad_tensor_same_static_shape(%arg0: tensor<5x6xf32>, %a: index) - -> tensor<5x6xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %0 = linalg.pad_tensor %arg0 low[%a, 0] high[0, %a] { - ^bb0(%arg1: index, %arg2: index): - linalg.yield %cst : f32 - } : tensor<5x6xf32> to tensor<5x6xf32> - return %0 : tensor<5x6xf32> -} - -// ----- - -// CHECK-LABEL: func @pad_tensor_nofold_same_static_shape( -// CHECK-SAME: %[[ARG0:.*]]: tensor<5x6xf32> -// CHECK: %[[PAD:.*]] = linalg.pad_tensor -// CHECK: return %[[PAD]] -func @pad_tensor_nofold_same_static_shape(%arg0: tensor<5x6xf32>, %a: index) - -> tensor<5x6xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %0 = linalg.pad_tensor %arg0 nofold low[%a, 0] high[0, %a] { - ^bb0(%arg1: index, %arg2: index): - linalg.yield %cst : f32 - } : tensor<5x6xf32> to tensor<5x6xf32> - return %0 : tensor<5x6xf32> -} - -// ----- - -// CHECK-LABEL: func @pad_tensor_after_cast_different_shape( -// CHECK-SAME: %[[INPUT:.*]]: tensor) -> tensor { -// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[PADDED:.*]] = linalg.pad_tensor %[[INPUT]] -// CHECK-SAME: low[0, 0, 1, 1] high[0, 0, 1, 1] { -// CHECK: ^bb0(%[[ARG1:.*]]: index, %[[ARG2:.*]]: index, %[[ARG3:.*]]: index, %[[ARG4:.*]]: index): -// CHECK: linalg.yield %[[CST]] : f32 -// CHECK: } : tensor to tensor -// CHECK: %[[DYNAMIC:.*]] = tensor.cast %[[PADDED:.*]] : -// CHECK-SAME: tensor to tensor -// CHECK: return %[[DYNAMIC]] : tensor -// CHECK: } -func @pad_tensor_after_cast_different_shape(%arg0: tensor) - -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - %dynamic = tensor.cast %arg0 : tensor to tensor - %padded = linalg.pad_tensor %dynamic low[0, 0, 1, 1] high[0, 0, 1, 1] { - ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): - linalg.yield %cst: f32 - } : tensor to tensor - return %padded: tensor -} - -// ----- - -// CHECK-LABEL: func @pad_tensor_after_cast_same_shape( -// CHECK-SAME: %[[INPUT:.*]]: tensor, -// CHECK-SAME: %[[PADDING:.*]]: index) -> tensor { -// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 -// CHECK: %[[PADDED:.*]] = linalg.pad_tensor %[[INPUT]] -// CHECK-SAME: low[0, %[[PADDING]], 1, 1] high[0, %[[PADDING]], 1, 1] { -// CHECK: ^bb0(%[[ARG1:.*]]: index, %[[ARG2:.*]]: index, %[[ARG3:.*]]: index, %[[ARG4:.*]]: index): -// CHECK: linalg.yield %[[CST]] : f32 -// CHECK: } : tensor to tensor -// CHECK: return %[[PADDED:.*]] : tensor -// CHECK: } -func @pad_tensor_after_cast_same_shape(%arg0: tensor, %padding : index) - -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - %dynamic = tensor.cast %arg0 : tensor to tensor - %padded = linalg.pad_tensor %dynamic low[0, %padding, 1, 1] high[0, %padding, 1, 1] { - ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): - linalg.yield %cst: f32 - } : tensor to tensor - return %padded: tensor -} - -// ----- - -// CHECK-LABEL: func @pad_tensor_of_cast( -// CHECK-NOT: tensor.cast -// CHECK: linalg.pad_tensor -// CHECK: tensor<8x?xf32> to tensor<8x32xf32> -func @pad_tensor_of_cast(%t: tensor<8x?xf32>, %s: index) -> tensor<8x32xf32> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.cast %t : tensor<8x?xf32> to tensor - %1 = linalg.pad_tensor %0 low[%c0, %c0] high[%c0, %s] { - ^bb0(%arg9: index, %arg10: index): - linalg.yield %cst : f32 - } : tensor to tensor<8x32xf32> - return %1 : tensor<8x32xf32> -} - -// ----- - -// CHECK-LABEL: @cast_of_pad_more_static -func @cast_of_pad_more_static(%arg0: tensor, %padding: index) -> tensor<32x32xf32> { - %cst = arith.constant 
0.000000e+00 : f32 - // CHECK: %[[PAD:.*]] = linalg.pad_tensor - // CHECK: tensor to tensor<32x32xf32> - %padded = linalg.pad_tensor %arg0 low[%padding, %padding] high[0, 0] { - ^bb0(%arg1: index, %arg2: index): - linalg.yield %cst : f32 - } : tensor to tensor - // CHECK-NOT: tensor.cast - %casted = tensor.cast %padded : tensor to tensor<32x32xf32> - // CHECK: return %[[PAD]] - return %casted : tensor<32x32xf32> -} - -// ----- - -// CHECK-LABEL: @cast_of_pad_less_static -func @cast_of_pad_less_static(%arg0: tensor<32x?x?xf32>, %padding: index) -> tensor { - %cst = arith.constant 0.000000e+00 : f32 - // CHECK: linalg.pad_tensor - %padded = linalg.pad_tensor %arg0 low[%padding, %padding, %padding] high[0, 0, 0] { - ^bb0(%arg1: index, %arg2: index, %arg3: index): - linalg.yield %cst : f32 - } : tensor<32x?x?xf32> to tensor<32x?x?xf32> - // CHECK: %[[CAST:.*]] = tensor.cast - %casted = tensor.cast %padded : tensor<32x?x?xf32> to tensor - // CHECK: return %[[CAST]] - return %casted : tensor -} - -// ----- - func @propogate_casts(%arg0 : tensor, %arg1 : f32, %arg2 : index, %arg3 : index) -> tensor { %c0 = arith.constant 0 : index @@ -579,71 +448,6 @@ // ----- -func @tensor_pad_cast_fold(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.0 : f32 - %0 = tensor.cast %arg0 : tensor<4x4xf32> to tensor - %1 = linalg.pad_tensor %0 low[%c0, %c0] high[%c0, %c0] { - ^bb0(%arg1: index, %arg2: index): - linalg.yield %cst : f32 - } : tensor to tensor<4x4xf32> - return %1 : tensor<4x4xf32> -} -// CHECK-LABEL: @tensor_pad_cast -// CHECK-SAME: %[[ARG0:.+]]: tensor<4x4xf32> -// CHECK: return %[[ARG0]] - -// ----- - -// CHECK-LABEL: func @fold_pad_tensor_source_cast( -// CHECK-SAME: %[[ARG0:.*]]: tensor<4x?xf32> -// CHECK-NOT: tensor.cast -// CHECK: %[[RESULT:.*]] = linalg.pad_tensor %[[ARG0]] -func @fold_pad_tensor_source_cast(%arg0: tensor<4x?xf32>) -> tensor<4x4xf32> { - %cst = arith.constant 0.0 : f32 - %0 = tensor.cast %arg0 : tensor<4x?xf32> to tensor - %1 = linalg.pad_tensor %0 low[0, 0] high[0, 1] { - ^bb0(%arg1: index, %arg2: index): - linalg.yield %cst : f32 - } : tensor to tensor<4x4xf32> - return %1 : tensor<4x4xf32> -} - -// ----- - -// CHECK-LABEL: func @pad_static_zero_cast( -// CHECK-SAME: %[[ARG0:.*]]: tensor -// CHECK-NOT: linalg.pad_tensor -// CHECK: %[[RESULT:.*]] = tensor.cast %[[ARG0]] : tensor to tensor<2x3x4xf32> -// CHECK: return %[[RESULT]] -func @pad_static_zero_cast(%arg0: tensor, %pad_value: f32) -> tensor<2x3x4xf32> { - %c0 = arith.constant 0 : index - %0 = linalg.pad_tensor %arg0 low[0, %c0, 0] high[0, 0, %c0] { - ^bb0(%arg1: index, %arg2: index, %arg3: index): - linalg.yield %pad_value : f32 - } : tensor to tensor<2x3x4xf32> - - return %0 : tensor<2x3x4xf32> -} - -// ----- - -// CHECK-LABEL: func @pad_nofold_static_zero( -// CHECK-SAME: %[[ARG0:.*]]: tensor -// CHECK: %[[PAD:.*]] = linalg.pad_tensor -// CHECK: return %[[PAD]] -func @pad_nofold_static_zero(%arg0: tensor, %pad_value: f32) -> tensor<2x3x4xf32> { - %c0 = arith.constant 0 : index - %0 = linalg.pad_tensor %arg0 nofold low[0, %c0, 0] high[0, 0, %c0] { - ^bb0(%arg1: index, %arg2: index, %arg3: index): - linalg.yield %pad_value : f32 - } : tensor to tensor<2x3x4xf32> - - return %0 : tensor<2x3x4xf32> -} - -// ----- - func private @some_use(%i : index, %j : index) // CHECK-LABEL: func @init_canonicalize diff --git a/mlir/test/Dialect/Linalg/codegen-strategy.mlir b/mlir/test/Dialect/Linalg/codegen-strategy.mlir --- a/mlir/test/Dialect/Linalg/codegen-strategy.mlir +++ 
b/mlir/test/Dialect/Linalg/codegen-strategy.mlir @@ -48,7 +48,7 @@ func @matmul(%arg0: tensor<72x72xf32>, %arg1: tensor<72x72xf32>, %arg2: tensor<72x72xf32>) -> tensor<72x72xf32> { // Check the padding of the input operands has been hoisted out of the tile loop nest. - // CHECK-PAD-COUNT=2: linalg.pad_tensor %{{.*}} nofold + // CHECK-PAD-COUNT=2: tensor.pad %{{.*}} nofold // CHECK-PAD: scf.for // Check CSE eliminates the duplicate min operations introduced by tiling. // CHECK-PAD: affine.min #[[MAP0]] diff --git a/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir b/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir --- a/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir +++ b/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir @@ -9,9 +9,9 @@ // CHECK: return %[[PADDED]] : tensor<1x32x32x1xf32> func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> tensor<1x32x32x1xf32> { %cst = arith.constant 0.000000e+00 : f32 - %0 = linalg.pad_tensor %arg0 low[0, 2, 2, 0] high[0, 2, 2, 0] { + %0 = tensor.pad %arg0 low[0, 2, 2, 0] high[0, 2, 2, 0] { ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor<1x28x28x1xf32> to tensor<1x32x32x1xf32> return %0 : tensor<1x32x32x1xf32> } @@ -38,9 +38,9 @@ func @generalize_pad_tensor_dynamic_shape(%arg0: tensor<4x?x2x?xf32>, %arg1: index) -> tensor<4x?x?x?xf32> { %c0 = arith.constant 0 : index %cst = arith.constant 0.0 : f32 - %out = linalg.pad_tensor %arg0 low[%c0, %c0, %arg1, %c0] high[%c0, %c0, %c0, %arg1] { + %out = tensor.pad %arg0 low[%c0, %c0, %arg1, %c0] high[%c0, %c0, %c0, %arg1] { ^bb0(%gen_arg1: index, %gen_arg2: index, %gen_arg3: index, %gen_arg4: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor<4x?x2x?xf32> to tensor<4x?x?x?xf32> return %out : tensor<4x?x?x?xf32> } diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir --- a/mlir/test/Dialect/Linalg/hoist-padding.mlir +++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir @@ -18,7 +18,7 @@ // MATVEC: %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = // MATVEC: %[[PIDX0:.*]] = affine.apply #[[DIV4]](%[[PIV0]]) // MATVEC: %[[T1:.*]] = tensor.extract_slice %[[ARG1]][%[[PIV0]]] [4] - // MATVEC: %[[T2:.*]] = linalg.pad_tensor %[[T1]] + // MATVEC: %[[T2:.*]] = tensor.pad %[[T1]] // MATVEC: %[[T3:.*]] = tensor.insert_slice %[[T1:.*]]{{.*}}[%[[PIDX0]] // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] = @@ -29,9 +29,9 @@ // MATVEC-DAG: %[[IDX0:.*]] = affine.apply #[[DIV4]](%[[IV0]]) // MATVEC-DAG: %[[T4:.*]] = tensor.extract_slice %[[T0]][%[[IDX0]] %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32> - %3 = linalg.pad_tensor %2 nofold low[%c0] high[%c0] { + %3 = tensor.pad %2 nofold low[%c0] high[%c0] { ^bb0(%arg5: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor<4xf32> to tensor<4xf32> // Check matvec uses the packed input vector. 
@@ -67,7 +67,7 @@ // MATVEC: %[[TS0:.*]] = affine.min #[[MAP0]](%[[PIV0]]) // MATVEC: %[[T1:.*]] = tensor.extract_slice %[[ARG1]][%[[PIV0]]] [%[[TS0]]] // MATVEC: %[[HPD0:.*]] = affine.apply #[[MAP1]](%[[TS0]]) - // MATVEC: %[[T2:.*]] = linalg.pad_tensor %[[T1]]{{.*}}high[%[[HPD0]] + // MATVEC: %[[T2:.*]] = tensor.pad %[[T1]]{{.*}}high[%[[HPD0]] // MATVEC: %[[T3:.*]] = tensor.insert_slice %[[T1:.*]]{{.*}}[%[[PIDX0]] // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] = @@ -80,13 +80,13 @@ // MATVEC-DAG: %[[T4:.*]] = tensor.extract_slice %[[T0]][%[[IDX0]] %3 = tensor.extract_slice %arg1[%arg3] [%1] [1] : tensor<12xf32> to tensor %4 = affine.apply #map1(%1) - %5 = linalg.pad_tensor %2 low[%c0, %c0] high[%c0, %4] { + %5 = tensor.pad %2 low[%c0, %c0] high[%c0, %4] { ^bb0(%arg5: index, %arg6: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor<24x?xf32> to tensor<24x5xf32> - %6 = linalg.pad_tensor %3 low[%c0] high[%4] { + %6 = tensor.pad %3 low[%c0] high[%4] { ^bb0(%arg5: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor to tensor<5xf32> // Check matvec uses the packed input vector. @@ -127,7 +127,7 @@ // MATVEC: %[[TS0:.*]] = affine.min #[[MAP0]](%[[PIV0]])[%[[D0]]] // MATVEC: %[[T1:.*]] = tensor.extract_slice %[[ARG1]][%[[PIV0]]] [%[[TS0]]] // MATVEC: %[[HPD0:.*]] = affine.apply #[[MAP1]](%[[TS0]]) - // MATVEC: %[[T2:.*]] = linalg.pad_tensor %[[T1]]{{.*}}high[%[[HPD0]] + // MATVEC: %[[T2:.*]] = tensor.pad %[[T1]]{{.*}}high[%[[HPD0]] // MATVEC: %[[T3:.*]] = tensor.insert_slice %[[T1:.*]]{{.*}}[%[[PIDX0]] // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] = @@ -140,13 +140,13 @@ // MATVEC-DAG: %[[T4:.*]] = tensor.extract_slice %[[T0]][%[[IDX0]] %4 = tensor.extract_slice %arg1[%arg3] [%2] [1] : tensor to tensor %5 = affine.apply #map1(%2) - %6 = linalg.pad_tensor %3 low[%c0, %c0] high[%c0, %5] { + %6 = tensor.pad %3 low[%c0, %c0] high[%c0, %5] { ^bb0(%arg5: index, %arg6: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor<24x?xf32> to tensor<24x4xf32> - %7 = linalg.pad_tensor %4 nofold low[%c0] high[%5] { + %7 = tensor.pad %4 nofold low[%c0] high[%5] { ^bb0(%arg5: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor to tensor<4xf32> // Check matvec uses the packed input vector. @@ -174,13 +174,13 @@ // Check the non constant padding is not hoisted. // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]] - // MATVEC: %[[T1:.*]] = linalg.pad_tensor %[[T0]] + // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]] %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32> - %3 = linalg.pad_tensor %2 nofold low[%c0] high[%c0] { + %3 = tensor.pad %2 nofold low[%c0] high[%c0] { ^bb0(%arg5: index): %5 = arith.index_cast %arg3 : index to i32 %6 = arith.sitofp %5 : i32 to f32 - linalg.yield %6 : f32 + tensor.yield %6 : f32 } : tensor<4xf32> to tensor<4xf32> // Check matvec uses the padded input vector. @@ -209,13 +209,13 @@ // Check the non constant op padding is not hoisted. 
// MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]] // MATVEC: %[[V0:.*]] = tensor.extract %[[ARG1]][%[[IV0]] - // MATVEC: %[[T1:.*]] = linalg.pad_tensor %[[T0]] - // MATVEC: linalg.yield %[[V0]] + // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]] + // MATVEC: tensor.yield %[[V0]] %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32> %3 = tensor.extract %arg1[%arg3] : tensor<12xf32> - %4 = linalg.pad_tensor %2 nofold low[%c0] high[%c0] { + %4 = tensor.pad %2 nofold low[%c0] high[%c0] { ^bb0(%arg5: index): - linalg.yield %3 : f32 + tensor.yield %3 : f32 } : tensor<4xf32> to tensor<4xf32> // Check matvec uses the padded input vector. @@ -247,12 +247,12 @@ // Check the index_cast prevents hoisting due to its non index operand. // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]] // MATVEC: %[[IDX0:.*]] = arith.index_cast %[[ARG3]] - // MATVEC: %[[T1:.*]] = linalg.pad_tensor %[[T0]]{{.*}}%[[IDX0]] + // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]]{{.*}}%[[IDX0]] %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32> %3 = arith.index_cast %arg3 : i32 to index - %4 = linalg.pad_tensor %2 nofold low[%3] high[%3] { + %4 = tensor.pad %2 nofold low[%3] high[%3] { ^bb0(%arg6: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor<4xf32> to tensor<4xf32> // Check matvec uses the padded input vector. @@ -284,12 +284,12 @@ // Check the load prevents hoisting due to its memory effect. // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]] // MATVEC: %[[IDX0:.*]] = memref.load %[[ARG3]] - // MATVEC: %[[T1:.*]] = linalg.pad_tensor %[[T0]]{{.*}}%[[IDX0]] + // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]]{{.*}}%[[IDX0]] %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32> %3 = memref.load %arg3[%c0] : memref - %4 = linalg.pad_tensor %2 nofold low[%3] high[%3] { + %4 = tensor.pad %2 nofold low[%3] high[%3] { ^bb0(%arg6: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor<4xf32> to tensor<4xf32> // Check matvec uses the padded input vector. @@ -321,15 +321,15 @@ // Check the unexpected operation with a region prevents hoisting. // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]] // MATVEC: %[[IDX0:.*]] = scf.for {{.*}} step %[[ARG3]] - // MATVEC: %[[T1:.*]] = linalg.pad_tensor %[[T0]]{{.*}}%[[IDX0]] + // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]]{{.*}}%[[IDX0]] %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32> %3 = scf.for %arg6 = %c0 to %c12 step %arg3 iter_args(%arg7 = %c0) -> (index) { %6 = arith.addi %arg3, %arg7 : index scf.yield %6 : index } - %4 = linalg.pad_tensor %2 nofold low[%3] high[%3] { + %4 = tensor.pad %2 nofold low[%3] high[%3] { ^bb0(%arg6: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor<4xf32> to tensor<4xf32> // Check matvec uses the padded input vector. @@ -361,7 +361,7 @@ // Check the second input operand is hoisted by two loop nests. // MATMUL: %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]] - // MATMUL: %[[T2:.*]] = linalg.pad_tensor %[[T1]] + // MATMUL: %[[T2:.*]] = tensor.pad %[[T1]] // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] = %0 = scf.for %arg3 = %c0 to %c12 step %c5 iter_args(%arg4 = %arg2) -> (tensor<12x24xf32>) { @@ -372,9 +372,9 @@ %3 = affine.apply #map1(%1) // Check the fused and padded fill op does not prevent hoisting. 
-  %4 = linalg.pad_tensor %2 nofold low[%c0, %c0] high[%3, %c0] {
+  %4 = tensor.pad %2 nofold low[%c0, %c0] high[%3, %c0] {
   ^bb0(%arg5: index, %arg6: index):
-    linalg.yield %cst : f32
+    tensor.yield %cst : f32
   } : tensor to tensor<5x24xf32>
   %5 = linalg.fill(%cst, %4) : f32, tensor<5x24xf32> -> tensor<5x24xf32>
   %6 = tensor.extract_slice %5[0, 0] [%1, 24] [1, 1] : tensor<5x24xf32> to tensor
@@ -382,7 +382,7 @@
   // Check the first input operand is hoisted by one loop nest.
   // MATMUL: %[[T3:.*]] = scf.for %[[PIV1:[0-9a-z]+]] =
   // MATMUL: %[[T4:.*]] = tensor.extract_slice %[[ARG0]]
-  // MATMUL: %[[T5:.*]] = linalg.pad_tensor %[[T4]]
+  // MATMUL: %[[T5:.*]] = tensor.pad %[[T4]]
   // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] =
   %7 = scf.for %arg5 = %c0 to %c6 step %c3 iter_args(%arg6 = %6) -> (tensor) {
@@ -393,20 +393,20 @@
   %9 = tensor.extract_slice %arg0[%arg3, %arg5] [%1, 3] [1, 1] : tensor<12x6xf32> to tensor
   %10 = tensor.extract_slice %arg1[%arg5, 0] [3, 24] [1, 1] : tensor<6x24xf32> to tensor<3x24xf32>
   %11 = tensor.extract_slice %arg6[0, 0] [%1, 24] [1, 1] : tensor to tensor
-  %12 = linalg.pad_tensor %9 nofold low[%c0, %c0] high[%3, %c0] {
+  %12 = tensor.pad %9 nofold low[%c0, %c0] high[%3, %c0] {
   ^bb0(%arg7: index, %arg8: index):
-    linalg.yield %cst : f32
+    tensor.yield %cst : f32
   } : tensor to tensor<5x3xf32>
-  %13 = linalg.pad_tensor %10 nofold low[%c0, %c0] high[%c0, %c0] {
+  %13 = tensor.pad %10 nofold low[%c0, %c0] high[%c0, %c0] {
   ^bb0(%arg7: index, %arg8: index):
-    linalg.yield %cst : f32
+    tensor.yield %cst : f32
   } : tensor<3x24xf32> to tensor<3x24xf32>
   // Check the output padding is not hoisted.
-  // MATMUL: %[[T8:.*]] = linalg.pad_tensor
-  %14 = linalg.pad_tensor %11 nofold low[%c0, %c0] high[%3, %c0] {
+  // MATMUL: %[[T8:.*]] = tensor.pad
+  %14 = tensor.pad %11 nofold low[%c0, %c0] high[%3, %c0] {
   ^bb0(%arg7: index, %arg8: index):
-    linalg.yield %cst : f32
+    tensor.yield %cst : f32
   } : tensor to tensor<5x24xf32>
   // Check matmul uses the padded operands.
diff --git a/mlir/test/Dialect/Linalg/invalid.mlir b/mlir/test/Dialect/Linalg/invalid.mlir --- a/mlir/test/Dialect/Linalg/invalid.mlir +++ b/mlir/test/Dialect/Linalg/invalid.mlir @@ -353,71 +353,6 @@ // ----- - -func @pad_result_type(%arg0: tensor, %arg1: index, %arg2: i32) -> tensor { - // expected-error @+1 {{specified type 'tensor' does not match the inferred type 'tensor}} - %0 = linalg.pad_tensor %arg0 low[1, %arg1, 2, 2] high[1, 2, %arg1, 3] { - ^bb0(%arg3: index, %arg4: index): - linalg.yield %arg2 : i32 - } : tensor to tensor - return %0 : tensor -} - -// ----- - -func @pad_number_of_block_args(%arg0: tensor, %arg1: i32) -> tensor { - // expected-error @+1 {{expected the block to have 2 arguments}} - %0 = linalg.pad_tensor %arg0 low[1, 2] high[2, 3] { - ^bb0(%arg2: index, %arg3: index, %arg4: index): - linalg.yield %arg1 : i32 - } : tensor to tensor - return %0 : tensor -} - -// ----- - -func @pad_no_block(%arg0: tensor, %arg1: i32) -> tensor { - // expected-error @+1 {{op region #0 ('region') failed to verify constraint: region with 1 blocks}} - %0 = linalg.pad_tensor %arg0 low[1, 2] high[2, 3] { - } : tensor to tensor - return %0 : tensor -} - -// ----- - -func @pad_block_args(%arg0: tensor, %arg1: i32) -> tensor { - // expected-error @+1 {{op expected block argument 1 to be an index}} - %0 = linalg.pad_tensor %arg0 low[1, 2] high[2, 3] { - ^bb0(%arg2: i32, %arg3: i32): - linalg.yield %arg1 : i32 - } : tensor to tensor - return %0 : tensor -} - -// ----- - -func @pad_num_yields(%arg0: tensor, %arg1: i32) -> tensor { - // expected-error @+3 {{op expected single yield operand (got 2)}} - %0 = linalg.pad_tensor %arg0 low[1, 2] high[2, 3] { - ^bb0(%arg2: index, %arg3: index): - linalg.yield %arg1, %arg1 : i32, i32 - } : tensor to tensor - return %0 : tensor -} - -// ----- - -func @pad_yield_type(%arg0: tensor, %arg1: i8) -> tensor { - // expected-error @+3 {{op expected yield type to match shape element type}} - %0 = linalg.pad_tensor %arg0 low[1, 2] high[2, 3] { - ^bb0(%arg2: index, %arg3: index): - linalg.yield %arg1 : i8 - } : tensor to tensor - return %0 : tensor -} - -// ----- - func @illegal_fill_tensor_no_return(%arg0 : index, %arg1 : index, %arg2 : f32) { %0 = linalg.init_tensor [%arg0, %arg1] : tensor diff --git a/mlir/test/Dialect/Linalg/lower-pad-tensor.mlir b/mlir/test/Dialect/Linalg/lower-pad-tensor.mlir --- a/mlir/test/Dialect/Linalg/lower-pad-tensor.mlir +++ b/mlir/test/Dialect/Linalg/lower-pad-tensor.mlir @@ -6,9 +6,9 @@ func @pad_tensor_with_memrefs(%arg0: memref<1x28x28x1xf32>) -> memref<2x31x31x3xf32> { %cst = arith.constant 0.000000e+00 : f32 %0 = bufferization.to_tensor %arg0 : memref<1x28x28x1xf32> - %1 = linalg.pad_tensor %0 low[1, 1, 1, 2] high[0, 2, 2, 0] { + %1 = tensor.pad %0 low[1, 1, 1, 2] high[0, 2, 2, 0] { ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor<1x28x28x1xf32> to tensor<2x31x31x3xf32> %2 = bufferization.to_memref %1 : memref<2x31x31x3xf32> return %2 : memref<2x31x31x3xf32> @@ -25,9 +25,9 @@ // CHECK-LABEL: func @pad_tensor_no_memrefs func @pad_tensor_no_memrefs(%arg0: tensor<1x28x28xf32>) -> tensor<2x32x32xf32> { %cst = arith.constant 0.000000e+00 : f32 - %0 = linalg.pad_tensor %arg0 low[1, 2, 2] high[0, 2, 2] { + %0 = tensor.pad %arg0 low[1, 2, 2] high[0, 2, 2] { ^bb0(%arg1: index, %arg2: index, %arg3: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor<1x28x28xf32> to tensor<2x32x32xf32> return %0 : tensor<2x32x32xf32> } @@ -43,9 +43,9 @@ // 
CHECK-LABEL: func @pad_tensor_detailed
 func @pad_tensor_detailed(%arg0: tensor<1x28x28x1xf32>) -> tensor<1x32x32x1xf32> {
   %cst = arith.constant 0.000000e+00 : f32
-  %0 = linalg.pad_tensor %arg0 low[0, 2, 2, 0] high[0, 2, 2, 0] {
+  %0 = tensor.pad %arg0 low[0, 2, 2, 0] high[0, 2, 2, 0] {
   ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index):
-    linalg.yield %cst : f32
+    tensor.yield %cst : f32
   } : tensor<1x28x28x1xf32> to tensor<1x32x32x1xf32>
   return %0 : tensor<1x32x32x1xf32>
 }
diff --git a/mlir/test/Dialect/Linalg/pad.mlir b/mlir/test/Dialect/Linalg/pad.mlir
--- a/mlir/test/Dialect/Linalg/pad.mlir
+++ b/mlir/test/Dialect/Linalg/pad.mlir
@@ -31,10 +31,10 @@
   // Check statically sized matmul inputs with partially divisible sizes are padded.
   // MATMUL: %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS2]]]
-  // MATMUL: %[[T3:.*]] = linalg.pad_tensor %[[T0]] nofold
+  // MATMUL: %[[T3:.*]] = tensor.pad %[[T0]] nofold
   // MATMUL-SAME: [%[[C0]], %[[C0]]]
   // MATMUL-SAME: [%[[C0]], %[[V0]]
-  // MATMUL: %[[T4:.*]] = linalg.pad_tensor %[[T1]] nofold
+  // MATMUL: %[[T4:.*]] = tensor.pad %[[T1]] nofold
   // Check the statically sized matmul output with fully divisible sizes is not padded.
   // MATMUL: %[[T5:.*]] = linalg.matmul
@@ -74,7 +74,7 @@
   // Check the statically sized matmul output with partially divisible sizes is padded.
   // MATMUL: %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS1]]]
-  // MATMUL: %[[T1:.*]] = linalg.pad_tensor %[[T0]] low
+  // MATMUL: %[[T1:.*]] = tensor.pad %[[T0]] low
   // MATMUL-SAME: [%[[C0]], %[[C0]]]
   // MATMUL-SAME: [%[[C0]], %[[V0]]
@@ -137,11 +137,11 @@
   // Check all matmul operands are padded.
   // MATMUL: %[[V0:.*]] = affine.apply #[[MAP3]]()[%[[TS0]]]
   // MATMUL: %[[V1:.*]] = affine.apply #[[MAP4]]()[%[[TS2]]]
-  // MATMUL: %[[T3:.*]] = linalg.pad_tensor %{{.*}} nofold
+  // MATMUL: %[[T3:.*]] = tensor.pad %{{.*}} nofold
   // MATMUL-SAME: [%[[C0]], %[[C0]]]
   // MATMUL-SAME: [%[[V0]], %[[V1]]
-  // MATMUL: %[[T4:.*]] = linalg.pad_tensor %{{.*}} nofold
-  // MATMUL: %[[T5:.*]] = linalg.pad_tensor %{{.*}} low
+  // MATMUL: %[[T4:.*]] = tensor.pad %{{.*}} nofold
+  // MATMUL: %[[T5:.*]] = tensor.pad %{{.*}} low
   // Check the dynamic matmul has been erased.
   // MATMUL-NOT: = linalg.matmul {{.*}} tensor
@@ -172,7 +172,7 @@
   %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor
   // Check both fill operations are padded by the same pad tensor operation.
- // FILL: %[[T0:.*]] = linalg.pad_tensor + // FILL: %[[T0:.*]] = tensor.pad // FILL: %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]]) // FILL: %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]]) // FILL: = tensor.extract_slice %[[T2]] @@ -197,20 +197,20 @@ // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]] // MATMUL-SAME: [0, 0] // MATMUL-SAME: [%[[SIZE]], %[[SIZE]]] - // MATMUL: %[[T1:.*]] = linalg.pad_tensor %[[T0]] + // MATMUL: %[[T1:.*]] = tensor.pad %[[T0]] // MATMUL: %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]] // MATMUL: %[[T3:.*]] = linalg.fill(%{{.*}}, %[[T2]] %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor - %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] { + %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] { ^bb0(%arg3: index, %arg4: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor to tensor<64x64xf32> %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32> %3 = linalg.fill(%cst, %2) : f32, tensor<64x64xf32> -> tensor<64x64xf32> %4 = tensor.extract_slice %3[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor // Check there are no additional pad tensor operations. - // MATMUL-NOT: linalg.pad_tensor + // MATMUL-NOT: tensor.pad // Check the matmul directly uses the result of the fill operation. // MATMUL: %[[T4:.*]] = linalg.matmul ins(%[[T3]] @@ -233,16 +233,16 @@ %cst = arith.constant 42.0 : f32 %size = affine.min #map0()[%iv0] %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor - %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] { + %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] { ^bb0(%arg3: index, %arg4: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor to tensor<64x64xf32> %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32> %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor // Different padding values prevent composing the paddings (42.0 vs. 0.0). // MATMUL: = linalg.fill - // MATMUL: = linalg.pad_tensor + // MATMUL: = tensor.pad // MATMUL: = linalg.matmul %5 = linalg.matmul ins(%4, %4 : tensor, tensor) outs(%4 : tensor) -> tensor return %5 : tensor @@ -258,16 +258,16 @@ %cst = arith.constant 0.0 : f32 %size = affine.min #map0()[%iv0] %0 = tensor.extract_slice %arg0[0, 0] [%iv0, %iv0] [1, 1] : tensor<64x64xf32> to tensor - %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] { + %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] { ^bb0(%arg3: index, %arg4: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor to tensor<64x64xf32> %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32> %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor // Different dynamic sizes prevent composing the paddings (%iv0 vs %size). 
// MATMUL: = linalg.fill - // MATMUL: = linalg.pad_tensor + // MATMUL: = tensor.pad // MATMUL: = linalg.matmul %5 = linalg.matmul ins(%4, %4 : tensor, tensor) outs(%4 : tensor) -> tensor return %5 : tensor @@ -283,16 +283,16 @@ %cst = arith.constant 0.0 : f32 %size = affine.min #map0()[%iv0] %0 = tensor.extract_slice %arg0[0, 0, 0] [%size, %size, 1] [1, 1, 1] : tensor<64x64x1xf32> to tensor - %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] { + %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] { ^bb0(%arg3: index, %arg4: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor to tensor<64x64xf32> %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32> %3 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor // Different dynamic ranks prevent composing the paddings ([%size, %size, 1] vs [%size, %size]). // MATMUL: = linalg.fill - // MATMUL: = linalg.pad_tensor + // MATMUL: = tensor.pad // MATMUL: = linalg.matmul %4 = linalg.matmul ins(%3, %3 : tensor, tensor) outs(%3 : tensor) -> tensor return %4 : tensor @@ -308,16 +308,16 @@ %cst = arith.constant 0.0 : f32 %size = affine.min #map0()[%iv0] %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor - %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] { + %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] { ^bb0(%arg3: index, %arg4: index): - linalg.yield %cst : f32 + tensor.yield %cst : f32 } : tensor to tensor<62x62xf32> %2 = linalg.fill(%cst, %1) : f32, tensor<62x62xf32> -> tensor<62x62xf32> %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor // Different static sizes prevent composing the paddings (62 vs 64 derived from #map0). // MATMUL: = linalg.fill - // MATMUL: = linalg.pad_tensor + // MATMUL: = tensor.pad // MATMUL: = linalg.matmul %5 = linalg.matmul ins(%4, %4 : tensor, tensor) outs(%4 : tensor) -> tensor return %5 : tensor @@ -336,7 +336,7 @@ %0 = affine.min #map0()[%iv0] // FILL: %[[T0:.*]] = tensor.extract_slice %[[ARG1]] - // FILL: %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold + // FILL: %[[T1:.*]] = tensor.pad %[[T0]] nofold %1 = tensor.extract_slice %arg1[0, 0] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32> // Check only the fill output operand is padded. @@ -361,8 +361,8 @@ %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor // Check the matmul inputs are padded despite the missing slice for the static output. - // MATMUL: %[[T0:.*]] = linalg.pad_tensor - // MATMUL: %[[T1:.*]] = linalg.pad_tensor + // MATMUL: %[[T0:.*]] = tensor.pad + // MATMUL: %[[T1:.*]] = tensor.pad // MATMUL: = linalg.matmul ins(%[[T0]], %[[T1]] // MATMUL-SAME: outs(%[[ARG2]] %3 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor) outs(%arg2 : tensor<4x5xf32>) -> tensor<4x5xf32> @@ -414,8 +414,8 @@ %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32> // Check the matmul inputs are padded despite the failure to compute a padding value for the static output. 
- // INPUTS-ONLY: %[[T1:.*]] = linalg.pad_tensor - // INPUTS-ONLY: %[[T2:.*]] = linalg.pad_tensor + // INPUTS-ONLY: %[[T1:.*]] = tensor.pad + // INPUTS-ONLY: %[[T2:.*]] = tensor.pad // INPUTS-ONLY: = linalg.matmul ins(%[[T1]], %[[T2]] // INPUTS-ONLY-SAME: outs(%[[T0]] %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32> @@ -465,7 +465,7 @@ %0 = tensor.extract_slice %arg0[0, 0, 0, 0] [1, %size, 1, %size] [1, 1, 1, 1] : tensor<1x64x1x64xf32> to tensor<1x?x?xf32> // Check the fill is padded despite the rank-reducing slice operation. - // FILL: %[[T0:.*]] = linalg.pad_tensor + // FILL: %[[T0:.*]] = tensor.pad // FILL: %[[T1:.*]] = linalg.fill(%{{.*}}, %[[T0]]) // FILL-SAME: tensor<1x64x64xf32> // FILL: = tensor.extract_slice %[[T1]] diff --git a/mlir/test/Dialect/Linalg/pad_fusion.mlir b/mlir/test/Dialect/Linalg/pad_fusion.mlir --- a/mlir/test/Dialect/Linalg/pad_fusion.mlir +++ b/mlir/test/Dialect/Linalg/pad_fusion.mlir @@ -15,9 +15,9 @@ %1 = arith.mulf %arg6, %arg6 : f32 linalg.yield %1 : f32 } -> tensor - %1 = linalg.pad_tensor %0 low [%arg1, %arg2] high [%arg3, %arg4] { + %1 = tensor.pad %0 low [%arg1, %arg2] high [%arg3, %arg4] { ^bb0(%arg6: index, %arg7 : index): - linalg.yield %arg5 : f32 + tensor.yield %arg5 : f32 } : tensor to tensor return %1 : tensor } @@ -64,9 +64,9 @@ %1 = arith.mulf %arg4, %arg4 : f32 linalg.yield %1 : f32 } -> tensor<42x?xf32> - %1 = linalg.pad_tensor %0 low [3, %arg1] high [4, %arg2] { + %1 = tensor.pad %0 low [3, %arg1] high [4, %arg2] { ^bb0(%arg4: index, %arg5 : index): - linalg.yield %arg3 : f32 + tensor.yield %arg3 : f32 } : tensor<42x?xf32> to tensor<49x?xf32> return %1 : tensor<49x?xf32> } diff --git a/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir b/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir --- a/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir +++ b/mlir/test/Dialect/Linalg/resolve-shaped-type-result-dims.mlir @@ -253,9 +253,9 @@ %c3 = arith.constant 3 : index %c4 = arith.constant 4 : index %c5 = arith.constant 5 : index - %0 = linalg.pad_tensor %arg0 low[%c3, %arg1, %c4] high[7, %c5, %arg2] { + %0 = tensor.pad %arg0 low[%c3, %arg1, %c4] high[7, %c5, %arg2] { ^bb0(%arg4: index, %arg5: index, %arg6: index): - linalg.yield %arg3 : f32 + tensor.yield %arg3 : f32 } : tensor<2x?x?xf32> to tensor %1 = tensor.dim %0, %c0 : tensor %2 = tensor.dim %0, %c1 : tensor diff --git a/mlir/test/Dialect/Linalg/roundtrip.mlir b/mlir/test/Dialect/Linalg/roundtrip.mlir --- a/mlir/test/Dialect/Linalg/roundtrip.mlir +++ b/mlir/test/Dialect/Linalg/roundtrip.mlir @@ -15,77 +15,6 @@ // CHECK-DAG: #[[$strided3D:.*]] = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2 + d2)> // CHECK-DAG: #[[$strided3DT:.*]] = affine_map<(d0, d1, d2)[s0, s1, s2] -> (d2 * s1 + s0 + d1 * s2 + d0)> -func @pad_dynamic(%arg0: tensor<1x2x2x?xf32>, %low: index, %high: index, - %pad_value: f32) -> tensor<6x?x?x?xf32> { - %0 = linalg.pad_tensor %arg0 low[2, %low, 3, 3] high[3, 3, %high, 2] { - ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): - linalg.yield %pad_value : f32 - } : tensor<1x2x2x?xf32> to tensor<6x?x?x?xf32> - return %0 : tensor<6x?x?x?xf32> -} -// CHECK-LABEL: func @pad_dynamic -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]] -// CHECK-SAME: %[[LOW:[a-zA-Z0-9_]*]] -// CHECK-SAME: %[[HIGH:[a-zA-Z0-9_]*]] -// CHECK: linalg.pad_tensor %[[ARG0]] -// CHECK-SAME: low[2, %[[LOW]], 3, 3] -// CHECK-SAME: high[3, 3, %[[HIGH]], 2] -// CHECK: : tensor<1x2x2x?xf32> to tensor<6x?x?x?xf32> - 
-// ----- - -func @pad_static(%arg0: tensor<3x4xf32>, %pad_value: f32) -> tensor<6x9xf32> { - %0 = linalg.pad_tensor %arg0 low[1, 2] high[2, 3] { - ^bb0(%arg1 : index, %arg2 : index): - linalg.yield %pad_value : f32 - } : tensor<3x4xf32> to tensor<6x9xf32> - return %0 : tensor<6x9xf32> -} -// CHECK-LABEL: func @pad_static -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]] -// CHECK: linalg.pad_tensor %[[ARG0]] low[1, 2] high[2, 3] -// CHECK: : tensor<3x4xf32> to tensor<6x9xf32> - -// ----- - -func @pad_asymmetrical(%arg0: tensor<2x3xf32>, %ub0: index, %ub1: index, - %pad_value: f32) -> tensor { - %0 = linalg.pad_tensor %arg0 low[0, 0] high[%ub0, %ub1] { - ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad_value : f32 - } : tensor<2x3xf32> to tensor - return %0 : tensor -} -// CHECK-LABEL: func @pad_asymmetrical -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]] -// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]*]] -// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]*]] -// CHECK: linalg.pad_tensor %[[ARG0]] -// CHECK-SAME: low[0, 0] -// CHECK-SAME: high[%[[UB0]], %[[UB1]]] -// CHECK: : tensor<2x3xf32> to tensor - -// ----- - -func @pad_to_static_size(%arg0: tensor, %ub0: index, %ub1: index, - %pad_value: f32) -> tensor<2x3xf32> { - %0 = linalg.pad_tensor %arg0 low[0, 0] high[%ub0, %ub1] { - ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad_value : f32 - } : tensor to tensor<2x3xf32> - return %0 : tensor<2x3xf32> -} -// CHECK-LABEL: func @pad_to_static_size -// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]] -// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]*]] -// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]*]] -// CHECK: linalg.pad_tensor %[[ARG0]] -// CHECK-SAME: low[0, 0] -// CHECK-SAME: high[%[[UB0]], %[[UB1]]] -// CHECK: : tensor to tensor<2x3xf32> - -// ----- - func @views(%arg0: index) { %c0 = arith.constant 0 : index %0 = arith.muli %arg0, %arg0 : index diff --git a/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir b/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir --- a/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir +++ b/mlir/test/Dialect/Linalg/subtensor-of-padtensor.mlir @@ -6,9 +6,9 @@ // CHECK: return %[[RESULT]] func @static_data_only(%arg0 : tensor<4x5xf32>, %pad : f32) -> tensor<2x1xf32> { - %0 = linalg.pad_tensor %arg0 low[0, 0] high[7, 8] { + %0 = tensor.pad %arg0 low[0, 0] high[7, 8] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad : f32 + tensor.yield %pad : f32 } : tensor<4x5xf32> to tensor<11x13xf32> %1 = tensor.extract_slice %0[1, 2] [2, 1] [1, 1] : tensor<11x13xf32> to tensor<2x1xf32> return %1 : tensor<2x1xf32> @@ -18,16 +18,16 @@ // CHECK-LABEL: @static_high_pad_only // CHECK-SAME: %[[ARG0:.*]]: tensor<4x5xf32>, %[[PAD:.*]]: f32 -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK-NOT: tensor.extract_slice // CHECK: %[[RESULT:.*]] = tensor.generate // CHECK: tensor.yield %[[PAD]] // CHECK: return %[[RESULT]] : tensor<2x4xf32> func @static_high_pad_only(%arg0 : tensor<4x5xf32>, %pad : f32) -> tensor<2x4xf32> { - %0 = linalg.pad_tensor %arg0 low[0, 0] high[7, 8] { + %0 = tensor.pad %arg0 low[0, 0] high[7, 8] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad : f32 + tensor.yield %pad : f32 } : tensor<4x5xf32> to tensor<11x13xf32> %1 = tensor.extract_slice %0[4, 5] [2, 4] [1, 1] : tensor<11x13xf32> to tensor<2x4xf32> return %1 : tensor<2x4xf32> @@ -37,16 +37,16 @@ // CHECK-LABEL: @static_low_pad_only // CHECK-SAME: %[[ARG0:.*]]: tensor<4x5xf32>, %[[PAD:.*]]: f32 -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK-NOT: tensor.extract_slice // CHECK: %[[RESULT:.*]] = tensor.generate // CHECK: 
tensor.yield %[[PAD]] // CHECK: return %[[RESULT]] : tensor<2x3xf32> func @static_low_pad_only(%arg0 : tensor<4x5xf32>, %pad : f32) -> tensor<2x3xf32> { - %0 = linalg.pad_tensor %arg0 low[3, 7] high[7, 8] { + %0 = tensor.pad %arg0 low[3, 7] high[7, 8] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad : f32 + tensor.yield %pad : f32 } : tensor<4x5xf32> to tensor<14x20xf32> %1 = tensor.extract_slice %0[1, 3] [2, 3] [1, 1] : tensor<14x20xf32> to tensor<2x3xf32> return %1 : tensor<2x3xf32> @@ -56,16 +56,16 @@ // CHECK-LABEL: @static_low_pad_only_2 // CHECK-SAME: %[[ARG0:.*]]: tensor<4x5xf32>, %[[PAD:.*]]: f32 -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK-NOT: tensor.extract_slice // CHECK: %[[RESULT:.*]] = tensor.generate // CHECK: tensor.yield %[[PAD]] // CHECK: return %[[RESULT]] : tensor<1x3xf32> func @static_low_pad_only_2(%arg0 : tensor<4x5xf32>, %pad : f32) -> tensor<1x3xf32> { - %0 = linalg.pad_tensor %arg0 low[3, 7] high[7, 8] { + %0 = tensor.pad %arg0 low[3, 7] high[7, 8] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad : f32 + tensor.yield %pad : f32 } : tensor<4x5xf32> to tensor<14x20xf32> %1 = tensor.extract_slice %0[1, 3] [1, 3] [1, 1] : tensor<14x20xf32> to tensor<1x3xf32> return %1 : tensor<1x3xf32> @@ -75,16 +75,16 @@ // CHECK-LABEL: @static_mixed_data_high_pad // CHECK-SAME: %[[ARG0:.*]]: tensor<4x5xf32>, %[[PAD:.*]]: f32 -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK: %[[SUBTENSOR:.*]] = tensor.extract_slice %[[ARG0]][2, 4] [2, 1] [1, 1] : tensor<4x5xf32> to tensor<2x1xf32> -// CHECK: %[[RESULT:.*]] = linalg.pad_tensor %[[SUBTENSOR]] low[0, 0] high[1, 3] -// CHECK: linalg.yield %[[PAD]] +// CHECK: %[[RESULT:.*]] = tensor.pad %[[SUBTENSOR]] low[0, 0] high[1, 3] +// CHECK: tensor.yield %[[PAD]] // CHECK: return %[[RESULT]] : tensor<3x4xf32> func @static_mixed_data_high_pad(%arg0 : tensor<4x5xf32>, %pad : f32) -> tensor<3x4xf32> { - %0 = linalg.pad_tensor %arg0 low[0, 0] high[7, 8] { + %0 = tensor.pad %arg0 low[0, 0] high[7, 8] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad : f32 + tensor.yield %pad : f32 } : tensor<4x5xf32> to tensor<11x13xf32> %1 = tensor.extract_slice %0[2, 4] [3, 4] [1, 1] : tensor<11x13xf32> to tensor<3x4xf32> return %1 : tensor<3x4xf32> @@ -94,16 +94,16 @@ // CHECK-LABEL: @static_mixed_data_low_pad // CHECK-SAME: %[[ARG0:.*]]: tensor<4x5xf32>, %[[PAD:.*]]: f32 -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK: %[[SUBTENSOR:.*]] = tensor.extract_slice %[[ARG0]][0, 0] [2, 1] [1, 1] : tensor<4x5xf32> to tensor<2x1xf32> -// CHECK: %[[RESULT:.*]] = linalg.pad_tensor %[[SUBTENSOR]] low[1, 3] high[0, 0] -// CHECK: linalg.yield %[[PAD]] +// CHECK: %[[RESULT:.*]] = tensor.pad %[[SUBTENSOR]] low[1, 3] high[0, 0] +// CHECK: tensor.yield %[[PAD]] // CHECK: return %[[RESULT]] : tensor<3x4xf32> func @static_mixed_data_low_pad(%arg0 : tensor<4x5xf32>, %pad : f32) -> tensor<3x4xf32> { - %0 = linalg.pad_tensor %arg0 low[3, 7] high[7, 8] { + %0 = tensor.pad %arg0 low[3, 7] high[7, 8] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad : f32 + tensor.yield %pad : f32 } : tensor<4x5xf32> to tensor<14x20xf32> %1 = tensor.extract_slice %0[2, 4] [3, 4] [1, 1] : tensor<14x20xf32> to tensor<3x4xf32> return %1 : tensor<3x4xf32> @@ -113,15 +113,15 @@ // CHECK-LABEL: @static_mixed_data_low_high_pad // CHECK-SAME: %[[ARG0:.*]]: tensor<4x5xf32>, %[[PAD:.*]]: f32 -// CHECK-NOT: linalg.pad_tensor -// CHECK: %[[RESULT:.*]] = linalg.pad_tensor %[[ARG0]] low[1, 1] high[2, 3] -// CHECK: linalg.yield 
%[[PAD]] +// CHECK-NOT: tensor.pad +// CHECK: %[[RESULT:.*]] = tensor.pad %[[ARG0]] low[1, 1] high[2, 3] +// CHECK: tensor.yield %[[PAD]] // CHECK: return %[[RESULT]] : tensor<7x9xf32> func @static_mixed_data_low_high_pad(%arg0 : tensor<4x5xf32>, %pad : f32) -> tensor<7x9xf32> { - %0 = linalg.pad_tensor %arg0 low[2, 3] high[7, 8] { + %0 = tensor.pad %arg0 low[2, 3] high[7, 8] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad : f32 + tensor.yield %pad : f32 } : tensor<4x5xf32> to tensor<13x16xf32> %1 = tensor.extract_slice %0[1, 2] [7, 9] [1, 1] : tensor<13x16xf32> to tensor<7x9xf32> return %1 : tensor<7x9xf32> @@ -131,7 +131,7 @@ // CHECK-LABEL: @dynamic_high_pad // CHECK-SAME: %[[ARG0:.*]]: tensor -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: tensor.dim %[[ARG0]], %[[C0]] // CHECK: %[[RESULT:.*]] = scf.if %{{.*}} -> (tensor<3x4xf32>) { @@ -139,14 +139,14 @@ // CHECK: scf.yield %[[GEN]] // CHECK: } else { // CHECK: %[[SUBTENSOR:.*]] = tensor.extract_slice %[[ARG0]][%{{.*}}, 4] [%{{.*}}, 1] [1, 1] : tensor to tensor -// CHECK: %[[PADTENSOR:.*]] = linalg.pad_tensor %[[SUBTENSOR]] low[0, 0] high[%{{.*}}, 3] +// CHECK: %[[PADTENSOR:.*]] = tensor.pad %[[SUBTENSOR]] low[0, 0] high[%{{.*}}, 3] // CHECK: scf.yield %[[PADTENSOR]] // CHECK: } // CHECK: return %[[RESULT]] func @dynamic_high_pad(%arg0 : tensor, %h1: index, %pad : f32) -> tensor<3x4xf32> { - %0 = linalg.pad_tensor %arg0 low[0, 0] high[%h1, 8] { + %0 = tensor.pad %arg0 low[0, 0] high[%h1, 8] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad : f32 + tensor.yield %pad : f32 } : tensor to tensor %1 = tensor.extract_slice %0[2, 4] [3, 4] [1, 1] : tensor to tensor<3x4xf32> return %1 : tensor<3x4xf32> @@ -156,7 +156,7 @@ // CHECK-LABEL: @dynamic_extract_size // CHECK-SAME: %[[ARG0:.*]]: tensor, %[[ARG1:.*]]: index -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: tensor.dim %[[ARG0]], %[[C0]] // CHECK: %[[RESULT:.*]] = scf.if %{{.*}} -> (tensor) { @@ -164,14 +164,14 @@ // CHECK: scf.yield %[[GEN]] // CHECK: } else { // CHECK: %[[SUBTENSOR:.*]] = tensor.extract_slice %[[ARG0]][%{{.*}}, 4] [%{{.*}}, 1] [1, 1] : tensor to tensor -// CHECK: %[[PADTENSOR:.*]] = linalg.pad_tensor %[[SUBTENSOR]] low[0, 0] high[%{{.*}}, 3] +// CHECK: %[[PADTENSOR:.*]] = tensor.pad %[[SUBTENSOR]] low[0, 0] high[%{{.*}}, 3] // CHECK: scf.yield %[[PADTENSOR]] // CHECK: } // CHECK: return %[[RESULT]] func @dynamic_extract_size(%arg0 : tensor, %s1: index, %pad : f32) -> tensor { - %0 = linalg.pad_tensor %arg0 low[0, 0] high[7, 8] { + %0 = tensor.pad %arg0 low[0, 0] high[7, 8] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad : f32 + tensor.yield %pad : f32 } : tensor to tensor %1 = tensor.extract_slice %0[2, 4] [%s1, 4] [1, 1] : tensor to tensor return %1 : tensor @@ -184,14 +184,14 @@ // CHECK: tensor.generate // CHECK: else // CHECK: %[[SLICE:.*]] = tensor.extract_slice -// CHECK: linalg.pad_tensor %[[SLICE]] low[0, 0] +// CHECK: tensor.pad %[[SLICE]] low[0, 0] func @dynamic_zero_low_padding(%arg0 : tensor, %pad : f32, %o1 : index, %o2 : index, %s1 : index, %s2 : index) -> tensor { - %0 = linalg.pad_tensor %arg0 low[0, 0] high[7, 8] { + %0 = tensor.pad %arg0 low[0, 0] high[7, 8] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad : f32 + tensor.yield %pad : f32 } : tensor to tensor %1 = tensor.extract_slice %0[%o1, %o2] [%s1, %s2] [1, 1] : tensor to tensor return %1 : tensor @@ -204,14 +204,14 @@ // CHECK: 
tensor.generate // CHECK: else // CHECK: %[[SLICE:.*]] = tensor.extract_slice -// CHECK: linalg.pad_tensor %[[SLICE]] low[%{{.*}}, %{{.*}}] high[0, 0] +// CHECK: tensor.pad %[[SLICE]] low[%{{.*}}, %{{.*}}] high[0, 0] func @dynamic_zero_high_padding(%arg0 : tensor, %pad : f32, %o1 : index, %o2 : index, %s1 : index, %s2 : index) -> tensor { - %0 = linalg.pad_tensor %arg0 low[7, 8] high[0, 0] { + %0 = tensor.pad %arg0 low[7, 8] high[0, 0] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad : f32 + tensor.yield %pad : f32 } : tensor to tensor %1 = tensor.extract_slice %0[%o1, %o2] [%s1, %s2] [1, 1] : tensor to tensor return %1 : tensor diff --git a/mlir/test/Dialect/Linalg/tile-and-fuse-tensors.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-tensors.mlir --- a/mlir/test/Dialect/Linalg/tile-and-fuse-tensors.mlir +++ b/mlir/test/Dialect/Linalg/tile-and-fuse-tensors.mlir @@ -288,7 +288,7 @@ // CHECK: tensor.generate // CHECK: else // CHECK: tensor.extract_slice -// CHECK: linalg.pad_tensor +// CHECK: tensor.pad // CHECK: tensor.extract_slice // CHECK: tensor.extract_slice // CHECK: linalg.generic @@ -303,9 +303,9 @@ %d0 = tensor.dim %large_input, %c0 : tensor<64x128xf32> %d1 = tensor.dim %large_input, %c1 : tensor<64x128xf32> - %pad = linalg.pad_tensor %small_input low[4, 60] high[2, 67] { + %pad = tensor.pad %small_input low[4, 60] high[2, 67] { ^bb0(%arg0: index, %arg1: index): - linalg.yield %zero : f32 + tensor.yield %zero : f32 } : tensor<58x1xf32> to tensor<64x128xf32> %fill = linalg.fill(%zero, %large_input) : f32, tensor<64x128xf32> -> tensor<64x128xf32> diff --git a/mlir/test/Dialect/Linalg/tile-pad-tensor-op.mlir b/mlir/test/Dialect/Linalg/tile-pad-tensor-op.mlir --- a/mlir/test/Dialect/Linalg/tile-pad-tensor-op.mlir +++ b/mlir/test/Dialect/Linalg/tile-pad-tensor-op.mlir @@ -23,7 +23,7 @@ // TILE2: tensor.generate // TILE2: else // TILE2: %[[SLICE:.*]] = tensor.extract_slice %[[IN]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1] -// TILE2: %[[PAD:.*]] = linalg.pad_tensor %[[SLICE]] +// TILE2: %[[PAD:.*]] = tensor.pad %[[SLICE]] // TILE2: tensor.insert_slice %[[SWAP_RESULT]] into %[[INNER_OUT]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1] // TILE2: return %[[RESULT]] @@ -43,15 +43,15 @@ // TILE1: tensor.generate // TILE1: else // TILE1: %[[SLICE:.*]] = tensor.extract_slice %[[IN]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1] -// TILE1: %[[PAD:.*]] = linalg.pad_tensor %[[SLICE]] low[3, %{{.*}}] high[{{.*}}, {{.*}}] +// TILE1: %[[PAD:.*]] = tensor.pad %[[SLICE]] low[3, %{{.*}}] high[{{.*}}, {{.*}}] // TILE1: tensor.insert_slice %[[SWAP_RESULT]] into %[[INNER_OUT]][0, {{.*}}] [%[[DIM0]], {{.*}}] [1, 1] // TILE1: return %[[RESULT]] func @dynamic_pad_tensor(%input_tensor: tensor, %pad_value: f32) -> tensor { - %0 = linalg.pad_tensor %input_tensor low[3, 4] high[5, 3] { + %0 = tensor.pad %input_tensor low[3, 4] high[5, 3] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad_value : f32 + tensor.yield %pad_value : f32 } : tensor to tensor return %0 : tensor } @@ -71,7 +71,7 @@ // TILE2: tensor.generate // TILE2: else // TILE2: %[[SLICE:.*]] = tensor.extract_slice %[[IN]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1] -// TILE2: %[[PAD:.*]] = linalg.pad_tensor %[[SLICE]] +// TILE2: %[[PAD:.*]] = tensor.pad %[[SLICE]] // TILE2: tensor.insert_slice %[[SWAP_RESULT]] into %[[INNER_OUT]][{{.*}}, {{.*}}] [{{.*}}, {{.*}}] [1, 1] // TILE2: return %[[RESULT]] @@ -86,15 +86,15 @@ // TILE1: tensor.generate // TILE1: else // TILE1: %[[SLICE:.*]] = tensor.extract_slice %[[IN]][0, {{.*}}] [7, {{.*}}] [1, 1] -// TILE1: 
%[[PAD:.*]] = linalg.pad_tensor %[[SLICE]] low[3, %{{.*}}] high[5, {{.*}}] +// TILE1: %[[PAD:.*]] = tensor.pad %[[SLICE]] low[3, %{{.*}}] high[5, {{.*}}] // TILE1: tensor.insert_slice %[[SWAP_RESULT]] into %[[INNER_OUT]][0, {{.*}}] [15, {{.*}}] [1, 1] // TILE1: return %[[RESULT]] func @static_pad_tensor(%input_tensor: tensor<7x9xf32>, %pad_value: f32) -> tensor<15x16xf32> { - %0 = linalg.pad_tensor %input_tensor low[3, 4] high[5, 3] { + %0 = tensor.pad %input_tensor low[3, 4] high[5, 3] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad_value : f32 + tensor.yield %pad_value : f32 } : tensor<7x9xf32> to tensor<15x16xf32> return %0 : tensor<15x16xf32> } @@ -112,7 +112,7 @@ // TILE1: scf.yield %[[GEN]] : tensor<14x3xf32> // TILE1: else // TILE1: %[[SLICE:.*]] = tensor.extract_slice %arg0[0, %{{.*}}] [7, %{{.*}}] [1, 1] : tensor<7x9xf32> to tensor<7x?xf32> -// TILE1: %[[PAD:.*]] = linalg.pad_tensor %[[SLICE]] low[0, 0] high[7, %{{.*}}] +// TILE1: %[[PAD:.*]] = tensor.pad %[[SLICE]] low[0, 0] high[7, %{{.*}}] // TILE1: scf.yield %[[PAD]] : tensor<14x3xf32> // TILE1: %[[R3:.*]] = tensor.insert_slice %[[R2]] into %[[INNER_OUT]][0, %[[IV]]] [14, 3] [1, 1] : tensor<14x3xf32> into tensor<14x15xf32> // TILE1: scf.yield %[[R3]] : tensor<14x15xf32> @@ -120,9 +120,9 @@ func @static_pad_tile_evenly(%input_tensor: tensor<7x9xf32>, %output_tensor: tensor<14x15xf32>, %pad_value: f32) -> tensor<14x15xf32> { - %0 = linalg.pad_tensor %input_tensor low[0, 0] high[7, 6] { + %0 = tensor.pad %input_tensor low[0, 0] high[7, 6] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %pad_value : f32 + tensor.yield %pad_value : f32 } : tensor<7x9xf32> to tensor<14x15xf32> return %0 : tensor<14x15xf32> } diff --git a/mlir/test/Dialect/Linalg/vectorization.mlir b/mlir/test/Dialect/Linalg/vectorization.mlir --- a/mlir/test/Dialect/Linalg/vectorization.mlir +++ b/mlir/test/Dialect/Linalg/vectorization.mlir @@ -537,7 +537,7 @@ // CHECK-LABEL: func @pad_static( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x?x2xf32>, %[[PAD:.*]]: f32 -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[INIT:.*]] = linalg.init_tensor [2, 3, 4] : tensor<2x3x4xf32> @@ -547,9 +547,9 @@ // CHECK: %[[RESULT:.*]] = vector.transfer_write %[[READ]], %[[FILL]][%[[C0]], %[[C0]], %[[C2]]] {in_bounds = [true, true, true]} : vector<2x3x2xf32>, tensor<2x3x4xf32> // CHECK: return %[[RESULT]] func @pad_static(%arg0: tensor<2x?x2xf32>, %pad_value: f32) -> tensor<2x3x4xf32> { - %0 = linalg.pad_tensor %arg0 low[0, 0, 2] high[0, 1, 0] { + %0 = tensor.pad %arg0 low[0, 0, 2] high[0, 1, 0] { ^bb0(%arg1: index, %arg2: index, %arg3: index): - linalg.yield %pad_value : f32 + tensor.yield %pad_value : f32 } : tensor<2x?x2xf32> to tensor<2x3x4xf32> return %0 : tensor<2x3x4xf32> } @@ -558,7 +558,7 @@ // CHECK-LABEL: func @pad_static_source( // CHECK-SAME: %[[ARG0:.*]]: tensor<2x5x2xf32>, %[[PAD:.*]]: f32 -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK: %[[INIT:.*]] = linalg.init_tensor [2, 6, 4] : tensor<2x6x4xf32> @@ -568,9 +568,9 @@ // CHECK: %[[WRITE:.*]] = vector.transfer_write %[[READ]], %[[FILL]][%[[C0]], %[[C0]], %[[C2]]] {in_bounds = [true, true, true]} : vector<2x5x2xf32>, tensor<2x6x4xf32> // CHECK: return %[[WRITE]] func @pad_static_source(%arg0: tensor<2x5x2xf32>, %pad_value: f32) -> tensor<2x6x4xf32> { - %0 = 
linalg.pad_tensor %arg0 low[0, 0, 2] high[0, 1, 0] { + %0 = tensor.pad %arg0 low[0, 0, 2] high[0, 1, 0] { ^bb0(%arg1: index, %arg2: index, %arg3: index): - linalg.yield %pad_value : f32 + tensor.yield %pad_value : f32 } : tensor<2x5x2xf32> to tensor<2x6x4xf32> return %0 : tensor<2x6x4xf32> } @@ -579,7 +579,7 @@ // CHECK-LABEL: func @pad_static_dynamic( // CHECK-SAME: %[[SRC:.*]]: tensor<1x2x2x?xf32>, %[[LOW:.*]]: index, %[[HIGH:.*]]: index -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[C5:.*]] = arith.constant 5 : index @@ -596,9 +596,9 @@ // CHECK: return %[[RESULT]] func @pad_static_dynamic(%arg0: tensor<1x2x2x?xf32>, %low: index, %high: index, %pad_value: f32) -> tensor<6x?x?x?xf32> { - %0 = linalg.pad_tensor %arg0 low[2, %low, 3, 3] high[3, 3, %high, 2] { + %0 = tensor.pad %arg0 low[2, %low, 3, 3] high[3, 3, %high, 2] { ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): - linalg.yield %pad_value : f32 + tensor.yield %pad_value : f32 } : tensor<1x2x2x?xf32> to tensor<6x?x?x?xf32> return %0 : tensor<6x?x?x?xf32> } @@ -607,7 +607,7 @@ // CHECK-LABEL: func @pad_and_transfer_read // CHECK-SAME: %[[ARG0:.*]]: tensor<5x6xf32> -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C5:.*]] = arith.constant 5.0 // CHECK: %[[RESULT:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C0]]], %[[C5]] : tensor<5x6xf32>, vector<7x9xf32> @@ -616,9 +616,9 @@ %c0 = arith.constant 0 : index %c5 = arith.constant 5.0 : f32 %c6 = arith.constant 6.0 : f32 - %0 = linalg.pad_tensor %arg0 low[0, 0] high[5, 7] { + %0 = tensor.pad %arg0 low[0, 0] high[5, 7] { ^bb0(%arg1: index, %arg2: index): - linalg.yield %c5 : f32 + tensor.yield %c5 : f32 } : tensor<5x6xf32> to tensor<10x13xf32> %1 = vector.transfer_read %0[%c0, %c0], %c6 : tensor<10x13xf32>, vector<7x9xf32> @@ -631,7 +631,7 @@ // CHECK-LABEL: func @pad_and_transfer_write_static // CHECK-SAME: %[[ARG0:.*]]: tensor<5x6xf32> -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[VEC0:.*]] = call @make_vector() : () -> vector<7x9xf32> // CHECK: %[[RESULT:.*]] = vector.transfer_write %[[VEC0]], %[[ARG0]][%[[C0]], %[[C0]]] : vector<7x9xf32>, tensor<5x6xf32> @@ -640,9 +640,9 @@ %arg0: tensor<5x6xf32>) -> tensor<5x6xf32> { %c0 = arith.constant 0 : index %c5 = arith.constant 5.0 : f32 - %0 = linalg.pad_tensor %arg0 low[0, 0] high[5, 7] { + %0 = tensor.pad %arg0 low[0, 0] high[5, 7] { ^bb0(%arg2: index, %arg3: index): - linalg.yield %c5 : f32 + tensor.yield %c5 : f32 } : tensor<5x6xf32> to tensor<10x13xf32> %1 = call @make_vector() : () -> vector<7x9xf32> %2 = vector.transfer_write %1, %0[%c0, %c0] @@ -657,7 +657,7 @@ // CHECK-LABEL: func @pad_and_transfer_write_dynamic_static // CHECK-SAME: %[[ARG0:.*]]: tensor, %[[SIZE:.*]]: index, %[[PADDING:.*]]: index -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[SUB:.*]] = tensor.extract_slice %[[ARG0]][0, 0] [%[[SIZE]], 6] [1, 1] : tensor to tensor // CHECK: %[[VEC0:.*]] = call @make_vector() : () -> vector<7x9xf32> @@ -669,9 +669,9 @@ %c5 = arith.constant 5.0 : f32 %s = tensor.extract_slice %arg0[0, 0] [%size, 6] [1, 1] : tensor to tensor - %0 = linalg.pad_tensor %s low[0, 0] high[%padding, 7] { + %0 = tensor.pad %s low[0, 0] high[%padding, 7] { ^bb0(%arg2: index, %arg3: index): - 
linalg.yield %c5 : f32 + tensor.yield %c5 : f32 } : tensor to tensor %1 = call @make_vector() : () -> vector<7x9xf32> %2 = vector.transfer_write %1, %0[%c0, %c0] @@ -686,7 +686,7 @@ // CHECK-LABEL: func @pad_and_insert_slice_source // CHECK-SAME: %[[ARG0:.*]]: tensor<5x6xf32> -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C5:.*]] = arith.constant 5.0 // CHECK: %[[VEC0:.*]] = call @make_vector() : () -> tensor<12x13xf32> @@ -697,9 +697,9 @@ %arg0: tensor<5x6xf32>) -> tensor<12x13xf32> { %c0 = arith.constant 0 : index %c5 = arith.constant 5.0 : f32 - %0 = linalg.pad_tensor %arg0 low[0, 0] high[2, 3] { + %0 = tensor.pad %arg0 low[0, 0] high[2, 3] { ^bb0(%arg2: index, %arg3: index): - linalg.yield %c5 : f32 + tensor.yield %c5 : f32 } : tensor<5x6xf32> to tensor<7x9xf32> %1 = call @make_vector() : () -> tensor<12x13xf32> %r = tensor.insert_slice %0 into %1[0, 0][7, 9][1, 1] : tensor<7x9xf32> into tensor<12x13xf32> @@ -717,9 +717,9 @@ func @pad_and_insert_slice_dest( %arg0: tensor<1x5x6xf32>) -> tensor<1x12x13xf32> { %c5 = arith.constant 5.0 : f32 - %0 = linalg.pad_tensor %arg0 low[0, 0, 0] high[0, 7, 7] { + %0 = tensor.pad %arg0 low[0, 0, 0] high[0, 7, 7] { ^bb0(%arg2: index, %arg3: index, %arg4: index): - linalg.yield %c5 : f32 + tensor.yield %c5 : f32 } : tensor<1x5x6xf32> to tensor<1x12x13xf32> %1 = call @make_vector() : () -> tensor<12x13xf32> %r = tensor.insert_slice %1 into %0[0, 0, 0][1, 12, 13][1, 1, 1] : tensor<12x13xf32> into tensor<1x12x13xf32> @@ -730,7 +730,7 @@ // CHECK-LABEL: func @pad_tensor_non_const_pad_value // CHECK-SAME: %[[ARG0:.*]]: tensor<5x6xf32> -// CHECK-NOT: linalg.pad_tensor +// CHECK-NOT: tensor.pad // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[C4:.*]] = arith.constant 4 : index @@ -743,14 +743,14 @@ func @pad_tensor_non_const_pad_value(%arg0: tensor<5x6xf32>) -> tensor<12x13xf32> { %c0 = arith.constant 0 : index %c5 = arith.constant 5.0 : f32 - %0 = linalg.pad_tensor %arg0 low[3, 4] high[4, 3] { + %0 = tensor.pad %arg0 low[3, 4] high[4, 3] { ^bb0(%arg1: index, %arg2: index): %i1 = arith.index_cast %arg1 : index to i32 %i2 = arith.index_cast %arg2 : index to i32 %f1 = arith.sitofp %i1 : i32 to f32 %f2 = arith.sitofp %i2 : i32 to f32 %m = arith.mulf %f1, %f2 : f32 - linalg.yield %m : f32 + tensor.yield %m : f32 } : tensor<5x6xf32> to tensor<12x13xf32> return %0 : tensor<12x13xf32> } diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -982,3 +982,199 @@ // CHECK-NEXT: return [[C3]] return %rank_0 : index } + +// ----- + +// CHECK-LABEL: func @pad_tensor_same_static_shape( +// CHECK-SAME: %[[ARG0:.*]]: tensor<5x6xf32> +// CHECK-NOT: tensor.pad +// CHECK: return %[[ARG0]] +func @pad_tensor_same_static_shape(%arg0: tensor<5x6xf32>, %a: index) + -> tensor<5x6xf32> { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.pad %arg0 low[%a, 0] high[0, %a] { + ^bb0(%arg1: index, %arg2: index): + tensor.yield %cst : f32 + } : tensor<5x6xf32> to tensor<5x6xf32> + return %0 : tensor<5x6xf32> +} + +// ----- + +// CHECK-LABEL: func @pad_tensor_nofold_same_static_shape( +// CHECK-SAME: %[[ARG0:.*]]: tensor<5x6xf32> +// CHECK: %[[PAD:.*]] = tensor.pad +// CHECK: return %[[PAD]] +func @pad_tensor_nofold_same_static_shape(%arg0: tensor<5x6xf32>, %a: index) + -> tensor<5x6xf32> { + 
%cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.pad %arg0 nofold low[%a, 0] high[0, %a] { + ^bb0(%arg1: index, %arg2: index): + tensor.yield %cst : f32 + } : tensor<5x6xf32> to tensor<5x6xf32> + return %0 : tensor<5x6xf32> +} + +// ----- + +// CHECK-LABEL: func @pad_tensor_after_cast_different_shape( +// CHECK-SAME: %[[INPUT:.*]]: tensor) -> tensor { +// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[PADDED:.*]] = tensor.pad %[[INPUT]] +// CHECK-SAME: low[0, 0, 1, 1] high[0, 0, 1, 1] { +// CHECK: ^bb0(%[[ARG1:.*]]: index, %[[ARG2:.*]]: index, %[[ARG3:.*]]: index, %[[ARG4:.*]]: index): +// CHECK: tensor.yield %[[CST]] : f32 +// CHECK: } : tensor to tensor +// CHECK: %[[DYNAMIC:.*]] = tensor.cast %[[PADDED:.*]] : +// CHECK-SAME: tensor to tensor +// CHECK: return %[[DYNAMIC]] : tensor +// CHECK: } +func @pad_tensor_after_cast_different_shape(%arg0: tensor) + -> tensor { + %cst = arith.constant 0.000000e+00 : f32 + %dynamic = tensor.cast %arg0 : tensor to tensor + %padded = tensor.pad %dynamic low[0, 0, 1, 1] high[0, 0, 1, 1] { + ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): + tensor.yield %cst: f32 + } : tensor to tensor + return %padded: tensor +} + +// ----- + +// CHECK-LABEL: func @pad_tensor_after_cast_same_shape( +// CHECK-SAME: %[[INPUT:.*]]: tensor, +// CHECK-SAME: %[[PADDING:.*]]: index) -> tensor { +// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK: %[[PADDED:.*]] = tensor.pad %[[INPUT]] +// CHECK-SAME: low[0, %[[PADDING]], 1, 1] high[0, %[[PADDING]], 1, 1] { +// CHECK: ^bb0(%[[ARG1:.*]]: index, %[[ARG2:.*]]: index, %[[ARG3:.*]]: index, %[[ARG4:.*]]: index): +// CHECK: tensor.yield %[[CST]] : f32 +// CHECK: } : tensor to tensor +// CHECK: return %[[PADDED:.*]] : tensor +// CHECK: } +func @pad_tensor_after_cast_same_shape(%arg0: tensor, %padding : index) + -> tensor { + %cst = arith.constant 0.000000e+00 : f32 + %dynamic = tensor.cast %arg0 : tensor to tensor + %padded = tensor.pad %dynamic low[0, %padding, 1, 1] high[0, %padding, 1, 1] { + ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index): + tensor.yield %cst: f32 + } : tensor to tensor + return %padded: tensor +} + +// ----- + +// CHECK-LABEL: func @pad_tensor_of_cast( +// CHECK-NOT: tensor.cast +// CHECK: tensor.pad +// CHECK: tensor<8x?xf32> to tensor<8x32xf32> +func @pad_tensor_of_cast(%t: tensor<8x?xf32>, %s: index) -> tensor<8x32xf32> { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.cast %t : tensor<8x?xf32> to tensor + %1 = tensor.pad %0 low[%c0, %c0] high[%c0, %s] { + ^bb0(%arg9: index, %arg10: index): + tensor.yield %cst : f32 + } : tensor to tensor<8x32xf32> + return %1 : tensor<8x32xf32> +} + +// ----- + +// CHECK-LABEL: @cast_of_pad_more_static +func @cast_of_pad_more_static(%arg0: tensor, %padding: index) -> tensor<32x32xf32> { + %cst = arith.constant 0.000000e+00 : f32 + // CHECK: %[[PAD:.*]] = tensor.pad + // CHECK: tensor to tensor<32x32xf32> + %padded = tensor.pad %arg0 low[%padding, %padding] high[0, 0] { + ^bb0(%arg1: index, %arg2: index): + tensor.yield %cst : f32 + } : tensor to tensor + // CHECK-NOT: tensor.cast + %casted = tensor.cast %padded : tensor to tensor<32x32xf32> + // CHECK: return %[[PAD]] + return %casted : tensor<32x32xf32> +} + +// ----- + +// CHECK-LABEL: @cast_of_pad_less_static +func @cast_of_pad_less_static(%arg0: tensor<32x?x?xf32>, %padding: index) -> tensor { + %cst = arith.constant 0.000000e+00 : f32 + // CHECK: tensor.pad + %padded = tensor.pad %arg0 low[%padding, %padding, 
%padding] high[0, 0, 0] { + ^bb0(%arg1: index, %arg2: index, %arg3: index): + tensor.yield %cst : f32 + } : tensor<32x?x?xf32> to tensor<32x?x?xf32> + // CHECK: %[[CAST:.*]] = tensor.cast + %casted = tensor.cast %padded : tensor<32x?x?xf32> to tensor + // CHECK: return %[[CAST]] + return %casted : tensor +} + +// ----- + +func @tensor_pad_cast_fold(%arg0: tensor<4x4xf32>) -> tensor<4x4xf32> { + %c0 = arith.constant 0 : index + %cst = arith.constant 0.0 : f32 + %0 = tensor.cast %arg0 : tensor<4x4xf32> to tensor + %1 = tensor.pad %0 low[%c0, %c0] high[%c0, %c0] { + ^bb0(%arg1: index, %arg2: index): + tensor.yield %cst : f32 + } : tensor to tensor<4x4xf32> + return %1 : tensor<4x4xf32> +} +// CHECK-LABEL: @tensor_pad_cast +// CHECK-SAME: %[[ARG0:.+]]: tensor<4x4xf32> +// CHECK: return %[[ARG0]] + +// ----- + +// CHECK-LABEL: func @fold_pad_tensor_source_cast( +// CHECK-SAME: %[[ARG0:.*]]: tensor<4x?xf32> +// CHECK-NOT: tensor.cast +// CHECK: %[[RESULT:.*]] = tensor.pad %[[ARG0]] +func @fold_pad_tensor_source_cast(%arg0: tensor<4x?xf32>) -> tensor<4x4xf32> { + %cst = arith.constant 0.0 : f32 + %0 = tensor.cast %arg0 : tensor<4x?xf32> to tensor + %1 = tensor.pad %0 low[0, 0] high[0, 1] { + ^bb0(%arg1: index, %arg2: index): + tensor.yield %cst : f32 + } : tensor to tensor<4x4xf32> + return %1 : tensor<4x4xf32> +} + +// ----- + +// CHECK-LABEL: func @pad_static_zero_cast( +// CHECK-SAME: %[[ARG0:.*]]: tensor +// CHECK-NOT: tensor.pad +// CHECK: %[[RESULT:.*]] = tensor.cast %[[ARG0]] : tensor to tensor<2x3x4xf32> +// CHECK: return %[[RESULT]] +func @pad_static_zero_cast(%arg0: tensor, %pad_value: f32) -> tensor<2x3x4xf32> { + %c0 = arith.constant 0 : index + %0 = tensor.pad %arg0 low[0, %c0, 0] high[0, 0, %c0] { + ^bb0(%arg1: index, %arg2: index, %arg3: index): + tensor.yield %pad_value : f32 + } : tensor to tensor<2x3x4xf32> + + return %0 : tensor<2x3x4xf32> +} + +// ----- + +// CHECK-LABEL: func @pad_nofold_static_zero( +// CHECK-SAME: %[[ARG0:.*]]: tensor +// CHECK: %[[PAD:.*]] = tensor.pad +// CHECK: return %[[PAD]] +func @pad_nofold_static_zero(%arg0: tensor, %pad_value: f32) -> tensor<2x3x4xf32> { + %c0 = arith.constant 0 : index + %0 = tensor.pad %arg0 nofold low[0, %c0, 0] high[0, 0, %c0] { + ^bb0(%arg1: index, %arg2: index, %arg3: index): + tensor.yield %pad_value : f32 + } : tensor to tensor<2x3x4xf32> + + return %0 : tensor<2x3x4xf32> +} diff --git a/mlir/test/Dialect/Tensor/invalid.mlir b/mlir/test/Dialect/Tensor/invalid.mlir --- a/mlir/test/Dialect/Tensor/invalid.mlir +++ b/mlir/test/Dialect/Tensor/invalid.mlir @@ -317,3 +317,58 @@ %0 = tensor.insert_slice %arg0 into %arg1[0, 0] [%arg2, %arg3] [1, 1] : tensor into tensor return } + +// ----- + + +func @pad_result_type(%arg0: tensor, %arg1: index, %arg2: i32) -> tensor { + // expected-error @+1 {{specified type 'tensor' does not match the inferred type 'tensor}} + %0 = tensor.pad %arg0 low[1, %arg1, 2, 2] high[1, 2, %arg1, 3] { + ^bb0(%arg3: index, %arg4: index): + tensor.yield %arg2 : i32 + } : tensor to tensor + return %0 : tensor +} + +// ----- + +func @pad_number_of_block_args(%arg0: tensor, %arg1: i32) -> tensor { + // expected-error @+1 {{expected the block to have 2 arguments}} + %0 = tensor.pad %arg0 low[1, 2] high[2, 3] { + ^bb0(%arg2: index, %arg3: index, %arg4: index): + tensor.yield %arg1 : i32 + } : tensor to tensor + return %0 : tensor +} + +// ----- + +func @pad_no_block(%arg0: tensor, %arg1: i32) -> tensor { + // expected-error @+1 {{op region #0 ('region') failed to verify constraint: region with 1 blocks}} + %0 = 
+  } : tensor<?x4xi32> to tensor<?x9xi32>
+  return %0 : tensor<?x9xi32>
+}
+
+// -----
+
+func @pad_block_args(%arg0: tensor<?x4xi32>, %arg1: i32) -> tensor<?x9xi32> {
+  // expected-error @+1 {{op expected block argument 1 to be an index}}
+  %0 = tensor.pad %arg0 low[1, 2] high[2, 3] {
+    ^bb0(%arg2: i32, %arg3: i32):
+      tensor.yield %arg1 : i32
+  } : tensor<?x4xi32> to tensor<?x9xi32>
+  return %0 : tensor<?x9xi32>
+}
+
+// -----
+
+func @pad_yield_type(%arg0: tensor<?x4xi32>, %arg1: i8) -> tensor<?x9xi32> {
+  // expected-error @+1 {{op expected yield type to match shape element type}}
+  %0 = tensor.pad %arg0 low[1, 2] high[2, 3] {
+    ^bb0(%arg2: index, %arg3: index):
+      tensor.yield %arg1 : i8
+  } : tensor<?x4xi32> to tensor<?x9xi32>
+  return %0 : tensor<?x9xi32>
+}
+
diff --git a/mlir/test/Dialect/Tensor/ops.mlir b/mlir/test/Dialect/Tensor/ops.mlir
--- a/mlir/test/Dialect/Tensor/ops.mlir
+++ b/mlir/test/Dialect/Tensor/ops.mlir
@@ -176,3 +176,77 @@
   %1 = tensor.rank %t : tensor<4x4x?xf32>
   return
 }
+
+// -----
+
+func @pad_dynamic(%arg0: tensor<1x2x2x?xf32>, %low: index, %high: index,
+                  %pad_value: f32) -> tensor<6x?x?x?xf32> {
+  %0 = tensor.pad %arg0 low[2, %low, 3, 3] high[3, 3, %high, 2] {
+    ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index):
+      tensor.yield %pad_value : f32
+  } : tensor<1x2x2x?xf32> to tensor<6x?x?x?xf32>
+  return %0 : tensor<6x?x?x?xf32>
+}
+// CHECK-LABEL: func @pad_dynamic
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]
+// CHECK-SAME: %[[LOW:[a-zA-Z0-9_]*]]
+// CHECK-SAME: %[[HIGH:[a-zA-Z0-9_]*]]
+// CHECK: tensor.pad %[[ARG0]]
+// CHECK-SAME: low[2, %[[LOW]], 3, 3]
+// CHECK-SAME: high[3, 3, %[[HIGH]], 2]
+// CHECK: : tensor<1x2x2x?xf32> to tensor<6x?x?x?xf32>
+
+// -----
+
+func @pad_static(%arg0: tensor<3x4xf32>, %pad_value: f32) -> tensor<6x9xf32> {
+  %0 = tensor.pad %arg0 low[1, 2] high[2, 3] {
+    ^bb0(%arg1 : index, %arg2 : index):
+      tensor.yield %pad_value : f32
+  } : tensor<3x4xf32> to tensor<6x9xf32>
+  return %0 : tensor<6x9xf32>
+}
+// CHECK-LABEL: func @pad_static
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]
+// CHECK: tensor.pad %[[ARG0]] low[1, 2] high[2, 3]
+// CHECK: : tensor<3x4xf32> to tensor<6x9xf32>
+
+// -----
+
+func @pad_asymmetrical(%arg0: tensor<2x3xf32>, %ub0: index, %ub1: index,
+                       %pad_value: f32) -> tensor<?x?xf32> {
+  %0 = tensor.pad %arg0 low[0, 0] high[%ub0, %ub1] {
+    ^bb0(%arg1: index, %arg2: index):
+      tensor.yield %pad_value : f32
+  } : tensor<2x3xf32> to tensor<?x?xf32>
+  return %0 : tensor<?x?xf32>
+}
+// CHECK-LABEL: func @pad_asymmetrical
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]
+// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]*]]
+// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]*]]
+// CHECK: tensor.pad %[[ARG0]]
+// CHECK-SAME: low[0, 0]
+// CHECK-SAME: high[%[[UB0]], %[[UB1]]]
+// CHECK: : tensor<2x3xf32> to tensor<?x?xf32>
+
+// -----
+
+func @pad_to_static_size(%arg0: tensor<?x?xf32>, %ub0: index, %ub1: index,
+                         %pad_value: f32) -> tensor<2x3xf32> {
+  %0 = tensor.pad %arg0 low[0, 0] high[%ub0, %ub1] {
+    ^bb0(%arg1: index, %arg2: index):
+      tensor.yield %pad_value : f32
+  } : tensor<?x?xf32> to tensor<2x3xf32>
+  return %0 : tensor<2x3xf32>
+}
+// CHECK-LABEL: func @pad_to_static_size
+// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]*]]
+// CHECK-SAME: %[[UB0:[a-zA-Z0-9_]*]]
+// CHECK-SAME: %[[UB1:[a-zA-Z0-9_]*]]
+// CHECK: tensor.pad %[[ARG0]]
+// CHECK-SAME: low[0, 0]
+// CHECK-SAME: high[%[[UB0]], %[[UB1]]]
+// CHECK: : tensor<?x?xf32> to tensor<2x3xf32>
+
+// -----
+
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-comprehensive-bufferize.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-comprehensive-bufferize.mlir
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-comprehensive-bufferize.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-comprehensive-bufferize.mlir
@@ -21,9 +21,9 @@
     %8 = affine.apply #map1(%arg3, %c0)[%c2]
     %9 = tensor.extract_slice %arg1[%arg3] [2] [1] : tensor<64xf32> to tensor<2xf32>
     %10 = tensor.cast %9 : tensor<2xf32> to tensor<?xf32>
-    %11 = linalg.pad_tensor %10 low[%c0] high[%c0] {
+    %11 = tensor.pad %10 low[%c0] high[%c0] {
     ^bb0(%arg5: index):
-      linalg.yield %cst : f32
+      tensor.yield %cst : f32
     } : tensor<?xf32> to tensor<2xf32>
     %12 = tensor.insert_slice %11 into %arg4[%8, 0] [1, 2] [1, 1] : tensor<2xf32> into tensor<?x2xf32>
     scf.yield %12 : tensor<?x2xf32>
@@ -38,9 +38,9 @@
     %8 = affine.apply #map1(%arg3, %c0)[%c2]
     %9 = tensor.extract_slice %arg0[%arg3] [2] [1] : tensor<64xf32> to tensor<2xf32>
     %10 = tensor.cast %9 : tensor<2xf32> to tensor<?xf32>
-    %11 = linalg.pad_tensor %10 low[%c0] high[%c0] {
+    %11 = tensor.pad %10 low[%c0] high[%c0] {
     ^bb0(%arg5: index):
-      linalg.yield %cst : f32
+      tensor.yield %cst : f32
     } : tensor<?xf32> to tensor<2xf32>
     %12 = tensor.insert_slice %11 into %arg4[%8, 0] [1, 2] [1, 1] : tensor<2xf32> into tensor<?x2xf32>
     scf.yield %12 : tensor<?x2xf32>
diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir
--- a/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/test-padtensor.mlir
@@ -13,9 +13,9 @@
   %offset = arith.constant 2 : index
   %cst = arith.constant 2.3 : f32
   %c0 = arith.constant 0 : index
-  %out = linalg.pad_tensor %dynamic low[%c0, %offset, %c0] high[%c0, %c0, %offset] {
+  %out = tensor.pad %dynamic low[%c0, %offset, %c0] high[%c0, %c0, %offset] {
     ^bb0(%gen_arg1: index, %gen_arg2: index, %gen_arg3: index):
-      linalg.yield %cst : f32
+      tensor.yield %cst : f32
   } : tensor<1x?x3xf32> to tensor<1x?x?xf32>
   %unranked = tensor.cast %out: tensor<1x?x?xf32> to tensor<*xf32>
   call @print_memref_f32(%unranked) : (tensor<*xf32>) -> ()
diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
@@ -42,6 +42,7 @@
                     memref::MemRefDialect,
                     scf::SCFDialect,
                     StandardOpsDialect,
+                    linalg::LinalgDialect,
                     vector::VectorDialect,
                     gpu::GPUDialect>();
     // clang-format on
@@ -549,20 +550,20 @@
           funcOp.getContext(),
           LinalgTransformationFilter()
               .addOpFilter());
-  populatePadTensorOpVectorizationPatterns(patterns);
+  populatePadOpVectorizationPatterns(patterns);
   populateConvolutionVectorizationPatterns(patterns);
   (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
 }
 
 static void applyPadTensorToGenericPatterns(FuncOp funcOp) {
   RewritePatternSet patterns(funcOp.getContext());
-  patterns.add<PadTensorOpTransformationPattern>(funcOp.getContext());
+  patterns.add<PadOpTransformationPattern>(funcOp.getContext());
   (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
 }
 
 static void applyGeneralizePadTensorPatterns(FuncOp funcOp) {
   RewritePatternSet patterns(funcOp.getContext());
-  patterns.add<GeneralizePadTensorOpPattern>(funcOp.getContext());
+  patterns.add<GeneralizePadOpPattern>(funcOp.getContext());
   (void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
 }
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -4280,6 +4280,7 @@
         ":InferTypeOpInterfaceTdFiles",
         ":OpBaseTdFiles",
        ":SideEffectInterfacesTdFiles",
+        ":TilingInterfaceTdFiles",
        ":ViewLikeInterfaceTdFiles",
    ],
 )
@@ -4336,6 +4337,7 @@
        ":StandardOps",
":Support", ":TensorOpsIncGen", + ":TilingInterface", ":ViewLikeInterface", "//llvm:Support", ], @@ -4356,6 +4358,38 @@ ], ) +cc_library( + name = "TensorTilingInterfaceImpl", + srcs = ["lib/Dialect/Tensor/IR/TensorTilingInterfaceImpl.cpp"], + hdrs = ["include/mlir/Dialect/Tensor/IR/TensorTilingInterfaceImpl.h"], + includes = ["include"], + deps = [ + ":Affine", + ":IR", + ":LinalgOps", + ":SCFDialect", + ":StandardOps", + ":TensorDialect", + ":TilingInterface", + "//llvm:Support", + ], +) + +cc_library( + name = "TensorUtils", + srcs = ["lib/Dialect/Tensor/Utils/Utils.cpp"], + hdrs = ["include/mlir/Dialect/Tensor/Utils/Utils.h"], + includes = ["include"], + deps = [ + ":Affine", + ":ArithmeticDialect", + ":IR", + ":Support", + ":TensorDialect", + "//llvm:Support", + ], +) + gentbl_cc_library( name = "TensorPassIncGen", strip_include_prefix = "include", @@ -5634,6 +5668,7 @@ ":StandardToSPIRV", ":TensorDialect", ":TensorInferTypeOpInterfaceImpl", + ":TensorTilingInterfaceImpl", ":TensorTransforms", ":TosaDialect", ":TosaToLinalg", @@ -6911,6 +6946,7 @@ ":Support", ":TensorBufferizableOpInterfaceImpl", ":TensorDialect", + ":TensorUtils", ":TransformUtils", ":VectorBufferizableOpInterfaceImpl", ":VectorOps", @@ -6957,7 +6993,6 @@ deps = [ ":IR", ":Support", - ":TensorDialect", ":TilingInterfaceIncGen", ":ViewLikeInterface", "//llvm:Support", @@ -7260,6 +7295,7 @@ ":SCFDialect", ":StandardOps", ":TensorDialect", + ":TensorUtils", ":TosaDialect", ":Transforms", ],