diff --git a/mlir/include/mlir/Interfaces/SideEffects.td b/mlir/include/mlir/Interfaces/SideEffects.td
--- a/mlir/include/mlir/Interfaces/SideEffects.td
+++ b/mlir/include/mlir/Interfaces/SideEffects.td
@@ -118,6 +118,9 @@
   /// The name of the base effects class.
   string baseEffectName = interface.baseEffectName;
 
+  /// The parent interface that the effect belongs to.
+  string interfaceTrait = interface.trait;
+
   /// The derived effect that is being applied.
   string effect = effectName;
diff --git a/mlir/include/mlir/Support/LLVM.h b/mlir/include/mlir/Support/LLVM.h
--- a/mlir/include/mlir/Support/LLVM.h
+++ b/mlir/include/mlir/Support/LLVM.h
@@ -49,6 +49,9 @@
 template <typename Fn> class function_ref;
 template <typename IteratorT> class iterator_range;
 template <typename T, typename ResultT = void> class TypeSwitch;
+class MallocAllocator;
+template <typename AllocatorTy>
+class StringSet;
 
 // Other common classes.
 class raw_ostream;
@@ -74,6 +77,8 @@
 using DenseMap = llvm::DenseMap<KeyT, ValueT, KeyInfoT>;
 template <typename ValueT, typename ValueInfoT = DenseMapInfo<ValueT>>
 using DenseSet = llvm::DenseSet<ValueT, ValueInfoT>;
+template <typename AllocatorTy = llvm::MallocAllocator>
+using StringSet = llvm::StringSet<AllocatorTy>;
 template <typename Fn> using function_ref = llvm::function_ref<Fn>;
 using llvm::iterator_range;
 using llvm::MutableArrayRef;
diff --git a/mlir/include/mlir/TableGen/OpClass.h b/mlir/include/mlir/TableGen/OpClass.h
--- a/mlir/include/mlir/TableGen/OpClass.h
+++ b/mlir/include/mlir/TableGen/OpClass.h
@@ -26,6 +26,7 @@
 #include "mlir/Support/LLVM.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
 
 #include <string>
@@ -157,7 +158,8 @@
 private:
   StringRef extraClassDeclaration;
 
-  SmallVector<std::string, 4> traits;
+  SmallVector<std::string, 4> traitsVec;
+  StringSet<> traitsSet;
 
   bool hasOperandAdaptor;
 };
diff --git a/mlir/include/mlir/TableGen/SideEffects.h b/mlir/include/mlir/TableGen/SideEffects.h
--- a/mlir/include/mlir/TableGen/SideEffects.h
+++ b/mlir/include/mlir/TableGen/SideEffects.h
@@ -29,6 +29,9 @@
   // Return the name of the base C++ effect.
   StringRef getBaseEffectName() const;
 
+  // Return the name of the Interface that the effect belongs to.
+  StringRef getInterfaceTrait() const;
+
   // Return the name of the resource class.
   StringRef getResource() const;
diff --git a/mlir/lib/TableGen/OpClass.cpp b/mlir/lib/TableGen/OpClass.cpp
--- a/mlir/lib/TableGen/OpClass.cpp
+++ b/mlir/lib/TableGen/OpClass.cpp
@@ -195,12 +195,15 @@
   hasOperandAdaptor = has;
 }
 
-// Adds the given trait to this op.
-void tblgen::OpClass::addTrait(Twine trait) { traits.push_back(trait.str()); }
+void tblgen::OpClass::addTrait(Twine trait) {
+  auto traitStr = trait.str();
+  if (traitsSet.insert(traitStr).second)
+    traitsVec.push_back(std::move(traitStr));
+}
 
 void tblgen::OpClass::writeDeclTo(raw_ostream &os) const {
   os << "class " << className << " : public Op<" << className;
-  for (const auto &trait : traits)
+  for (const auto &trait : traitsVec)
     os << ", " << trait;
   os << "> {\npublic:\n";
   os << "  using Op::Op;\n";
diff --git a/mlir/lib/TableGen/SideEffects.cpp b/mlir/lib/TableGen/SideEffects.cpp
--- a/mlir/lib/TableGen/SideEffects.cpp
+++ b/mlir/lib/TableGen/SideEffects.cpp
@@ -24,6 +24,10 @@
   return def->getValueAsString("baseEffectName");
 }
 
+StringRef SideEffect::getInterfaceTrait() const {
+  return def->getValueAsString("interfaceTrait");
+}
+
 StringRef SideEffect::getResource() const {
   auto value = def->getValueAsString("resource");
   return value.empty() ? "::mlir::SideEffects::DefaultResource" : value;
"::mlir::SideEffects::DefaultResource" : value; diff --git a/mlir/test/Conversion/StandardToSPIRV/legalization.mlir b/mlir/test/Conversion/StandardToSPIRV/legalization.mlir --- a/mlir/test/Conversion/StandardToSPIRV/legalization.mlir +++ b/mlir/test/Conversion/StandardToSPIRV/legalization.mlir @@ -2,7 +2,7 @@ // CHECK-LABEL: @fold_static_stride_subview_with_load // CHECK-SAME: [[ARG0:%.*]]: memref<12x32xf32>, [[ARG1:%.*]]: index, [[ARG2:%.*]]: index, [[ARG3:%.*]]: index, [[ARG4:%.*]]: index -func @fold_static_stride_subview_with_load(%arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index) { +func @fold_static_stride_subview_with_load(%arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index) -> f32 { // CHECK-NOT: subview // CHECK: [[C2:%.*]] = constant 2 : index // CHECK: [[C3:%.*]] = constant 3 : index @@ -13,12 +13,12 @@ // CHECK: load [[ARG0]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}} %0 = subview %arg0[%arg1, %arg2][][] : memref<12x32xf32> to memref<4x4xf32, offset:?, strides: [64, 3]> %1 = load %0[%arg3, %arg4] : memref<4x4xf32, offset:?, strides: [64, 3]> - return + return %1 : f32 } // CHECK-LABEL: @fold_dynamic_stride_subview_with_load // CHECK-SAME: [[ARG0:%.*]]: memref<12x32xf32>, [[ARG1:%.*]]: index, [[ARG2:%.*]]: index, [[ARG3:%.*]]: index, [[ARG4:%.*]]: index, [[ARG5:%.*]]: index, [[ARG6:%.*]]: index -func @fold_dynamic_stride_subview_with_load(%arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index) { +func @fold_dynamic_stride_subview_with_load(%arg0 : memref<12x32xf32>, %arg1 : index, %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, %arg6 : index) -> f32 { // CHECK-NOT: subview // CHECK: [[STRIDE1:%.*]] = muli [[ARG3]], [[ARG5]] : index // CHECK: [[INDEX1:%.*]] = addi [[ARG1]], [[STRIDE1]] : index @@ -27,7 +27,7 @@ // CHECK: load [[ARG0]]{{\[}}[[INDEX1]], [[INDEX2]]{{\]}} %0 = subview %arg0[%arg1, %arg2][][%arg5, %arg6] : memref<12x32xf32> to memref<4x4xf32, offset:?, strides: [?, ?]> %1 = load %0[%arg3, %arg4] : memref<4x4xf32, offset:?, strides: [?, ?]> - return + return %1 : f32 } // CHECK-LABEL: @fold_static_stride_subview_with_store diff --git a/mlir/test/Conversion/VectorToLoops/vector-to-loops.mlir b/mlir/test/Conversion/VectorToLoops/vector-to-loops.mlir --- a/mlir/test/Conversion/VectorToLoops/vector-to-loops.mlir +++ b/mlir/test/Conversion/VectorToLoops/vector-to-loops.mlir @@ -64,7 +64,6 @@ // CHECK-NEXT: affine.for %[[I2:.*]] = 0 to %{{.*}} { // CHECK-NEXT: affine.for %[[I3:.*]] = 0 to %{{.*}} step 5 { // CHECK: %[[ALLOC:.*]] = alloc() : memref<5x4x3xf32> - // CHECK-NEXT: %[[VECTOR_VIEW:.*]] = vector.type_cast %[[ALLOC]] : memref<5x4x3xf32> // CHECK-NEXT: loop.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] { // CHECK-NEXT: loop.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] { // CHECK-NEXT: loop.for %[[I6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] { @@ -99,7 +98,6 @@ // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: } - // CHECK: {{.*}} = load %[[VECTOR_VIEW]][] : memref> // CHECK-NEXT: dealloc %[[ALLOC]] : memref<5x4x3xf32> // CHECK-NEXT: } // CHECK-NEXT: } diff --git a/mlir/test/Dialect/Affine/canonicalize.mlir b/mlir/test/Dialect/Affine/canonicalize.mlir --- a/mlir/test/Dialect/Affine/canonicalize.mlir +++ b/mlir/test/Dialect/Affine/canonicalize.mlir @@ -54,30 +54,30 @@ %x1_1 = affine.apply affine_map<(d0, d1) -> (d1)> (%x0, %x0) // CHECK: [[I0A:%[0-9]+]] = affine.apply [[MAP0]](%{{.*}}) - // CHECK-NEXT: load %0{{\[}}[[I0A]], [[I0A]]{{\]}} 
+  // CHECK-NEXT: [[V0:%[0-9]+]] = load %0{{\[}}[[I0A]], [[I0A]]{{\]}}
   %v0 = load %0[%x1_0, %x1_1] : memref<4x4xf32>
 
-  // Test load[%y, %y]
+  // Test store[%y, %y]
   %y0 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i0)
   %y1_0 = affine.apply affine_map<(d0, d1) -> (d0)> (%y0, %y0)
   %y1_1 = affine.apply affine_map<(d0, d1) -> (d1)> (%y0, %y0)
 
   // CHECK-NEXT: [[I1A:%[0-9]+]] = affine.apply [[MAP1]](%{{.*}})
-  // CHECK-NEXT: load %0{{\[}}[[I1A]], [[I1A]]{{\]}}
-  %v1 = load %0[%y1_0, %y1_1] : memref<4x4xf32>
+  // CHECK-NEXT: store [[V0]], %0{{\[}}[[I1A]], [[I1A]]{{\]}}
+  store %v0, %0[%y1_0, %y1_1] : memref<4x4xf32>
 
-  // Test load[%x, %y]
+  // Test store[%x, %y]
   %xy_0 = affine.apply affine_map<(d0, d1) -> (d0)> (%x0, %y0)
   %xy_1 = affine.apply affine_map<(d0, d1) -> (d1)> (%x0, %y0)
 
-  // CHECK-NEXT: load %0{{\[}}[[I0A]], [[I1A]]{{\]}}
-  %v2 = load %0[%xy_0, %xy_1] : memref<4x4xf32>
+  // CHECK-NEXT: store [[V0]], %0{{\[}}[[I0A]], [[I1A]]{{\]}}
+  store %v0, %0[%xy_0, %xy_1] : memref<4x4xf32>
 
-  // Test load[%y, %x]
+  // Test store[%y, %x]
   %yx_0 = affine.apply affine_map<(d0, d1) -> (d0)> (%y0, %x0)
   %yx_1 = affine.apply affine_map<(d0, d1) -> (d1)> (%y0, %x0)
-  // CHECK-NEXT: load %0{{\[}}[[I1A]], [[I0A]]{{\]}}
-  %v3 = load %0[%yx_0, %yx_1] : memref<4x4xf32>
+  // CHECK-NEXT: store [[V0]], %0{{\[}}[[I1A]], [[I0A]]{{\]}}
+  store %v0, %0[%yx_0, %yx_1] : memref<4x4xf32>
 }
 return
}
@@ -92,29 +92,29 @@
   %x0 = affine.apply affine_map<(d0)[s0] -> (d0 - s0)> (%i0)[%c4]
 
   // CHECK: [[I0:%[0-9]+]] = affine.apply [[MAP4]](%{{.*}})
-  // CHECK-NEXT: load %{{[0-9]+}}{{\[}}[[I0]], [[I0]]{{\]}}
+  // CHECK-NEXT: [[V0:%[0-9]+]] = load %{{[0-9]+}}{{\[}}[[I0]], [[I0]]{{\]}}
   %v0 = load %0[%x0, %x0] : memref<4x4xf32>
 
   // Test load[%x0, %x1] with symbol %c4 captured by '%x0' map.
   %x1 = affine.apply affine_map<(d0) -> (d0 + 1)> (%i0)
   %y1 = affine.apply affine_map<(d0, d1) -> (d0+d1)> (%x0, %x1)
   // CHECK-NEXT: [[I1:%[0-9]+]] = affine.apply [[MAP7]](%{{.*}})
-  // CHECK-NEXT: load %{{[0-9]+}}{{\[}}[[I1]], [[I1]]{{\]}}
-  %v1 = load %0[%y1, %y1] : memref<4x4xf32>
+  // CHECK-NEXT: store [[V0]], %{{[0-9]+}}{{\[}}[[I1]], [[I1]]{{\]}}
+  store %v0, %0[%y1, %y1] : memref<4x4xf32>
 
-  // Test load[%x1, %x0] with symbol %c4 captured by '%x0' map.
+  // Test store[%x1, %x0] with symbol %c4 captured by '%x0' map.
   %y2 = affine.apply affine_map<(d0, d1) -> (d0 + d1)> (%x1, %x0)
   // CHECK-NEXT: [[I2:%[0-9]+]] = affine.apply [[MAP7]](%{{.*}})
-  // CHECK-NEXT: load %{{[0-9]+}}{{\[}}[[I2]], [[I2]]{{\]}}
-  %v2 = load %0[%y2, %y2] : memref<4x4xf32>
+  // CHECK-NEXT: store [[V0]], %{{[0-9]+}}{{\[}}[[I2]], [[I2]]{{\]}}
+  store %v0, %0[%y2, %y2] : memref<4x4xf32>
 
-  // Test load[%x2, %x0] with symbol %c4 from '%x0' and %c5 from '%x2'
+  // Test store[%x2, %x0] with symbol %c4 from '%x0' and %c5 from '%x2'
   %c5 = constant 5 : index
   %x2 = affine.apply affine_map<(d0)[s0] -> (d0 + s0)> (%i0)[%c5]
   %y3 = affine.apply affine_map<(d0, d1) -> (d0 + d1)> (%x2, %x0)
   // CHECK: [[I3:%[0-9]+]] = affine.apply [[MAP7a]](%{{.*}})
-  // CHECK-NEXT: load %{{[0-9]+}}{{\[}}[[I3]], [[I3]]{{\]}}
-  %v3 = load %0[%y3, %y3] : memref<4x4xf32>
+  // CHECK-NEXT: store [[V0]], %{{[0-9]+}}{{\[}}[[I3]], [[I3]]{{\]}}
+  store %v0, %0[%y3, %y3] : memref<4x4xf32>
 }
 return
}
@@ -175,15 +175,15 @@
   // CHECK: [[I0:%[0-9]+]] = affine.apply [[MAP9]](%{{.*}})
   // CHECK: [[I1:%[0-9]+]] = affine.apply [[MAP4b]](%{{.*}})
   // CHECK: [[I2:%[0-9]+]] = affine.apply [[MAP10]](%{{.*}})
-  // CHECK-NEXT: load %{{[0-9]+}}{{\[}}[[I0]], [[I1]]{{\]}}
+  // CHECK-NEXT: [[V0:%[0-9]+]] = load %{{[0-9]+}}{{\[}}[[I0]], [[I1]]{{\]}}
   %v0 = load %0[%x00, %x01] : memref<16x32xf32>
 
-  // CHECK-NEXT: load %{{[0-9]+}}{{\[}}[[I0]], [[I2]]{{\]}}
-  %v1 = load %0[%x00, %x02] : memref<16x32xf32>
+  // CHECK-NEXT: store [[V0]], %{{[0-9]+}}{{\[}}[[I0]], [[I2]]{{\]}}
+  store %v0, %0[%x00, %x02] : memref<16x32xf32>
 
   // Swizzle %i0, %i1
-  // CHECK-NEXT: load %{{[0-9]+}}{{\[}}[[I1]], [[I0]]{{\]}}
-  %v2 = load %0[%x01, %x00] : memref<16x32xf32>
+  // CHECK-NEXT: store [[V0]], %{{[0-9]+}}{{\[}}[[I1]], [[I0]]{{\]}}
+  store %v0, %0[%x01, %x00] : memref<16x32xf32>
 
   // Swizzle %x00, %x01 and %c3, %c7
   %x10 = affine.apply affine_map<(d0, d1)[s0, s1] -> (d0 * s1)>
@@ -193,18 +193,16 @@
   // CHECK-NEXT: [[I2A:%[0-9]+]] = affine.apply [[MAP12]](%{{.*}})
   // CHECK-NEXT: [[I2B:%[0-9]+]] = affine.apply [[MAP11]](%{{.*}})
-  // CHECK-NEXT: load %{{[0-9]+}}{{\[}}[[I2A]], [[I2B]]{{\]}}
-  %v3 = load %0[%x10, %x11] : memref<16x32xf32>
+  // CHECK-NEXT: store [[V0]], %{{[0-9]+}}{{\[}}[[I2A]], [[I2B]]{{\]}}
+  store %v0, %0[%x10, %x11] : memref<16x32xf32>
       }
     }
   }
   return
 }
 
-// CHECK-LABEL: func @compose_affine_maps_diamond_dependency() {
-func @compose_affine_maps_diamond_dependency() {
-  %0 = alloc() : memref<4x4xf32>
-
+// CHECK-LABEL: func @compose_affine_maps_diamond_dependency
+func @compose_affine_maps_diamond_dependency(%arg0: f32, %arg1: memref<4x4xf32>) {
   affine.for %i0 = 0 to 15 {
     %a = affine.apply affine_map<(d0) -> (d0 - 1)> (%i0)
     %b = affine.apply affine_map<(d0) -> (d0 + 7)> (%a)
@@ -213,15 +211,15 @@
     %d1 = affine.apply affine_map<(d0, d1) -> (d1 floordiv 3)> (%b, %c)
     // CHECK: [[I0:%[0-9]+]] = affine.apply [[MAP13A]](%{{.*}})
     // CHECK: [[I1:%[0-9]+]] = affine.apply [[MAP13B]](%{{.*}})
-    // CHECK-NEXT: load %{{[0-9]+}}{{\[}}[[I0]], [[I1]]{{\]}}
-    %v = load %0[%d0, %d1] : memref<4x4xf32>
+    // CHECK-NEXT: store %arg0, %arg1{{\[}}[[I0]], [[I1]]{{\]}}
+    store %arg0, %arg1[%d0, %d1] : memref<4x4xf32>
   }
   return
 }
 
 // CHECK-LABEL: func @arg_used_as_dim_and_symbol
-func @arg_used_as_dim_and_symbol(%arg0: memref<100x100xf32>, %arg1: index) {
+func @arg_used_as_dim_and_symbol(%arg0: memref<100x100xf32>, %arg1: index, %arg2: f32) {
   %c9 = constant 9 : index
   %1 = alloc() : memref<100x100xf32, 1>
   %2 = alloc() : memref<1xi32>
@@ -231,8 +229,8 @@
         (%i0, %i1)[%arg1, %c9]
       %4 = affine.apply affine_map<(d0, d1, d3) -> (d3 - (d0 + d1))>
        (%arg1, %c9, %3)
-      // CHECK: load %{{[0-9]+}}{{\[}}%{{.*}}, %{{.*}}{{\]}}
-      %5 = load %1[%4, %arg1] : memref<100x100xf32, 1>
+      // CHECK: store %arg2, %{{[0-9]+}}{{\[}}%{{.*}}, %{{.*}}{{\]}}
+      store %arg2, %1[%4, %arg1] : memref<100x100xf32, 1>
     }
   }
   return
@@ -252,7 +250,7 @@
   %3 = affine.apply affine_map<()[] -> (0)>()[]
   store %cst, %0[%3] : memref<10xf32>
 
-  %4 = load %0[%c0] : memref<10xf32>
+  store %2, %0[%c0] : memref<10xf32>
   }
   return
 }
diff --git a/mlir/test/Dialect/GPU/all-reduce-max.mlir b/mlir/test/Dialect/GPU/all-reduce-max.mlir
--- a/mlir/test/Dialect/GPU/all-reduce-max.mlir
+++ b/mlir/test/Dialect/GPU/all-reduce-max.mlir
@@ -195,7 +195,6 @@
     // CHECK: br ^bb42
     // CHECK: ^bb42:
     // CHECK: gpu.barrier
-    // CHECK: [[VAL_134:%.*]] = load [[VAL_1]]{{\[}}[[VAL_4]]] : memref<32xf32, 3>
     %sum = "gpu.all_reduce"(%arg0) ({}) {op = "max"} : (f32) -> (f32)
     gpu.return
   }
diff --git a/mlir/test/Dialect/GPU/all-reduce.mlir b/mlir/test/Dialect/GPU/all-reduce.mlir
--- a/mlir/test/Dialect/GPU/all-reduce.mlir
+++ b/mlir/test/Dialect/GPU/all-reduce.mlir
@@ -175,7 +175,6 @@
    // CHECK: br ^bb42
    // CHECK: ^bb42:
    // CHECK: gpu.barrier
-   // CHECK: [[VAL_114:%.*]] = load [[VAL_1]]{{\[}}[[VAL_4]]] : memref<32xf32, 3>
    %sum = "gpu.all_reduce"(%arg0) ({}) {op = "add"} : (f32) -> (f32)
    gpu.return
  }
diff --git a/mlir/test/Dialect/Linalg/loops.mlir b/mlir/test/Dialect/Linalg/loops.mlir
--- a/mlir/test/Dialect/Linalg/loops.mlir
+++ b/mlir/test/Dialect/Linalg/loops.mlir
@@ -856,7 +856,6 @@
 // CHECKLOOP-NOT: loop.for
 // CHECKLOOP-DAG: load %[[ARG0]][]
 // CHECKLOOP-DAG: load %[[ARG1]][]
-// CHECKLOOP-DAG: load %[[ARG2]][]
 // CHECKLOOP: addf
 // CHECKLOOP: store %{{.*}}, %[[ARG2]][]
 
@@ -867,6 +866,5 @@
 // CHECKPARALLEL-NOT: loop.for
 // CHECKPARALLEL-DAG: load %[[ARG0]][]
 // CHECKPARALLEL-DAG: load %[[ARG1]][]
-// CHECKPARALLEL-DAG: load %[[ARG2]][]
 // CHECKPARALLEL: addf
 // CHECKPARALLEL: store %{{.*}}, %[[ARG2]][]
diff --git a/mlir/test/Dialect/Linalg/parallel_loops.mlir b/mlir/test/Dialect/Linalg/parallel_loops.mlir
--- a/mlir/test/Dialect/Linalg/parallel_loops.mlir
+++ b/mlir/test/Dialect/Linalg/parallel_loops.mlir
@@ -24,7 +24,6 @@
 // CHECK: loop.parallel (%[[I:.*]], %[[J:.*]]) = {{.*}}
 // CHECK:   %[[LHS_ELEM:.*]] = load %[[LHS]][%[[I]], %[[J]]]
 // CHECK:   %[[RHS_ELEM:.*]] = load %[[RHS]][%[[I]], %[[J]]]
-// CHECK:   %[[SUM_ELEM:.*]] = load %[[SUM]][%[[I]], %[[J]]]
 // CHECK:   %[[SUM:.*]] = addf %[[LHS_ELEM]], %[[RHS_ELEM]] : f32
 // CHECK:   store %[[SUM]], %{{.*}}[%[[I]], %[[J]]]
 // CHECK:   loop.yield
@@ -60,4 +59,4 @@
 // CHECK:   loop.for %[[IV2:.*]] = %[[C0]] to %[[D2]] step %[[C1]]
 // CHECK:     loop.for %[[IV3:.*]] = %[[C0]] to %[[D3]] step %[[C1]]
 // CHECK:       load %{{.*}}[%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
-// CHECK:       store %{{.*}}, %{{.*}}[%[[IV0]], %[[IV1]], %[[IV3]]]
\ No newline at end of file
+// CHECK:       store %{{.*}}, %{{.*}}[%[[IV0]], %[[IV1]], %[[IV3]]]
diff --git a/mlir/test/Transforms/canonicalize.mlir b/mlir/test/Transforms/canonicalize.mlir
--- a/mlir/test/Transforms/canonicalize.mlir
+++ b/mlir/test/Transforms/canonicalize.mlir
@@ -56,6 +56,16 @@
   return
 }
 
+// CHECK-LABEL: func @load_dce
+func @load_dce(%arg0: index) {
+  %c4 = constant 4 : index
+  %a = alloc(%c4) : memref<?xf32>
+  %2 = load %a[%arg0] : memref<?xf32>
+  dealloc %a: memref<?xf32>
+  // CHECK-NEXT: return
+  return
+}
+
 // CHECK-LABEL: func @addi_zero
 func @addi_zero(%arg0: i32) -> i32 {
   // CHECK-NEXT: return %arg0
@@ -648,7 +658,7 @@
 // CHECK-DAG: #[[VIEW_MAP5:map[0-9]+]] = affine_map<(d0, d1) -> (d0 * 7 + d1)>
 
 // CHECK-LABEL: func @view
-func @view(%arg0 : index) {
@view(%arg0 : index) -> (f32, f32, f32, f32, f32, f32) { // CHECK: %[[ALLOC_MEM:.*]] = alloc() : memref<2048xi8> %0 = alloc() : memref<2048xi8> %c0 = constant 0 : index @@ -660,41 +670,41 @@ // CHECK: std.view %[[ALLOC_MEM]][][] : memref<2048xi8> to memref<7x11xf32, #[[VIEW_MAP0]]> %1 = view %0[%c15][%c7, %c11] : memref<2048xi8> to memref - load %1[%c0, %c0] : memref + %r0 = load %1[%c0, %c0] : memref // Test: fold constant sizes but not offset, update map with static stride. // Test that we do not a fold dynamic dim which is not produced by a constant. // CHECK: std.view %[[ALLOC_MEM]][%arg0][] : memref<2048xi8> to memref<7x11xf32, #[[VIEW_MAP1]]> %2 = view %0[%arg0][%c7, %c11] : memref<2048xi8> to memref - load %2[%c0, %c0] : memref + %r1 = load %2[%c0, %c0] : memref // Test: fold constant offset but not sizes, update map with constant offset. // Test that we fold constant offset but not dynamic dims. // CHECK: std.view %[[ALLOC_MEM]][][%arg0, %arg0] : memref<2048xi8> to memref %3 = view %0[%c15][%arg0, %arg0] : memref<2048xi8> to memref - load %3[%c0, %c0] : memref + %r2 = load %3[%c0, %c0] : memref // Test: fold one constant dim, no offset, should update with constant // stride on dim 1, but leave dynamic stride on dim 0. // CHECK: std.view %[[ALLOC_MEM]][][%arg0, %arg0] : memref<2048xi8> to memref %4 = view %0[][%arg0, %arg0, %c7] : memref<2048xi8> to memref - load %4[%c0, %c0, %c0] : memref + %r3 = load %4[%c0, %c0, %c0] : memref // Test: preserve an existing static dim size while folding a dynamic // dimension and offset. // CHECK: std.view %[[ALLOC_MEM]][][] : memref<2048xi8> to memref<7x4xf32, #[[VIEW_MAP4]]> %5 = view %0[%c15][%c7] : memref<2048xi8> to memref - load %5[%c0, %c0] : memref + %r4 = load %5[%c0, %c0] : memref // Test: folding static alloc and memref_cast into a view. // CHECK: std.view %[[ALLOC_MEM]][][] : memref<2048xi8> to memref<15x7xf32, #[[VIEW_MAP5]]> %6 = memref_cast %0 : memref<2048xi8> to memref %7 = view %6[%c15][%c7] : memref to memref - load %7[%c0, %c0] : memref - return + %r5 = load %7[%c0, %c0] : memref + return %r0, %r1, %r2, %r3, %r4, %r5 : f32, f32, f32, f32, f32, f32 } // ----- @@ -735,7 +745,7 @@ : memref<8x16x4xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)>> to memref (d0 * s1 + d1 * s2 + d2 * s3 + s0)>> - load %1[%c0, %c0, %c0] : memref (d0 * s1 + d1 * s2 + d2 * s3 + s0)>> // Test: subview with one dynamic operand should not be folded. @@ -744,7 +754,7 @@ : memref<8x16x4xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)>> to memref (d0 * s1 + d1 * s2 + d2 * s3 + s0)>> - load %2[%c0, %c0, %c0] : memref (d0 * s1 + d1 * s2 + d2 * s3 + s0)>> // CHECK: %[[ALLOC1:.*]] = alloc(%[[ARG0]]) @@ -755,7 +765,7 @@ : memref (d0 * 64 + d1 * 4 + d2)>> to memref (d0 * s1 + d1 * s2 + d2 * s3 + s0)>> - load %4[%c0, %c0, %c0] : memref (d0 * s1 + d1 * s2 + d2 * s3 + s0)>> // Test: subview offset operands are folded correctly w.r.t. base strides. @@ -764,7 +774,7 @@ : memref<8x16x4xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)>> to memref (d0 * s1 + d1 * s2 + d2 * s3 + s0)>> - load %5[%c0, %c0, %c0] : memref (d0 * s1 + d1 * s2 + d2 * s3 + s0)>> // Test: subview stride operands are folded correctly w.r.t. base strides. 
@@ -773,40 +783,40 @@
    : memref<8x16x4xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)>> to
      memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + d1 * s2 + d2 * s3 + s0)>>
-  load %6[%c0, %c0, %c0] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + d1 * s2 + d2 * s3 + s0)>>
+  store %v0, %6[%c0, %c0, %c0] : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + d1 * s2 + d2 * s3 + s0)>>
 
   // Test: subview shape are folded, but offsets and strides are not even if base memref is static
   // CHECK: subview %[[ALLOC0]][%[[ARG0]], %[[ARG0]], %[[ARG0]]] [] [%[[ARG1]], %[[ARG1]], %[[ARG1]]] : memref<8x16x4xf32, #[[BASE_MAP0]]> to memref<7x11x2xf32, #[[SUBVIEW_MAP3]]>
   %10 = subview %0[%arg0, %arg0, %arg0] [%c7, %c11, %c2] [%arg1, %arg1, %arg1] : memref<8x16x4xf32, offset:0, strides:[64, 4, 1]> to memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
-  load %10[%arg1, %arg1, %arg1] : memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
+  store %v0, %10[%arg1, %arg1, %arg1] : memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
 
   // Test: subview strides are folded, but offsets and shape are not even if base memref is static
   // CHECK: subview %[[ALLOC0]][%[[ARG0]], %[[ARG0]], %[[ARG0]]] [%[[ARG1]], %[[ARG1]], %[[ARG1]]] [] : memref<8x16x4xf32, #[[BASE_MAP0]]> to memref<?x?x?xf32, #[[SUBVIEW_MAP4]]>
   %11 = subview %0[%arg0, %arg0, %arg0] [%arg1, %arg1, %arg1] [%c2, %c2, %c2] : memref<8x16x4xf32, offset:0, strides:[64, 4, 1]> to memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
-  load %11[%arg0, %arg0, %arg0] : memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
+  store %v0, %11[%arg0, %arg0, %arg0] : memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
 
   // Test: subview offsets are folded, but strides and shape are not even if base memref is static
   // CHECK: subview %[[ALLOC0]][] [%[[ARG1]], %[[ARG1]], %[[ARG1]]] [%[[ARG0]], %[[ARG0]], %[[ARG0]]] : memref<8x16x4xf32, #[[BASE_MAP0]]> to memref<?x?x?xf32, #[[SUBVIEW_MAP5]]>
   %13 = subview %0[%c1, %c1, %c1] [%arg1, %arg1, %arg1] [%arg0, %arg0, %arg0] : memref<8x16x4xf32, offset:0, strides:[64, 4, 1]> to memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
-  load %13[%arg1, %arg1, %arg1] : memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
+  store %v0, %13[%arg1, %arg1, %arg1] : memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
 
   // CHECK: %[[ALLOC2:.*]] = alloc(%[[ARG0]], %[[ARG0]], %[[ARG1]])
   %14 = alloc(%arg0, %arg0, %arg1) : memref<?x?x?xf32>
 
   // Test: subview shape are folded, even if base memref is not static
   // CHECK: subview %[[ALLOC2]][%[[ARG0]], %[[ARG0]], %[[ARG0]]] [] [%[[ARG1]], %[[ARG1]], %[[ARG1]]] : memref<?x?x?xf32> to memref<7x11x2xf32, #[[SUBVIEW_MAP3]]>
   %15 = subview %14[%arg0, %arg0, %arg0] [%c7, %c11, %c2] [%arg1, %arg1, %arg1] : memref<?x?x?xf32> to memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
-  load %15[%arg1, %arg1, %arg1] : memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
+  store %v0, %15[%arg1, %arg1, %arg1] : memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
 
   // TEST: subview strides are not folded when the base memref is not static
   // CHECK: subview %[[ALLOC2]][%[[ARG0]], %[[ARG0]], %[[ARG0]]] [%[[ARG1]], %[[ARG1]], %[[ARG1]]] [%[[C2]], %[[C2]], %[[C2]]] : memref<?x?x?xf32> to memref<?x?x?xf32, #[[SUBVIEW_MAP0]]>
   %16 = subview %14[%arg0, %arg0, %arg0] [%arg1, %arg1, %arg1] [%c2, %c2, %c2] : memref<?x?x?xf32> to memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
-  load %16[%arg0, %arg0, %arg0] : memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
+  store %v0, %16[%arg0, %arg0, %arg0] : memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
 
   // TEST: subview offsets are not folded when the base memref is not static
   // CHECK: subview %[[ALLOC2]][%[[C1]], %[[C1]], %[[C1]]] [%[[ARG0]], %[[ARG0]], %[[ARG0]]] [%[[ARG1]], %[[ARG1]], %[[ARG1]]] : memref<?x?x?xf32> to memref<?x?x?xf32, #[[SUBVIEW_MAP0]]>
   %17 = subview %14[%c1, %c1, %c1] [%arg0, %arg0, %arg0] [%arg1, %arg1, %arg1] : memref<?x?x?xf32> to memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
-  load %17[%arg0, %arg0, %arg0] : memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
+  store %v0, %17[%arg0, %arg0, %arg0] : memref<?x?x?xf32, offset: ?, strides: [?, ?, ?]>
 
   // CHECK: %[[ALLOC3:.*]] = alloc() : memref<12x4xf32>
   %18 = alloc() : memref<12x4xf32>
@@ -815,12 +825,12 @@
   // TEST: subview strides are maintained when sizes are folded
   // CHECK: subview %[[ALLOC3]][%arg1, %arg1] [] [] : memref<12x4xf32> to memref<2x4xf32, #[[SUBVIEW_MAP6]]>
   %19 = subview %18[%arg1, %arg1] [%c2, %c4] [] : memref<12x4xf32> to memref<?x?xf32, offset: ?, strides: [4, 1]>
-  load %19[%arg1, %arg1] : memref<?x?xf32, offset: ?, strides: [4, 1]>
+  store %v0, %19[%arg1, %arg1] : memref<?x?xf32, offset: ?, strides: [4, 1]>
 
   // TEST: subview strides and sizes are maintained when offsets are folded
   // CHECK: subview %[[ALLOC3]][] [] [] : memref<12x4xf32> to memref<12x4xf32, #[[SUBVIEW_MAP7]]>
   %20 = subview %18[%c2, %c4] [] [] : memref<12x4xf32> to memref<12x4xf32, offset: ?, strides:[4, 1]>
-  load %20[%arg1, %arg1] : memref<12x4xf32, offset: ?, strides:[4, 1]>
+  store %v0, %20[%arg1, %arg1] : memref<12x4xf32, offset: ?, strides:[4, 1]>
 
   // Test: dim on subview is rewritten to size operand.
   %7 = dim %4, 0 : memref<?x?x?xf32, affine_map<(d0, d1, d2)[s0, s1, s2, s3] -> (d0 * s1 + d1 * s2 + d2 * s3 + s0)>>
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ ... @@
-      if (SideEffect *effect = dyn_cast<SideEffect>(&decorator))
+      if (SideEffect *effect = dyn_cast<SideEffect>(&decorator)) {
+        opClass.addTrait(effect->getInterfaceTrait());
         interfaceEffects[effect->getBaseEffectName()].push_back(
             EffectLocation{*effect, index, kind});
+      }
   };
 
   // Collect effects that were specified via:
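
Note on the OpClass::addTrait change above: opClass.addTrait(effect->getInterfaceTrait()) now runs once per declared effect, so an op with several effects (or one that also lists the interface trait explicitly) would request the same trait more than once. Pairing a StringSet (fast membership test) with a SmallVector (stable emission order) keeps the generated Op<...> trait list duplicate-free without reordering it. Below is a minimal standalone sketch of that idiom using only the C++ standard library in place of llvm::StringSet/SmallVector; the class and trait names are illustrative, not the real generated ones.

#include <iostream>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>

// Mirrors the traitsVec/traitsSet pair in tblgen::OpClass: the vector
// preserves first-insertion order (the order traits are emitted into the
// generated class), while the set makes the duplicate check O(1).
class TraitList {
public:
  void addTrait(std::string trait) {
    // insert() returns {iterator, inserted}; 'inserted' is true only the
    // first time a given trait is seen.
    if (seen.insert(trait).second)
      ordered.push_back(std::move(trait));
  }
  const std::vector<std::string> &traits() const { return ordered; }

private:
  std::unordered_set<std::string> seen;
  std::vector<std::string> ordered;
};

int main() {
  TraitList traits;
  traits.addTrait("OpTrait::OneResult");
  traits.addTrait("MemoryEffectOpInterface::Trait"); // added explicitly
  traits.addTrait("MemoryEffectOpInterface::Trait"); // re-added via an effect; ignored
  for (const std::string &t : traits.traits())
    std::cout << t << "\n"; // prints the two distinct traits, in order
}

A plain set alone would deduplicate but lose the insertion order that the emitted class definition depends on, which is why the two containers are kept in sync.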