diff --git a/mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp b/mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/SplitReduction.cpp
@@ -15,6 +15,7 @@
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
@@ -171,11 +172,14 @@
     outputExpr.push_back(
         b.getAffineDimExpr(dim < insertSplitDimension ? dim : dim + 1));
   }
-  Value initTensor = b.create<linalg::InitTensorOp>(
-      loc, newOutputShape, op.getRegionOutputArgs()[0].getType());
+  Value allocTensor = b.create<bufferization::AllocTensorOp>(
+      loc,
+      RankedTensorType::get(newOutputShape,
+                            op.getRegionOutputArgs()[0].getType()),
+      ValueRange{});
   Value constantOp = b.create<arith::ConstantOp>(loc, identity);
   Value identityTensor =
-      b.create<linalg::FillOp>(op->getLoc(), constantOp, initTensor)
+      b.create<linalg::FillOp>(op->getLoc(), constantOp, allocTensor)
           .getResult(0);
   newMaps.push_back(AffineMap::get(oldOutputMap.getNumDims() + 1, 0, outputExpr,
@@ -189,7 +193,7 @@
   // Create the new op matching the original op with an extra parallel
   // dimension.
   GenericOp genericOp = b.create<GenericOp>(
-      loc, TypeRange({initTensor.getType()}), newInputs,
+      loc, TypeRange({allocTensor.getType()}), newInputs,
       ValueRange({identityTensor}), newMaps, newIteratorTypes);
   b.inlineRegionBefore(op->getRegion(0), genericOp.region(),
                        genericOp.region().begin());
@@ -297,7 +301,7 @@
     return b.notifyMatchFailure(op, "unknown reduction neutral");
   // TODO: relax this when multi-reduction support is available.
-  if (op.getNumOutputs() != neutralElements.size())
+  if (op.getNumOutputs() != (int)neutralElements.size())
     return b.notifyMatchFailure(op, "expect one reduction per output");
   // Rewrite part.
@@ -327,8 +331,7 @@
         reductionDimSize / splitFactor, insertSplitDimension);
     SmallVector<Value> dims =
         tensor::createDynamicDimValues(b, loc, rankedTensor);
-    Value initTensor = b.create<linalg::InitTensorOp>(
-        loc, dims, newT.getShape(), t.getElementType());
+    Value initTensor = b.create<bufferization::AllocTensorOp>(loc, newT, dims);
     Value constantOp = b.create<arith::ConstantOp>(loc, std::get<1>(it));
     fillOps.push_back(
         b.create<linalg::FillOp>(op->getLoc(), constantOp, initTensor));
diff --git a/mlir/test/Dialect/Linalg/split_reduction.mlir b/mlir/test/Dialect/Linalg/split_reduction.mlir
--- a/mlir/test/Dialect/Linalg/split_reduction.mlir
+++ b/mlir/test/Dialect/Linalg/split_reduction.mlir
@@ -15,7 +15,7 @@
 // CHECK-DAG: %[[ID:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}{{\[}}[0], [1, 2]] : tensor<16x256xf32> into tensor<16x4x64xf32>
 // CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}{{\[}}[0, 1], [2]] : tensor<256x32xf32> into tensor<4x64x32xf32>
-// CHECK-DAG: %[[INI:.*]] = linalg.init_tensor [16, 32, 4] : tensor<16x32x4xf32>
+// CHECK-DAG: %[[INI:.*]] = bufferization.alloc_tensor() : tensor<16x32x4xf32>
 // CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<16x32x4xf32>) -> tensor<16x32x4xf32>
 // CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]]
 // CHECK-SAME: , iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
@@ -57,7 +57,7 @@
 //CHECK-LABEL: @generic_split_1d
 // CHECK: %[[ID:.*]] = arith.constant 1.000000e+00 : f32
 // CHECK: %[[I1:.*]] = tensor.expand_shape %{{.*}}{{\[}}[0, 1]] : tensor<32xf32> into tensor<4x8xf32>
-// CHECK: %[[INI:.*]] = linalg.init_tensor [4] : tensor<4xf32>
+// CHECK: %[[INI:.*]] = bufferization.alloc_tensor() : tensor<4xf32>
 // CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<4xf32>) -> tensor<4xf32>
 // CHECK: %[[G:.*]] = linalg.generic
 // CHECK: {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]],
@@ -103,7 +103,7 @@
 // CHECK: %[[ID:.*]] = arith.constant -3.40282347E+38 : f32
 // CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}{{\[}}[0, 1], [2]] : tensor<32x2xf32> into tensor<4x8x2xf32>
 // CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}{{\[}}[0], [1, 2]] : tensor<5x32xf32> into tensor<5x4x8xf32>
-// CHECK: %[[INI:.*]] = linalg.init_tensor [5, 2, 4] : tensor<5x2x4xf32>
+// CHECK: %[[INI:.*]] = bufferization.alloc_tensor() : tensor<5x2x4xf32>
 // CHECK: %[[F:.*]] = linalg.fill ins(%[[ID]] : f32) outs(%[[INI]] : tensor<5x2x4xf32>) -> tensor<5x2x4xf32>
 // CHECK: %[[G:.*]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "reduction", "parallel", "parallel"]}
 // CHECK-SAME: ins(%[[I1]], %[[I2]] : tensor<4x8x2xf32>, tensor<5x4x8xf32>) outs(%[[F]] : tensor<5x2x4xf32>) {
diff --git a/mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir b/mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-split-reduction-by-scaling.mlir
@@ -3,6 +3,7 @@
 // CHECK-LABEL: func.func @matmul_split
 func.func @matmul_split(%A : tensor<?x256xf32>, %B: tensor<256x32xf32>, %C: tensor<?x32xf32>) -> tensor<?x32xf32> {
+  // CHECK: bufferization.alloc_tensor({{.*}}) : tensor<?x32x4xf32>
   // CHECK: linalg.generic
   // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"]
   // CHECK-SAME: ins(%{{[a-zA-Z0-9]*}}, %{{[a-zA-Z0-9]*}}, %{{[a-zA-Z0-9]*}} : tensor<?x256xf32>, tensor<256x32xf32>, tensor<64x4xi1>)
diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
--- a/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Dialect/Linalg/TestLinalgTransforms.cpp
@@ -12,6 +12,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
@@ -41,6 +42,7 @@
   void getDependentDialects(DialectRegistry &registry) const override {
     // clang-format off
     registry.insert
/tmp/aaa.s
+
+llvm.func @matrix_intrinsics(%A: vector<64 x f64>, %B: vector<64 x f64>)
+                             -> vector<64 x f64> {
+  %C = llvm.intr.matrix.multiply %A, %B
+    { lhs_rows = 8: i32, lhs_columns = 8: i32 , rhs_columns = 8: i32} :
+    (vector<64 x f64>, vector<64 x f64>) -> vector<64 x f64>
+  llvm.return %C: vector<64 x f64>
+}
diff --git a/ntv-trunk/PASC-2022-examples/scf-dialect.mlir b/ntv-trunk/PASC-2022-examples/scf-dialect.mlir
new file mode 100644
--- /dev/null
+++ b/ntv-trunk/PASC-2022-examples/scf-dialect.mlir
@@ -0,0 +1,16 @@
+func.func @conditional_reduce(%buffer: memref<1024xf32>, %lb: index, %ub: index, %step: index) -> (f32) {
+  %sum_0 = arith.constant 0.0 : f32
+  %c0 = arith.constant 0.0 : f32
+  %sum = scf.for %iv = %lb to %ub step %step iter_args(%sum_iter = %sum_0) -> (f32) {
+    %t = memref.load %buffer[%iv] : memref<1024xf32>
+    %cond = arith.cmpf ugt, %t, %c0 : f32
+    %sum_next = scf.if %cond -> (f32) {
+      %new_sum = arith.addf %sum_iter, %t : f32
+      scf.yield %new_sum : f32
+    } else {
+      scf.yield %sum_iter : f32
+    }
+    scf.yield %sum_next : f32
+  }
+  return %sum : f32
+}
diff --git a/ntv-trunk/PASC-2022-examples/transform-dialect.mlir b/ntv-trunk/PASC-2022-examples/transform-dialect.mlir
new file mode 100644
--- /dev/null
+++ b/ntv-trunk/PASC-2022-examples/transform-dialect.mlir
@@ -0,0 +1,55 @@
+// RUN: iree-opt %s
+
+transform.with_pdl_patterns {
+^bb0(%arg0: !pdl.operation):
+  pdl.pattern @pdl_generic_target : benefit(1) {
+    %args = operands
+    %results = types
+    %0 = operation "linalg.generic"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+    // TODO: we don't want this, but it is the required terminator for pdl.pattern
+    rewrite %0 with "transform.dialect"
+  }
+  pdl.pattern @pdl_if_op_target : benefit(1) {
+    %args = operands
+    %results = types
+    %0 = operation "scf.if"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+    // TODO: we don't want this, but it is the required terminator for pdl.pattern
+    rewrite %0 with "transform.dialect"
+  }
+  pdl.pattern @pdl_foreach_thread_op_target : benefit(1) {
+    %args = operands
+    %results = types
+    %0 = operation "scf.foreach_thread"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+    // TODO: we don't want this, but it is the required terminator for pdl.pattern
+    rewrite %0 with "transform.dialect"
+  }
+  pdl.pattern @pdl_warp_execute_on_lane_0_op_target : benefit(1) {
+    %args = operands
+    %results = types
+    %0 = operation "vector.warp_execute_on_lane_0"(%args : !pdl.range<value>) -> (%results : !pdl.range<type>)
+    // TODO: we don't want this, but it is the required terminator for pdl.pattern
+    rewrite %0 with "transform.dialect"
+  }
+
+  transform.structured.canonicalized_sequence %arg0 {
+  ^bb1(%arg1: !pdl.operation):
+    %0 = pdl_match @pdl_generic_target in %arg1
+    %fill_op, %more_parallel_op, %combiner_op = transform.structured.split_reduction %0
+      { split_factor = 2, insert_split_dimension = 1}
+
+    %foreach_thread_2, %tiled_more_parallel_op =
+      tile_to_foreach_thread_op %more_parallel_op {num_threads = [4, 2], thread_dim_mapping = [2, 1, 0]}
+
+    %isolated = transform.get_closest_isolated_parent %foreach_thread_2
+    transform.structured.vectorize %isolated
+
+    transform.iree.bufferize
+    %some_foreach_thread = pdl_match @pdl_foreach_thread_op_target in %arg1
+    %isolated_handle_2 = transform.get_closest_isolated_parent %some_foreach_thread
+    transform.iree.foreach_thread_to_gpu_and_translation_info %isolated_handle_2 { workgroup_size = [32, 2, 4] }
+
+    %warp = pdl_match @pdl_warp_execute_on_lane_0_op_target in %arg1
+    %isolated_2 = transform.get_closest_isolated_parent %warp
+    transform.iree.vector.warp_distribute %isolated_2
+  }
+}
diff --git a/ntv-trunk/PASC-2022-examples/vector-dialect.mlir b/ntv-trunk/PASC-2022-examples/vector-dialect.mlir
new file mode 100644
--- /dev/null
+++ b/ntv-trunk/PASC-2022-examples/vector-dialect.mlir
@@ -0,0 +1,11 @@
+// mlir-opt -test-vector-multi-reduction-lowering-patterns="use-outer-reductions" ntv-trunk/PASC-2022-examples/vector-dialect.mlir
+//
+// mlir-opt -test-vector-multi-reduction-lowering-patterns="use-outer-reductions" ntv-trunk/PASC-2022-examples/vector-dialect.mlir | \
+// mlir-opt -convert-vector-to-llvm -convert-func-to-llvm -convert-cf-to-llvm -canonicalize -cse | \
+// mlir-translate --mlir-to-llvmir | \
+// opt -O3 -enable-matrix -matrix-allow-contract -mtriple=x86_64-grtev4-linux-gnu -mcpu=skylake-avx512 | \
+// llc -O3 -mtriple=x86_64-grtev4-linux-gnu -mcpu=skylake-avx512
+func.func @vector_reduction_outer(%arg0: vector<2x3x4x5xi32>) -> vector<2x3xi32> {
+  %0 = vector.multi_reduction <add>, %arg0 [2, 3] : vector<2x3x4x5xi32> to vector<2x3xi32>
+  return %0 : vector<2x3xi32>
+}
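
For readers who have not run the pass, here is a hand-written sketch of the kind of IR the split-reduction rewrite (exercised by split_reduction.mlir above) produces for a 1-D multiplicative reduction: the 32-element reduction dimension is reshaped into 4x8, a partial reduction over the inner dimension runs with the outer dimension kept parallel, and a final generic folds the 4 partial results into the original output. The function name, map names, and exact op bodies below are illustrative only, not verbatim test output; the relevant point for this patch is that the intermediate is created with bufferization.alloc_tensor rather than linalg.init_tensor.

#map_in  = affine_map<(d0, d1) -> (d0, d1)>
#map_out = affine_map<(d0, d1) -> (d0)>
#map_par = affine_map<(d0) -> (d0)>
#map_red = affine_map<(d0) -> ()>

func.func @split_mul_reduction_sketch(%in: tensor<32xf32>, %out: tensor<f32>) -> tensor<f32> {
  // Neutral element of multiplication, used to seed the partial results.
  %id = arith.constant 1.000000e+00 : f32
  // 32 = 4 * 8: expose the split factor as an extra dimension.
  %exp = tensor.expand_shape %in [[0, 1]] : tensor<32xf32> into tensor<4x8xf32>
  // The intermediate now comes from bufferization.alloc_tensor (was linalg.init_tensor).
  %alloc = bufferization.alloc_tensor() : tensor<4xf32>
  %fill = linalg.fill ins(%id : f32) outs(%alloc : tensor<4xf32>) -> tensor<4xf32>
  // Partial reduction: reduce the inner 8 elements, keep the outer 4 parallel.
  %partial = linalg.generic
      {indexing_maps = [#map_in, #map_out],
       iterator_types = ["parallel", "reduction"]}
      ins(%exp : tensor<4x8xf32>) outs(%fill : tensor<4xf32>) {
  ^bb0(%a: f32, %acc: f32):
    %m = arith.mulf %a, %acc : f32
    linalg.yield %m : f32
  } -> tensor<4xf32>
  // Final reduction of the 4 partial products into the original 0-d output.
  %res = linalg.generic
      {indexing_maps = [#map_par, #map_red],
       iterator_types = ["reduction"]}
      ins(%partial : tensor<4xf32>) outs(%out : tensor<f32>) {
  ^bb0(%p: f32, %acc: f32):
    %m = arith.mulf %p, %acc : f32
    linalg.yield %m : f32
  } -> tensor<f32>
  return %res : tensor<f32>
}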