diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/lit.local.cfg
@@ -0,0 +1,2 @@
+if not config.enable_cuda_runner:  # CUDA runner not enabled in this configuration
+  config.unsupported = True  # flag the tests governed by this config as unsupported
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -0,0 +1,276 @@
+// RUN: mlir-opt --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse,gpu.module(gpu-to-cubin{chip=sm_80 features=+ptx71}))" %s \
+// RUN: | mlir-opt --convert-scf-to-cf -convert-cf-to-llvm --convert-vector-to-llvm --convert-arith-to-llvm --gpu-to-llvm --reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --e main --entry-point-result=void \
+// RUN: | FileCheck %s
+
+module attributes {gpu.container_module} {
+
+  // Kernels that run on the device.
+
+  gpu.module @kernels {
+
+    // GPU kernel to compute
+    //   C = A x B
+    // using NVidia 2:4 structured sparsity for A.
+    //
+    // Operand A values (2:4 sparse), row major format, logically "16x32xf16".
+    // Operand A metadata.
+    // - Threads     2i -> col 0
+    //           2i + 1 -> col 1
+    // Operand B (dense). Col major format.
+    // Operand C (accum)
+    //
+    gpu.func @mma_sp_sync_f16_16832(
+        %argA: memref<16x16xf16>,
+        %argA_meta: memref<16x2xi16>,
+        %argB: memref<8x32xf16>,
+        %argC: memref<16x8xf16>) kernel {
+
+      // Assume we have a linear thread id and the kernel launches 32 threads (1 warp).
+      // So CUDA launch would be threadblock = (32, 1, 1), grid = (1, 1, 1)
+      %lane_id = gpu.thread_id x
+      // Which group of 4 threads do we belong to?
+      %quad_id = affine.apply affine_map<()[s0]->(s0 floordiv 4)>()[%lane_id]
+      // Are we an even or an odd thread (selects metadata column 0 or 1)?
+      %pair_id = affine.apply affine_map<()[s0]->(s0 mod 2)>()[%lane_id]
+
+      // Load & combine the two pieces of i16 metadata required. Obviously, it's
+      // possible to re-pack the metadata before launching the kernel in order
+      // to eliminate this cost and load a single i32 operand. This just shows
+      // how to put them together if you do the naive load per the diagram in
+      // the PTX docs. Technically only the first two threads in each quad need
+      // to do this. Element 0 covers row quad_id, element 1 row quad_id + 8.
+      %meta_A_per_thread0 = memref.load %argA_meta[%quad_id, %pair_id] : memref<16x2xi16>
+      %row2 = affine.apply affine_map<()[s0]->(s0 + 8)>()[%quad_id]
+      %meta_A_per_thread1 = memref.load %argA_meta[%row2, %pair_id] : memref<16x2xi16>
+      %meta_init = arith.constant dense<0> : vector<2xi16>
+      %meta_low = vector.insert %meta_A_per_thread0, %meta_init[0] : i16 into vector<2xi16>
+      %meta = vector.insert %meta_A_per_thread1, %meta_low[1] : i16 into vector<2xi16>
+
+      // LOAD A
+      // Load the actual fragments for the dense values. This can be done using ldmatrix,
+      // but here we just do naive individual loads.
+      // Each thread reads f16 pairs at rows {q, q + 8} and columns {2t, 2t + 8} (q = lane / 4, t = lane % 4).
+      %row0 = affine.apply affine_map<()[s0]->(s0 floordiv 4)>()[%lane_id]
+      %row8 = affine.apply affine_map<()[s0]->(s0 floordiv 4 + 8)>()[%lane_id]
+      %col0 = affine.apply affine_map<()[s0]->((s0 mod 4) * 2)>()[%lane_id]
+      %col8 = affine.apply affine_map<()[s0]->((s0 mod 4) * 2 + 8)>()[%lane_id]
+      %f0  = arith.constant 0.0 : f16
+      %quad00 = vector.transfer_read %argA[%row0, %col0], %f0 {in_bounds = [true]} : memref<16x16xf16>, vector<2xf16>
+      %quad10 = vector.transfer_read %argA[%row8, %col0], %f0 {in_bounds = [true]} : memref<16x16xf16>, vector<2xf16>
+      %quad01 = vector.transfer_read %argA[%row0, %col8], %f0 {in_bounds = [true]} : memref<16x16xf16>, vector<2xf16>
+      %quad11 = vector.transfer_read %argA[%row8, %col8], %f0 {in_bounds = [true]} : memref<16x16xf16>, vector<2xf16>
+      %init = arith.constant dense<0.0> : vector<4x2xf16>
+      %A_data = vector.insert %quad00, %init[0] : vector<2xf16> into vector<4x2xf16>
+      %A_data1 = vector.insert %quad10, %A_data[1] : vector<2xf16> into vector<4x2xf16>
+      %A_data2 = vector.insert %quad01, %A_data1[2] : vector<2xf16> into vector<4x2xf16>
+      %A_data3 = vector.insert %quad11, %A_data2[3] : vector<2xf16> into vector<4x2xf16>
+
+      // LOAD B
+      // Load the actual fragments for the dense values. This can be done using ldmatrix,
+      // but here we just do naive individual loads (B is stored column major, so k is the minor index).
+      %b_row0 = affine.apply affine_map<()[s0]->( (s0 mod 4) * 2 )>()[%lane_id]
+      %b_row8 = affine.apply affine_map<()[s0]->( (s0 mod 4) * 2 +8 )>()[%lane_id]
+      %b_row16 = affine.apply affine_map<()[s0]->( (s0 mod 4) * 2 +16 )>()[%lane_id]
+      %b_row24 = affine.apply affine_map<()[s0]->( (s0 mod 4) * 2 +24 )>()[%lane_id]
+      %b_col = affine.apply affine_map<()[s0]->(s0 floordiv 4)>()[%lane_id]
+      %quad0_b = vector.transfer_read %argB[%b_col, %b_row0], %f0 {in_bounds = [true]} : memref<8x32xf16>, vector<2xf16>
+      %quad1_b = vector.transfer_read %argB[%b_col, %b_row8], %f0 {in_bounds = [true]} : memref<8x32xf16>, vector<2xf16>
+      %quad2_b = vector.transfer_read %argB[%b_col, %b_row16], %f0 {in_bounds = [true]} : memref<8x32xf16>, vector<2xf16>
+      %quad3_b = vector.transfer_read %argB[%b_col, %b_row24], %f0 {in_bounds = [true]} : memref<8x32xf16>, vector<2xf16>
+      %B_data = vector.insert %quad0_b, %init[0] : vector<2xf16> into vector<4x2xf16>
+      %B_data1 = vector.insert %quad1_b, %B_data[1] : vector<2xf16> into vector<4x2xf16>
+      %B_data2 = vector.insert %quad2_b, %B_data1[2] : vector<2xf16> into vector<4x2xf16>
+      %B_data3 = vector.insert %quad3_b, %B_data2[3] : vector<2xf16> into vector<4x2xf16>
+
+      // For now the accumulator is simply a zeroed pair of f16x2 registers.
+      %accum = arith.constant dense<0.0> : vector<2x2xf16>
+
+      // Sparsity selector. For 16x8x32, "0" means Threads T0/T1 within each group of four threads contribute metadata.
+      //  "mma.sp.sync.aligned.m16n8k32.row.col.f16.f16.f16.f16
+      //     {%0,%1}, " "{%2,%3,%4,%5}, {%6,%7,%8,%9}, {%10,%11}, %12, 0x0;\n"
+      //    : "=r"(D[0]), "=r"(D[1])
+      //    : "r"(A[0]), "r"(A[1]), "r"(A[2]), "r"(A[3]),
+      //      "r"(B[0]), "r"(B[1]), "r"(B[2]), "r"(B[3]),
+      //      "r"(C[0]), "r"(C[1]),
+      //      "r"(E));
+      %d = nvgpu.mma.sp.sync(%A_data3, %B_data3, %accum)
+           metadata(%meta)
+           {mmaShape = [16, 8, 32]} : (vector<4x2xf16>, vector<4x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
+
+      // Split the result into its two f16x2 registers (rows quad_id and quad_id + 8 of C).
+      %C_0 = vector.extract %d[0] : vector<2x2xf16>
+      %C_1 = vector.extract %d[1] : vector<2x2xf16>
+
+      // Store per the accumulator fragment layout: columns 2t and 2t + 1 of rows q and q + 8.
+      %c_col = affine.apply affine_map<()[s0]->((s0 mod 4) * 2)>()[%lane_id]
+      vector.transfer_write %C_0, %argC[%quad_id, %c_col] {in_bounds = [true]} : vector<2xf16>, memref<16x8xf16>
+      vector.transfer_write %C_1, %argC[%row2, %c_col] {in_bounds = [true]} : vector<2xf16>, memref<16x8xf16>
+      gpu.return
+    }
+  }
+
+  // Code that runs on the host.
+
+  //
+  // This test performs a matrix multiplication
+  //   C = A x B
+  // using NVidia 2:4 structured sparsity for A.
+  //
+  func.func @main() {
+    %f0  = arith.constant 0.0 : f16
+    %c0  = arith.constant 0   : index
+    %c1  = arith.constant 1   : index
+    %c8  = arith.constant 8   : index
+    %c16 = arith.constant 16  : index
+    %c32 = arith.constant 32  : index
+    %c64 = arith.constant 64  : index
+
+    // Matrices A, B, C (16x32, 32x8, 16x8)
+    %a = memref.alloc() : memref<16x16xf16>  // 16x32 but 2:4, row-major
+    %b = memref.alloc() : memref<8x32xf16>   // regular dense  column-major
+    %c = memref.alloc() : memref<16x8xf16>   // accumulator    row-major
+
+    // Metadata for A.
+    %m = memref.alloc() : memref<16x2xi16>
+
+    // Setup matrix A: a[i][j] = 64 - (i + j).
+    scf.for %ai = %c0 to %c16 step %c1 {
+      scf.for %aj = %c0 to %c16 step %c1 {
+        %a0 = arith.addi %ai, %aj : index
+        %a1 = arith.subi %c64, %a0 : index
+        %a2 = arith.index_cast %a1 : index to i32
+        %a3 = arith.sitofp %a2 : i32 to f16
+        memref.store %a3, %a[%ai, %aj] : memref<16x16xf16>
+      }
+    }
+
+    // Setup metadata for matrix A.
+    //
+    // Here we assume that all 2:4 elements are in pos 0 and 2 of every
+    // group of four, viz. in matrix
+    //   | X 0 Y 0 | A 0 B 0 | ...
+    // Each group contributes two 2-bit positions, the lower one in the low
+    // bits, so one group is the nibble 0b1000 and four groups fill an i16:
+    //     { 10 00 10 00 10 00 10 00 } = 0x8888 = -30584 as signed i16
+    //
+    %bits = arith.constant -30584 : i16
+    scf.for %mi = %c0 to %c16 step %c1 {
+      memref.store %bits, %m[%mi, %c0] : memref<16x2xi16>
+      memref.store %bits, %m[%mi, %c1] : memref<16x2xi16>
+    }
+
+    // Setup matrix B: b[i][j] = 16 + i + j.
+    scf.for %bi = %c0 to %c8 step %c1 {
+      scf.for %bj = %c0 to %c32 step %c1 {
+        %b0 = arith.addi %bi, %bj : index
+        %b1 = arith.addi %b0, %c16 : index
+        %b2 = arith.index_cast %b1 : index to i32
+        %b3 = arith.sitofp %b2 : i32 to f16
+        memref.store %b3, %b[%bi, %bj] : memref<8x32xf16>
+      }
+    }
+
+    // Reset matrix C.
+    scf.for %ci = %c0 to %c16 step %c1 {
+      scf.for %cj = %c0 to %c8 step %c1 {
+        memref.store %f0, %c[%ci, %cj] : memref<16x8xf16>
+      }
+    }
+
+    // Sanity check on input matrix A.
+    // CHOCK:      ( 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49 )
+    // CHOCK-NEXT: ( 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48 )
+    // CHOCK-NEXT: ( 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47 )
+    // CHOCK-NEXT: ( 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46 )
+    // CHOCK-NEXT: ( 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45 )
+    // CHOCK-NEXT: ( 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44 )
+    // CHOCK-NEXT: ( 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43 )
+    // CHOCK-NEXT: ( 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42 )
+    // CHOCK-NEXT: ( 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41 )
+    // CHOCK-NEXT: ( 55, 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40 )
+    // CHOCK-NEXT: ( 54, 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39 )
+    // CHOCK-NEXT: ( 53, 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38 )
+    // CHOCK-NEXT: ( 52, 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37 )
+    // CHOCK-NEXT: ( 51, 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36 )
+    // CHOCK-NEXT: ( 50, 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35 )
+    // CHOCK-NEXT: ( 49, 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34 )
+    //
+    scf.for %pai = %c0 to %c16 step %c1 {
+      %pa0 = vector.transfer_read %a[%pai, %c0], %f0 : memref<16x16xf16>, vector<16xf16>
+      vector.print %pa0 : vector<16xf16>
+    }
+
+    // Sanity check on input matrix 32x8 B.
+    // Note that this is really printed as B^T
+    // CHOCK-NEXT: ( 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47 )
+    // CHOCK-NEXT: ( 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48 )
+    // CHOCK-NEXT: ( 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49 )
+    // CHOCK-NEXT: ( 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50 )
+    // CHOCK-NEXT: ( 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51 )
+    // CHOCK-NEXT: ( 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52 )
+    // CHOCK-NEXT: ( 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53 )
+    // CHOCK-NEXT: ( 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54 )
+    //
+    scf.for %pbi = %c0 to %c8 step %c1 {
+      %pb0 = vector.transfer_read %b[%pbi, %c0], %f0 : memref<8x32xf16>, vector<32xf16>
+      vector.print %pb0 : vector<32xf16>
+    }
+
+    // Register the host buffers with the GPU runtime so that the kernel
+    // can read and write them directly through the same memrefs.
+    %cast_a = memref.cast %a : memref<16x16xf16> to memref<*xf16>
+    gpu.host_register %cast_a : memref<*xf16>
+    %cast_m = memref.cast %m : memref<16x2xi16> to memref<*xi16>
+    gpu.host_register %cast_m : memref<*xi16>
+    %cast_b = memref.cast %b : memref<8x32xf16> to memref<*xf16>
+    gpu.host_register %cast_b : memref<*xf16>
+    %cast_c = memref.cast %c : memref<16x8xf16> to memref<*xf16>
+    gpu.host_register %cast_c : memref<*xf16>
+
+    // Call the kernel, using a single warp of 32 threads.
+    %t1 = arith.constant 1  : index
+    %tt = arith.constant 32 : index
+    gpu.launch_func @kernels::@mma_sp_sync_f16_16832
+            blocks  in (%t1, %t1, %t1) // gridSizeX,Y,Z
+            threads in (%tt, %t1, %t1) // blockSizeX,Y,Z
+            args(%a : memref<16x16xf16>,
+                 %m : memref<16x2xi16>,
+                 %b : memref<8x32xf16>,
+                 %c : memref<16x8xf16>)
+
+    // Verify computed matrix C.
+    //
+    // TBD: correct result. The rows below use an inactive prefix until then;
+    // only the placeholder directive after them is matched by FileCheck.
+    //
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHOCK-NEXT: ( 0, 0, 0, 0, 0, 0, 0, 0 )
+    //
+    // CHECK: BIK TRIED CUDA
+    //
+    scf.for %pci = %c0 to %c16 step %c1 {
+      %pc0 = vector.transfer_read %c[%pci, %c0], %f0 : memref<16x8xf16>, vector<8xf16>
+      vector.print %pc0 : vector<8xf16>
+    }
+
+    return
+  }
+}