diff --git a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
--- a/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
+++ b/mlir/include/mlir/Dialect/LoopOps/LoopOps.td
@@ -13,6 +13,7 @@
 #ifndef LOOP_OPS
 #define LOOP_OPS
 
+include "mlir/Dialect/AffineOps/AffineOpsBase.td"
 include "mlir/IR/OpBase.td"
 include "mlir/Transforms/LoopLikeInterface.td"
 
@@ -130,6 +131,16 @@
   }];
 }
 
+def LoopMappingAttr : StructAttr<"LoopMapping", Loop_Dialect, [
+  StructFieldAttr<"processor", APIntAttr>,  // id of the processor this loop dimension is mapped to
+  StructFieldAttr<"map", AffineMapAttr>]> {  // map the GPU lowering applies to the processor's index
+  let description = "Defines the mapping of a parallel loop dimension to the target.";
+}  // NOTE(review): the GPU lowering also reads an optional "bound" entry that is not declared here - confirm.
+
+def LoopMappingArrayAttr : TypedArrayAttrBase<LoopMappingAttr, "loop mapping array attribute"> {
+  // Array of LoopMappingAttr entries; the type of the optional `mapping` attribute of loop.parallel.
+}
+
 def ParallelOp : Loop_Op<"parallel",
     [SameVariadicOperandSize, SingleBlockImplicitTerminator<"TerminatorOp">]> {
   let summary = "parallel for operation";
@@ -174,7 +185,8 @@
 
   let arguments = (ins Variadic<Index>:$lowerBound,
                        Variadic<Index>:$upperBound,
-                       Variadic<Index>:$step);
+                       Variadic<Index>:$step,
+                       OptionalAttr<LoopMappingArrayAttr>:$mapping);
   let results = (outs Variadic<AnyType>:$results);
   let regions = (region SizedRegion<1>:$region);
 
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -20,8 +20,12 @@
 #include "mlir/Dialect/LoopOps/LoopOps.h"
 #include "mlir/Dialect/StandardOps/Ops.h"
 #include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/LoopUtils.h"
+#include "mlir/Transforms/Passes.h"
 #include "mlir/Transforms/RegionUtils.h"
 #include "llvm/ADT/Sequence.h"
 #include "llvm/Support/Debug.h"
@@ -487,3 +491,158 @@
                                            ArrayRef<Value> workGroupSizes) {
   return ::convertLoopToGPULaunch(forOp, numWorkGroups, workGroupSizes);
 }
+
+namespace {
+struct ParallelToGpuLaunchLowering : public OpRewritePattern<ParallelOp> {
+  using OpRewritePattern<ParallelOp>::OpRewritePattern; // inherit constructors
+  // Rewrites a mapped loop.parallel into a gpu.launch operation.
+  PatternMatchResult matchAndRewrite(ParallelOp parallelOp,
+                                PatternRewriter &rewriter) const override;
+};
+
+// Splits a mapping entry into (processor, map, bound); bound is null if unset.
+std::tuple<unsigned, AffineMap, AffineMap>
+extractMapAndOperand(Attribute attribute) {
+  auto entry = attribute.dyn_cast<DictionaryAttr>();
+  auto procAttr = entry.get("processor").dyn_cast<IntegerAttr>();
+  unsigned procId = procAttr.getValue().getSExtValue();
+  AffineMap indexMap = entry.get("map").dyn_cast<AffineMapAttr>().getValue();
+  auto boundAttr = entry.get("bound").dyn_cast_or_null<AffineMapAttr>();
+  return {procId, indexMap, boundAttr ? boundAttr.getValue() : AffineMap()};
+}
+
+// Lowers one (possibly nested) loop.parallel into `launchOp`: hardware-mapped
+// dims read a launch id, others become loop.for; body ops are queued on
+// `worklist`. Fails on results or without one valid mapping entry per dim.
+LogicalResult processParallelLoop(ParallelOp parallelOp, gpu::LaunchOp launchOp,
+                                  BlockAndValueMapping &cloning_map,
+                                  SmallVectorImpl<Operation *> &worklist,
+                                  PatternRewriter &rewriter) {
+  Block &body = parallelOp.body().front();
+  auto mapping = parallelOp.getAttrOfType<ArrayAttr>("mapping");
+  if (!mapping || parallelOp.getNumResults() != 0 ||
+      mapping.getValue().size() != body.getNumArguments())
+    return failure(); // TODO(herhut): Support reductions.
+  // TODO(herhut): Verify that this is a valid GPU mapping. For now only reject
+  // entries that extractMapAndOperand would dereference without checking.
+  for (Attribute entry : mapping) {
+    auto dict = entry.dyn_cast_or_null<DictionaryAttr>();
+    if (!dict || !dict.get("processor").dyn_cast_or_null<IntegerAttr>() ||
+        !dict.get("map").dyn_cast_or_null<AffineMapAttr>())
+      return failure();
+  }
+  Location loc = parallelOp.getLoc();
+  // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
+  for (auto config : llvm::zip(mapping, parallelOp.getInductionVars(),
+                               parallelOp.lowerBound(), parallelOp.upperBound(),
+                               parallelOp.step())) {
+    unsigned processor;
+    AffineMap map, bound;
+    std::tie(processor, map, bound) = extractMapAndOperand(std::get<0>(config));
+    Value lower = cloning_map.lookupOrDefault(std::get<2>(config));
+    Value upper = cloning_map.lookupOrDefault(std::get<3>(config));
+    Value newIndex;
+    if (processor < gpu::LaunchOp::kNumConfigOperands) {
+      // Use the corresponding thread/grid index as replacement for the loop iv.
+      Value operand = launchOp.body().front().getArgument(processor);
+      Value appliedMap = rewriter.create<AffineApplyOp>(loc, map, operand);
+      // Offset by the lower bound (maps are 0 based). NOTE(review): step unused.
+      newIndex = rewriter.create<AddIOp>(loc, appliedMap, lower);
+      // TODO(herhut): Check that we do not assign bounds twice.
+      if (bound) {
+        OpBuilder::InsertionGuard guard(rewriter);
+        rewriter.setInsertionPoint(launchOp);
+        // The bound map takes the iteration count, upperBound - lowerBound.
+        Value iterations = rewriter.create<SubIOp>(loc, upper, lower);
+        Value newBound = rewriter.create<AffineApplyOp>(loc, bound, iterations);
+        launchOp.setOperand(processor, newBound);
+      }
+    } else {
+      // Sequential dim: emit a loop.for and continue in its body. The launchOp
+      // is pushed as a sentinel (never a body op) marking the end of that body.
+      Value step = cloning_map.lookupOrDefault(std::get<4>(config));
+      auto loopOp = rewriter.create<loop::ForOp>(loc, lower, upper, step);
+      newIndex = loopOp.getInductionVar();
+      rewriter.setInsertionPointToStart(loopOp.getBody());
+      worklist.push_back(launchOp.getOperation());
+    }
+    cloning_map.map(std::get<1>(config), newIndex);
+  }
+  worklist.reserve(worklist.size() + body.getOperations().size());
+  for (Operation &op : llvm::reverse(body.without_terminator()))
+    worklist.push_back(&op);
+  return success();
+}
+
+} // namespace
+
+// Replaces a mapped loop.parallel, including mapped loop.parallel ops nested
+// in it, by one gpu.launch. Fails if any of these loops cannot be processed.
+PatternMatchResult
+ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
+                                             PatternRewriter &rewriter) const {
+  // Create a launch operation. We start with bound one for all grid/block
+  // sizes. Those will be refined later as we discover them from mappings.
+  Location loc = parallelOp.getLoc();
+  Value constantOne = rewriter.create<ConstantIndexOp>(loc, 1);
+  gpu::LaunchOp launchOp = rewriter.create<gpu::LaunchOp>(
+      loc, constantOne, constantOne, constantOne, constantOne, constantOne,
+      constantOne);
+  rewriter.setInsertionPointToEnd(&launchOp.body().front());
+  rewriter.create<gpu::TerminatorOp>(loc);
+  rewriter.setInsertionPointToStart(&launchOp.body().front());
+  BlockAndValueMapping cloning_map;
+  SmallVector<Operation *, 16> worklist;
+  if (failed(processParallelLoop(parallelOp, launchOp, cloning_map, worklist,
+                                 rewriter)))
+    return matchFailure();
+
+  // Now walk over the body and clone it.
+  // TODO: This is only correct if there either is no further loop.parallel
+  //       nested or this code is side-effect free. Otherwise we might need
+  //       predication.
+  while (!worklist.empty()) {
+    Operation *op = worklist.pop_back_val();
+    if (auto nestedParallel = dyn_cast<ParallelOp>(op)) {
+      // A nested loop.parallel needs insertion of code to compute indices.
+      // Give up if that fails: its body was not queued in that case, so
+      // continuing would silently drop the nested loop.
+      if (failed(processParallelLoop(nestedParallel, launchOp, cloning_map,
+                                     worklist, rewriter)))
+        return matchFailure();
+    } else if (op == launchOp.getOperation()) {
+      // Found our sentinel value. We have finished the operations from one
+      // nesting level, pop one level back up.
+      auto parent = rewriter.getInsertionPoint()->getParentOp();
+      rewriter.setInsertionPointAfter(parent);
+    } else {
+      // Otherwise we copy it over.
+      Operation *clone = rewriter.clone(*op, cloning_map);
+      cloning_map.map(op->getResults(), clone->getResults());
+    }
+  }
+  rewriter.eraseOp(parallelOp);
+  return matchSuccess();
+}
+
+namespace {
+  struct ParallelLoopToGpuPass : public OperationPass<ParallelLoopToGpuPass> {
+  void runOnOperation() override; // converts loop.parallel ops to gpu.launch
+};
+} // namespace
+
+void ParallelLoopToGpuPass::runOnOperation() {
+  // Only loop.parallel is illegal; other std, gpu and loop ops may remain.
+  ConversionTarget legality(getContext());
+  legality.addLegalDialect<StandardOpsDialect, gpu::GPUDialect,
+                           loop::LoopOpsDialect>();
+  legality.addIllegalOp<loop::ParallelOp>();
+  OwningRewritePatternList lowerings;
+  lowerings.insert<ParallelToGpuLaunchLowering>(&getContext());
+  if (failed(applyPartialConversion(getOperation(), legality, lowerings)))
+    signalPassFailure();
+}
+
+static PassRegistration<ParallelLoopToGpuPass> // exposed as an mlir-opt flag
+    pass("convert-parallel-loop-to-gpu", "Convert mapped loop.parallel op to "
+                                "gpu launch operations.");
\ No newline at end of file
diff --git a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
@@ -0,0 +1,178 @@
+// RUN: mlir-opt -convert-parallel-loop-to-gpu -split-input-file %s | FileCheck %s -dump-input-on-failure
+
+// 2-d parallel loop mapped to block.y and block.x
+
+func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
+                    %arg3 : index, %arg4 : index, 
+                    %buf : memref<?x?xf32>,
+                    %res : memref<?x?xf32>) {
+  %step = constant 2 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) // %i0 uses processor 1 (block.y), %i1 uses processor 0 (block.x)
+                                          step (%arg4, %step)  {
+    %val = load %buf[%i0, %i1] : memref<?x?xf32> // transposing copy from %buf to %res
+    store %val, %res[%i1, %i0] : memref<?x?xf32>
+  } { mapping = [{processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, {processor = 0, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}] }
+  return
+}
+
+// CHECK-LABEL:   func @parallel_loop(
+// CHECK-SAME:                        [[VAL_0:%.*]]: index, [[VAL_1:%.*]]: index, [[VAL_2:%.*]]: index, [[VAL_3:%.*]]: index, [[VAL_4:%.*]]: index, [[VAL_5:%.*]]: memref<?x?xf32>, [[VAL_6:%.*]]: memref<?x?xf32>) {
+// CHECK:           [[VAL_7:%.*]] = constant 2 : index
+// CHECK:           [[VAL_8:%.*]] = constant 1 : index
+// CHECK:           [[VAL_9:%.*]] = subi [[VAL_2]], [[VAL_0]] : index
+// CHECK:           [[VAL_10:%.*]] = subi [[VAL_3]], [[VAL_1]] : index
+// CHECK:           gpu.launch blocks([[VAL_11:%.*]], [[VAL_12:%.*]], [[VAL_13:%.*]]) in ([[VAL_14:%.*]] = [[VAL_10]], [[VAL_15:%.*]] = [[VAL_9]], [[VAL_16:%.*]] = [[VAL_8]]) threads([[VAL_17:%.*]], [[VAL_18:%.*]], [[VAL_19:%.*]]) in ([[VAL_20:%.*]] = [[VAL_8]], [[VAL_21:%.*]] = [[VAL_8]], [[VAL_22:%.*]] = [[VAL_8]]) {
+// CHECK:             [[VAL_23:%.*]] = addi [[VAL_12]], [[VAL_0]] : index
+// CHECK:             [[VAL_24:%.*]] = addi [[VAL_11]], [[VAL_1]] : index
+// CHECK:             [[VAL_25:%.*]] = load [[VAL_5]]{{\[}}[[VAL_23]], [[VAL_24]]] : memref<?x?xf32>
+// CHECK:             store [[VAL_25]], [[VAL_6]]{{\[}}[[VAL_24]], [[VAL_23]]] : memref<?x?xf32>
+// CHECK:             gpu.terminator
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+// CHECK:       }
+
+// -----
+
+// tiled 2-d parallel loop mapped to block.y and block.x and thread.y and thread.x.
+
+func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
+                    %arg3 : index,
+                    %buf : memref<?x?xf32>,
+                    %res : memref<?x?xf32>) {
+  %zero = constant 0 : index
+  %one = constant 1 : index
+  %four = constant 4 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) // outer loop over tiles: %i0 -> block.y, %i1 -> block.x
+                                          step (%four, %four)  {
+    loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four) // inner loop within a tile: %si0 -> thread.y, %si1 -> thread.x
+                                            step (%one, %one)  {
+      %idx0 = addi %i0, %si0 : index
+      %idx1 = addi %i1, %si1 : index
+      %val = load %buf[%idx0, %idx1] : memref<?x?xf32>
+      store %val, %res[%idx1, %idx0] : memref<?x?xf32>
+    } { mapping = [
+        {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+        {processor = 3, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+     ] }
+  } { mapping = [
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+      {processor = 0, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+    ] }
+  return
+}
+
+// CHECK-LABEL:   func @parallel_loop(
+// CHECK-SAME:                        [[VAL_26:%.*]]: index, [[VAL_27:%.*]]: index, [[VAL_28:%.*]]: index, [[VAL_29:%.*]]: index, [[VAL_30:%.*]]: memref<?x?xf32>, [[VAL_31:%.*]]: memref<?x?xf32>) {
+// CHECK:           [[VAL_32:%.*]] = constant 0 : index
+// CHECK:           [[VAL_33:%.*]] = constant 1 : index
+// CHECK:           [[VAL_34:%.*]] = constant 4 : index
+// CHECK:           [[VAL_35:%.*]] = constant 1 : index
+// CHECK:           [[VAL_36:%.*]] = subi [[VAL_28]], [[VAL_26]] : index
+// CHECK:           [[VAL_37:%.*]] = subi [[VAL_29]], [[VAL_27]] : index
+// CHECK:           [[VAL_38:%.*]] = subi [[VAL_34]], [[VAL_32]] : index
+// CHECK:           [[VAL_39:%.*]] = subi [[VAL_34]], [[VAL_32]] : index
+// CHECK:           gpu.launch blocks([[VAL_40:%.*]], [[VAL_41:%.*]], [[VAL_42:%.*]]) in ([[VAL_43:%.*]] = [[VAL_37]], [[VAL_44:%.*]] = [[VAL_36]], [[VAL_45:%.*]] = [[VAL_35]]) threads([[VAL_46:%.*]], [[VAL_47:%.*]], [[VAL_48:%.*]]) in ([[VAL_49:%.*]] = [[VAL_39]], [[VAL_50:%.*]] = [[VAL_38]], [[VAL_51:%.*]] = [[VAL_35]]) {
+// CHECK:             [[VAL_52:%.*]] = addi [[VAL_41]], [[VAL_26]] : index
+// CHECK:             [[VAL_53:%.*]] = addi [[VAL_40]], [[VAL_27]] : index
+// CHECK:             [[VAL_54:%.*]] = addi [[VAL_47]], [[VAL_32]] : index
+// CHECK:             [[VAL_55:%.*]] = addi [[VAL_46]], [[VAL_32]] : index
+// CHECK:             [[VAL_56:%.*]] = addi [[VAL_52]], [[VAL_54]] : index
+// CHECK:             [[VAL_57:%.*]] = addi [[VAL_53]], [[VAL_55]] : index
+// CHECK:             [[VAL_58:%.*]] = load [[VAL_30]]{{\[}}[[VAL_56]], [[VAL_57]]] : memref<?x?xf32>
+// CHECK:             store [[VAL_58]], [[VAL_31]]{{\[}}[[VAL_57]], [[VAL_56]]] : memref<?x?xf32>
+// CHECK:             gpu.terminator
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
+// 2-d parallel loop mapped to block.y and sequential
+
+func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
+                    %arg3 : index, %arg4 : index, 
+                    %buf : memref<?x?xf32>,
+                    %res : memref<?x?xf32>) {
+  %step = constant 2 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) // %i0 -> block.y (processor 1), %i1 -> sequential (processor 6)
+                                          step (%arg4, %step)  {
+    %val = load %buf[%i0, %i1] : memref<?x?xf32>
+    store %val, %res[%i1, %i0] : memref<?x?xf32>
+  } { mapping = [
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+      {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+    ] }
+  return
+}
+
+// CHECK-LABEL:   func @parallel_loop(
+// CHECK-SAME:                        [[VAL_59:%.*]]: index, [[VAL_60:%.*]]: index, [[VAL_61:%.*]]: index, [[VAL_62:%.*]]: index, [[VAL_63:%.*]]: index, [[VAL_64:%.*]]: memref<?x?xf32>, [[VAL_65:%.*]]: memref<?x?xf32>) {
+// CHECK:           [[VAL_66:%.*]] = constant 2 : index
+// CHECK:           [[VAL_67:%.*]] = constant 1 : index
+// CHECK:           [[VAL_68:%.*]] = subi [[VAL_61]], [[VAL_59]] : index
+// CHECK:           gpu.launch blocks([[VAL_69:%.*]], [[VAL_70:%.*]], [[VAL_71:%.*]]) in ([[VAL_72:%.*]] = [[VAL_67]], [[VAL_73:%.*]] = [[VAL_68]], [[VAL_74:%.*]] = [[VAL_67]]) threads([[VAL_75:%.*]], [[VAL_76:%.*]], [[VAL_77:%.*]]) in ([[VAL_78:%.*]] = [[VAL_67]], [[VAL_79:%.*]] = [[VAL_67]], [[VAL_80:%.*]] = [[VAL_67]]) {
+// CHECK:             [[VAL_81:%.*]] = addi [[VAL_70]], [[VAL_59]] : index
+// CHECK:             loop.for [[VAL_82:%.*]] = [[VAL_60]] to [[VAL_62]] step [[VAL_66]] {
+// CHECK:               [[VAL_83:%.*]] = load [[VAL_64]]{{\[}}[[VAL_81]], [[VAL_82]]] : memref<?x?xf32>
+// CHECK:               store [[VAL_83]], [[VAL_65]]{{\[}}[[VAL_82]], [[VAL_81]]] : memref<?x?xf32>
+// CHECK:             }
+// CHECK:             gpu.terminator
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+
+// -----
+
+// tiled 2-d parallel loop mapped to block.y and seq. and thread.y and seq.
+
+func @parallel_loop(%arg0 : index, %arg1 : index, %arg2 : index,
+                    %arg3 : index,
+                    %buf : memref<?x?xf32>,
+                    %res : memref<?x?xf32>) {
+  %zero = constant 0 : index
+  %one = constant 1 : index
+  %four = constant 4 : index
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) // outer loop over tiles: %i0 -> block.y, %i1 -> sequential
+                                          step (%four, %four)  {
+    loop.parallel (%si0, %si1) = (%zero, %zero) to (%four, %four) // inner loop within a tile: %si0 -> thread.y, %si1 -> sequential
+                                            step (%one, %one)  {
+      %idx0 = addi %i0, %si0 : index
+      %idx1 = addi %i1, %si1 : index
+      %val = load %buf[%idx0, %idx1] : memref<?x?xf32>
+      store %val, %res[%idx1, %idx0] : memref<?x?xf32>
+    } { mapping = [
+        {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+        {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+      ] }
+  } { mapping = [
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+      {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+    ] }
+  return
+}
+
+// CHECK-LABEL:   func @parallel_loop(
+// CHECK-SAME:                        [[VAL_84:%.*]]: index, [[VAL_85:%.*]]: index, [[VAL_86:%.*]]: index, [[VAL_87:%.*]]: index, [[VAL_88:%.*]]: memref<?x?xf32>, [[VAL_89:%.*]]: memref<?x?xf32>) {
+// CHECK:           [[VAL_90:%.*]] = constant 0 : index
+// CHECK:           [[VAL_91:%.*]] = constant 1 : index
+// CHECK:           [[VAL_92:%.*]] = constant 4 : index
+// CHECK:           [[VAL_93:%.*]] = constant 1 : index
+// CHECK:           [[VAL_94:%.*]] = subi [[VAL_86]], [[VAL_84]] : index
+// CHECK:           [[VAL_95:%.*]] = subi [[VAL_92]], [[VAL_90]] : index
+// CHECK:           gpu.launch blocks([[VAL_96:%.*]], [[VAL_97:%.*]], [[VAL_98:%.*]]) in ([[VAL_99:%.*]] = [[VAL_93]], [[VAL_100:%.*]] = [[VAL_94]], [[VAL_101:%.*]] = [[VAL_93]]) threads([[VAL_102:%.*]], [[VAL_103:%.*]], [[VAL_104:%.*]]) in ([[VAL_105:%.*]] = [[VAL_93]], [[VAL_106:%.*]] = [[VAL_95]], [[VAL_107:%.*]] = [[VAL_93]]) {
+// CHECK:             [[VAL_108:%.*]] = addi [[VAL_97]], [[VAL_84]] : index
+// CHECK:             loop.for [[VAL_109:%.*]] = [[VAL_85]] to [[VAL_87]] step [[VAL_92]] {
+// CHECK:               [[VAL_110:%.*]] = addi [[VAL_103]], [[VAL_90]] : index
+// CHECK:               loop.for [[VAL_111:%.*]] = [[VAL_90]] to [[VAL_92]] step [[VAL_91]] {
+// CHECK:                 [[VAL_112:%.*]] = addi [[VAL_108]], [[VAL_110]] : index
+// CHECK:                 [[VAL_113:%.*]] = addi [[VAL_109]], [[VAL_111]] : index
+// CHECK:                 [[VAL_114:%.*]] = load [[VAL_88]]{{\[}}[[VAL_112]], [[VAL_113]]] : memref<?x?xf32>
+// CHECK:                 store [[VAL_114]], [[VAL_89]]{{\[}}[[VAL_113]], [[VAL_112]]] : memref<?x?xf32>
+// CHECK:               }
+// CHECK:             }
+// CHECK:             gpu.terminator
+// CHECK:           }
+// CHECK:           return
+// CHECK:         }
+