diff --git a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
--- a/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/GPU/CMakeLists.txt
@@ -1 +1,11 @@
 add_mlir_dialect(GPUOps GPUOps)
+
+set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td)
+mlir_tablegen(ParallelLoopMapperAttr.h.inc -gen-struct-attr-decls)
+mlir_tablegen(ParallelLoopMapperAttr.cpp.inc -gen-struct-attr-defs)
+add_public_tablegen_target(MLIRParallelLoopMapperAttrGen)
+
+set(LLVM_TARGET_DEFINITIONS ParallelLoopMapperAttr.td)
+mlir_tablegen(ParallelLoopMapperEnums.h.inc -gen-enum-decls)
+mlir_tablegen(ParallelLoopMapperEnums.cpp.inc -gen-enum-defs)
+add_public_tablegen_target(MLIRParallelLoopMapperEnumsGen)
diff --git a/mlir/include/mlir/Dialect/GPU/GPUBase.td b/mlir/include/mlir/Dialect/GPU/GPUBase.td
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/GPUBase.td
@@ -0,0 +1,26 @@
+//===-- GPUBase.td - GPU dialect definitions ---------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the GPU dialect
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GPU_BASE
+#define GPU_BASE
+
+include "mlir/IR/OpBase.td"
+
+//===----------------------------------------------------------------------===//
+// GPU Dialect.
+//===----------------------------------------------------------------------===//
+
+def GPU_Dialect : Dialect {
+  let name = "gpu";
+}
+
+#endif // GPU_BASE
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -13,6 +13,7 @@
 #ifndef GPU_OPS
 #define GPU_OPS
 
+include "mlir/Dialect/GPU/GPUBase.td"
 include "mlir/Dialect/LLVMIR/LLVMOpBase.td"
 include "mlir/Interfaces/SideEffects.td"
 
@@ -26,10 +27,6 @@
 // GPU Dialect operations.
 //===----------------------------------------------------------------------===//
 
-def GPU_Dialect : Dialect {
-  let name = "gpu";
-}
-
 class GPU_Op<string mnemonic, list<OpTrait> traits = []> :
     Op<GPU_Dialect, mnemonic, traits>;
 
diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
--- a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
+++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapper.h
@@ -14,28 +14,44 @@
 #ifndef MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
 #define MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
 
+#include "mlir/IR/Attributes.h"
+#include "mlir/Support/LLVM.h"
+#include "llvm/ADT/DenseMap.h"
+
+#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.h.inc"
+
 namespace mlir {
 
+class AffineMap;
+struct LogicalResult;
+class Operation;
 class Region;
 
+#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.h.inc"
+
 namespace gpu {
 
 /// Name of the mapping attribute produced by loop mappers.
-static constexpr const char *kMappingAttributeName = "mapping";
-/// Name of the processor sub-attribute that identifies the hardware id
-/// to map a loop to.
-static constexpr const char *kProcessorEntryName = "processor";
-/// Name of the map sub-attribute that identifies the affine map to apply
-/// to the hardware id to compute the iteration number of the loop. This
-/// map is expected to be extended by step and lower bound computations:
-///   index = map(hardware_id) * step + lowerbound
-static constexpr const char *kIndexMapEntryName = "map";
-/// Name of the bound sub-attribute that itendities the affine map to
-/// compute an upper bound of iterations for the hardware id. This is
-/// applied to an upper bound on the number of iterations:
-///   launchBound = bound(upperbound-lowerbound ceildiv step)
-static constexpr const char *kBoundMapEntryName = "bound";
+StringRef getMappingAttrName();
 
+/// Returns the `processor` field of `attr` as a Processor enum value.
+inline Processor getProcessor(ParallelLoopDimMapper attr) {
+  return static_cast<Processor>(attr.processor().getInt());
+}
+
+/// Creates a ParallelLoopDimMapper from `processor`, `map` and `bound`.
+/// TODO(ravishankarm/antiagainst): Replace its uses with an auto-gened method.
+ParallelLoopDimMapper getParallelLoopDimMapperAttr(Processor processor,
+                                                   AffineMap map,
+                                                   AffineMap bound);
+
+/// Sets `mapping` as the attribute named getMappingAttrName() on `op`, which
+/// is expected to be a loop.parallel operation. The implementation is meant
+/// to fail when several loops are mapped to the same processor.
+/// NOTE(review): the definition in ParallelLoopMapper.cpp does not compare
+/// the number of entries in `mapping` with the number of loops of `op`.
+LogicalResult setMappingAttr(Operation *op,
+                             ArrayRef<ParallelLoopDimMapper> mapping);
 } // end namespace gpu
 
 /// Maps the parallel loops found in the given function to workgroups. The first
@@ -46,5 +62,4 @@
 void greedilyMapParallelLoopsToGPU(Region &region);
 
 } // end namespace mlir
-
 #endif // MLIR_DIALECT_GPU_PARALLELLOOPMAPPER_H
diff --git a/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Dialect/GPU/ParallelLoopMapperAttr.td
@@ -0,0 +1,51 @@
+//===-- ParallelLoopMapperAttr.td - Attribute definition ---*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the attribute used for driving conversion from loop.parallel to
+// gpu.launch operations
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PARALLEL_LOOP_MAPPER_ATTR
+#define PARALLEL_LOOP_MAPPER_ATTR
+
+include "mlir/Dialect/AffineOps/AffineOpsBase.td"
+include "mlir/Dialect/GPU/GPUBase.td"
+
+def BLOCKX : I64EnumAttrCase<"BLOCKX", 0>;
+def BLOCKY : I64EnumAttrCase<"BLOCKY", 1>;
+def BLOCKZ : I64EnumAttrCase<"BLOCKZ", 2>;
+def THREADX : I64EnumAttrCase<"THREADX", 3>;
+def THREADY : I64EnumAttrCase<"THREADY", 4>;
+def THREADZ : I64EnumAttrCase<"THREADZ", 5>;
+def SEQUENTIAL : I64EnumAttrCase<"SEQUENTIAL", 6>;
+
+def ProcessorAttr : I64EnumAttr<"Processor", "processor for loop mapping", [
+    BLOCKX, BLOCKY, BLOCKZ, THREADX, THREADY, THREADZ, SEQUENTIAL]> {
+  let cppNamespace = "::mlir::gpu";
+}
+
+// Attribute that drives conversion of a loop.parallel to gpu.launch
+// operation.
+// processor: the hardware id to map to.
+// map : An affine map that is used to pre-process hardware ids before
+//       substitution.
+// bound : An affine map that is used to compute the bound of the hardware
+//         id based on an upper bound of the number of iterations.
+def ParallelLoopDimMapperAttr :
+    StructAttr<"ParallelLoopDimMapper", GPU_Dialect,
+               [StructFieldAttr<"processor", ProcessorAttr>,
+                StructFieldAttr<"map", AffineMapAttr>,
+                StructFieldAttr<"bound", AffineMapAttr>]>;
+
+
+def ParallelLoopMapperAttr :
+    TypedArrayAttrBase<ParallelLoopDimMapperAttr,
+                       "parallel loop to processor mapping attribute">;
+
+#endif // PARALLEL_LOOP_MAPPER_ATTR
diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -500,35 +500,8 @@
   PatternMatchResult matchAndRewrite(ParallelOp parallelOp,
                                      PatternRewriter &rewriter) const override;
 };
-
-struct MappingAnnotation {
-  unsigned processor;
-  AffineMap indexMap;
-  AffineMap boundMap;
-};
-
 } // namespace
 
-/// Extracts the mapping annotations from the provided attribute. The attribute
-/// is expected to be of the form
-/// { processor = <unsigned>, map = <AffineMap>, bound = <AffineMap> }
-/// where the bound is optional.
-static MappingAnnotation extractMappingAnnotation(Attribute attribute) {
-  DictionaryAttr dict = attribute.cast<DictionaryAttr>();
-  unsigned processor = dict.get(gpu::kProcessorEntryName)
-                           .cast<IntegerAttr>()
-                           .getValue()
-                           .getSExtValue();
-  AffineMap map =
-      dict.get(gpu::kIndexMapEntryName).cast<AffineMapAttr>().getValue();
-  AffineMapAttr boundAttr =
-      dict.get(gpu::kBoundMapEntryName).dyn_cast_or_null<AffineMapAttr>();
-  AffineMap bound;
-  if (boundAttr)
-    bound = boundAttr.getValue();
-  return {processor, map, bound};
-}
-
 /// Tries to derive a static upper bound from the defining operation of
 /// `upperBound`.
 static Value deriveStaticUpperBound(Value upperBound,
@@ -546,6 +519,30 @@
   return {};
 }
 
+/// Returns true when `processor` is anything other than SEQUENTIAL.
+static bool isMappedToProcessor(gpu::Processor processor) {
+  return !(processor == gpu::Processor::SEQUENTIAL);
+}
+
+/// Translates `processor` into the number used to index both the block
+/// arguments of the `gpu.launch` body and the launch operands. Must not be
+/// called with SEQUENTIAL.
+static unsigned getLaunchOpArgumentNum(gpu::Processor processor) {
+  static constexpr gpu::Processor kLaunchOrder[] = {
+      gpu::Processor::BLOCKX,  gpu::Processor::BLOCKY,
+      gpu::Processor::BLOCKZ,  gpu::Processor::THREADX,
+      gpu::Processor::THREADY, gpu::Processor::THREADZ};
+  unsigned argumentNum = 0;
+  for (gpu::Processor candidate : kLaunchOrder) {
+    if (candidate == processor)
+      return argumentNum;
+    ++argumentNum;
+  }
+  // No entry matched: SEQUENTIAL or a value outside the enum.
+  llvm_unreachable(
+      "invalid processor type while retrieving launch op argument number");
+}
+
 /// Modifies the current transformation state to capture the effect of the given
 /// `loop.parallel` operation on index substitutions and the operations to be
 /// inserted.
@@ -568,16 +565,14 @@
 /// inserted, a sentinel (the `gpu.launch` operation) is inserted into the
 /// worklist. This signals the processor of the worklist to pop the rewriter
 /// one scope-level up.
-static LogicalResult processParallelLoop(ParallelOp parallelOp,
-                                         gpu::LaunchOp launchOp,
-                                         BlockAndValueMapping &cloningMap,
-                                         SmallVectorImpl<Operation *> &worklist,
-                                         DenseMap<int, Value> &bounds,
-                                         PatternRewriter &rewriter) {
+static LogicalResult processParallelLoop(
+    ParallelOp parallelOp, gpu::LaunchOp launchOp,
+    BlockAndValueMapping &cloningMap, SmallVectorImpl<Operation *> &worklist,
+    DenseMap<gpu::Processor, Value> &bounds, PatternRewriter &rewriter) {
   // TODO(herhut): Verify that this is a valid GPU mapping.
   // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
   ArrayAttr mapping =
-      parallelOp.getAttrOfType<ArrayAttr>(gpu::kMappingAttributeName);
+      parallelOp.getAttrOfType<ArrayAttr>(gpu::getMappingAttrName());
 
   // TODO(herhut): Support reductions.
   if (!mapping || parallelOp.getNumResults() != 0)
@@ -604,12 +599,17 @@
     Attribute mappingAttribute;
     Value iv, lowerBound, upperBound, step;
     std::tie(mappingAttribute, iv, lowerBound, upperBound, step) = config;
-    MappingAnnotation annotation = extractMappingAnnotation(mappingAttribute);
+    auto annotation = mappingAttribute.dyn_cast<gpu::ParallelLoopDimMapper>();
+    if (!annotation)
+      return parallelOp.emitOpError()
+             << "expected mapping attribute for lowering to GPU";
     Value newIndex;
+    gpu::Processor processor = gpu::getProcessor(annotation);
 
-    if (annotation.processor < gpu::LaunchOp::kNumConfigOperands) {
+    if (isMappedToProcessor(processor)) {
       // Use the corresponding thread/grid index as replacement for the loop iv.
-      Value operand = launchOp.body().front().getArgument(annotation.processor);
+      Value operand = launchOp.body().front().getArgument(
+          getLaunchOpArgumentNum(processor));
       // Take the indexmap and add the lower bound and step computations in.
       // This computes operand * step + lowerBound.
       // Use an affine map here so that it composes nicely with the provided
@@ -619,11 +619,11 @@
           rewriter.getAffineDimExpr(0) * rewriter.getAffineSymbolExpr(0) +
               rewriter.getAffineSymbolExpr(1));
       newIndex = rewriter.create<AffineApplyOp>(
-          loc, annotation.indexMap.compose(lowerAndStep),
+          loc, annotation.map().getValue().compose(lowerAndStep),
           ValueRange{operand, step, lowerBound});
       // If there was also a bound, insert that, too.
       // TODO(herhut): Check that we do not assign bounds twice.
-      if (annotation.boundMap) {
+      if (annotation.bound().getValue()) {
         // We pass as the single opererand to the bound-map the number of
         // iterations, which is (upperBound - lowerBound) ceilDiv step. To
         // support inner loops with dynamic upper bounds (as generated by e.g.
@@ -663,19 +663,19 @@
                                rewriter.getAffineSymbolExpr(1))
                                   .ceilDiv(rewriter.getAffineSymbolExpr(2))));
           Value launchBound = rewriter.create<AffineApplyOp>(
-              loc, annotation.boundMap.compose(stepMap),
+              loc, annotation.bound().getValue().compose(stepMap),
               ValueRange{
                   ensureLaunchIndependent(
                       cloningMap.lookupOrDefault(upperBound)),
                   ensureLaunchIndependent(
                       cloningMap.lookupOrDefault(lowerBound)),
                   ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
-          if (bounds.find(annotation.processor) != bounds.end()) {
+          if (bounds.find(processor) != bounds.end()) {
             return parallelOp.emitOpError()
                    << "cannot redefine the bound for processor "
-                   << annotation.processor;
+                   << stringifyProcessor(processor);
           }
-          bounds[annotation.processor] = launchBound;
+          bounds[processor] = launchBound;
         }
         if (!boundIsPrecise) {
           // We are using an approximation, create a surrounding conditional.
@@ -757,7 +757,7 @@
   rewriter.setInsertionPointToStart(&launchOp.body().front());
 
   BlockAndValueMapping cloningMap;
-  llvm::DenseMap<int, Value> launchBounds;
+  llvm::DenseMap<gpu::Processor, Value> launchBounds;
   SmallVector<Operation *, 16> worklist;
   if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
                                  launchBounds, rewriter)))
@@ -809,7 +809,8 @@
   // Now that we succeeded creating the launch operation, also update the
   // bounds.
   for (auto bound : launchBounds)
-    launchOp.setOperand(std::get<0>(bound), std::get<1>(bound));
+    launchOp.setOperand(getLaunchOpArgumentNum(std::get<0>(bound)),
+                        std::get<1>(bound));
 
   rewriter.eraseOp(parallelOp);
   return matchSuccess();
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -10,6 +10,8 @@
 
   DEPENDS
   MLIRGPUOpsIncGen
+  MLIRParallelLoopMapperAttrGen
+  MLIRParallelLoopMapperEnumsGen
   )
 target_link_libraries(MLIRGPU
   PUBLIC
diff --git a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/ParallelLoopMapper.cpp
@@ -23,6 +23,43 @@
 using namespace mlir::gpu;
 using namespace mlir::loop;
 
+#include "mlir/Dialect/GPU/ParallelLoopMapperEnums.cpp.inc"
+namespace mlir {
+
+#include "mlir/Dialect/GPU/ParallelLoopMapperAttr.cpp.inc"
+namespace gpu {
+
+StringRef getMappingAttrName() { return "mapping"; }
+
+ParallelLoopDimMapper getParallelLoopDimMapperAttr(Processor processor,
+                                                   AffineMap map,
+                                                   AffineMap bound) {
+  // The `processor` field is an i64 attribute, so widen the enum to int64_t.
+  OpBuilder builder(map.getContext());
+  return ParallelLoopDimMapper::get(
+      builder.getI64IntegerAttr(static_cast<int64_t>(processor)),
+      AffineMapAttr::get(map), AffineMapAttr::get(bound), map.getContext());
+}
+
+LogicalResult setMappingAttr(Operation *op,
+                             ArrayRef<ParallelLoopDimMapper> mapping) {
+  // Verify that each hardware processor is the target of at most one loop;
+  // SEQUENTIAL may repeat. `insert(...).second` is false for a repeated key.
+  llvm::DenseSet<gpu::Processor> specifiedMappings;
+  for (auto dimAttr : mapping) {
+    gpu::Processor processor = getProcessor(dimAttr);
+    if (processor != gpu::Processor::SEQUENTIAL &&
+        !specifiedMappings.insert(processor).second)
+      return op->emitError("invalid mapping multiple loops to same processor");
+  }
+  ArrayRef<Attribute> mappingAsAttrs(mapping.data(), mapping.size());
+  op->setAttr(getMappingAttrName(),
+              ArrayAttr::get(mappingAsAttrs, op->getContext()));
+  return success();
+}
+} // namespace gpu
+} // namespace mlir
+
 namespace {
 
 enum MappingLevel { MapGrid = 0, MapBlock = 1, Sequential = 2 };
@@ -43,10 +80,41 @@
 /// Computed the hardware id to use for a given mapping level. Will
 /// assign x,y and z hardware ids for the first 3 dimensions and use
 /// sequential after.
-static int64_t getHardwareIdForMapping(MappingLevel level, int dimension) {
+/// TODO(ravishankarm/herhut) : Make this use x for the inner-most loop that is
+/// distributed to map to x, the next innermost to y and the next innermost to
+/// z.
+static gpu::Processor getHardwareIdForMapping(MappingLevel level,
+                                              int dimension) {
+  // Dimensions >= kNumHardwareIds and the Sequential level stay sequential.
   if (dimension >= kNumHardwareIds || level == Sequential)
-    return Sequential * kNumHardwareIds;
-  return (level * kNumHardwareIds) + dimension;
+    return Processor::SEQUENTIAL;
+  switch (level) {
+  case MapGrid:
+    switch (dimension) {
+    case 0:
+      return Processor::BLOCKX;
+    case 1:
+      return Processor::BLOCKY;
+    case 2:
+      return Processor::BLOCKZ;
+    default:
+      return Processor::SEQUENTIAL;
+    }
+    break; // Not reached: every case of the inner switch returns.
+  case MapBlock:
+    switch (dimension) {
+    case 0:
+      return Processor::THREADX;
+    case 1:
+      return Processor::THREADY;
+    case 2:
+      return Processor::THREADZ;
+    default:
+      return Processor::SEQUENTIAL;
+    }
+  default:;
+  }
+  return Processor::SEQUENTIAL;
 }
 
 /// Add mapping information to the given parallel loop. Do not add
@@ -55,26 +123,20 @@
 static void mapParallelOp(ParallelOp parallelOp,
                           MappingLevel mappingLevel = MapGrid) {
   // Do not try to add a mapping to already mapped loops or nested loops.
-  if (parallelOp.getAttr(gpu::kMappingAttributeName) ||
+  if (parallelOp.getAttr(getMappingAttrName()) ||
       ((mappingLevel == MapGrid) && parallelOp.getParentOfType<ParallelOp>()))
     return;
 
   MLIRContext *ctx = parallelOp.getContext();
   Builder b(ctx);
-  SmallVector<Attribute, 4> attrs;
+  SmallVector<ParallelLoopDimMapper, 4> attrs;
   attrs.reserve(parallelOp.getNumInductionVars());
   for (int i = 0, e = parallelOp.getNumInductionVars(); i < e; ++i) {
-    SmallVector<NamedAttribute, 3> entries;
-    entries.emplace_back(b.getNamedAttr(
-        kProcessorEntryName,
-        b.getI64IntegerAttr(getHardwareIdForMapping(mappingLevel, i))));
-    entries.emplace_back(b.getNamedAttr(
-        kIndexMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
-    entries.emplace_back(b.getNamedAttr(
-        kBoundMapEntryName, AffineMapAttr::get(b.getDimIdentityMap())));
-    attrs.push_back(DictionaryAttr::get(entries, ctx));
+    attrs.push_back(getParallelLoopDimMapperAttr(
+        getHardwareIdForMapping(mappingLevel, i), b.getDimIdentityMap(),
+        b.getDimIdentityMap()));
   }
-  parallelOp.setAttr(kMappingAttributeName, ArrayAttr::get(attrs, ctx));
+  setMappingAttr(parallelOp, attrs);
   ++mappingLevel;
   // Parallel loop operations are immediately nested, so do not use
   // walk but just iterate over the operations.
diff --git a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
--- a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
@@ -3,7 +3,7 @@
 // 2-d parallel loop mapped to block.y and block.x
 
 func @parallel_loop_bidy_bidx(%arg0 : index, %arg1 : index, %arg2 : index,
-                              %arg3 : index, %arg4 : index, 
+                              %arg3 : index, %arg4 : index,
                               %buf : memref<?x?xf32>,
                               %res : memref<?x?xf32>) {
   %step = constant 2 : index
@@ -309,7 +309,7 @@
                           %buf : memref<?x?xf32>,
                           %res : memref<?x?xf32>) {
   %four = constant 4 : index
-  // expected-error@+2 {{cannot redefine the bound for processor 1}}
+  // expected-error@+2 {{cannot redefine the bound for processor BLOCKY}}
   // expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
   loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                                           step (%four, %four)  {
@@ -334,7 +334,7 @@
   // expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
   loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                                           step (%four, %four)  {
-    // expected-error@+1 {{cannot derive loop-invariant upper bound}}                                        
+    // expected-error@+1 {{cannot derive loop-invariant upper bound}}
     loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
                                             step (%one, %one)  {
       %idx0 = addi %i0, %si0 : index