diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
--- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
+++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp
@@ -572,6 +572,7 @@
                                          gpu::LaunchOp launchOp,
                                          BlockAndValueMapping &cloningMap,
                                          SmallVectorImpl<Operation *> &worklist,
+                                         DenseMap<int, Value> &bounds,
                                          PatternRewriter &rewriter) {
   // TODO(herhut): Verify that this is a valid GPU mapping.
   // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential
@@ -631,22 +632,27 @@
         // conditional. If the lower-bound is constant or defined before the
         // launch, we can use it in the launch bounds. Otherwise fail.
         if (!launchIndependent(lowerBound) &&
-            !isa<ConstantOp>(lowerBound.getDefiningOp()))
+            !isa_and_nonnull<ConstantOp>(lowerBound.getDefiningOp()))
           return failure();
         // The step must also be constant or defined outside of the loop nest.
-        if (!launchIndependent(step) && !isa<ConstantOp>(step.getDefiningOp()))
+        if (!launchIndependent(step) &&
+            !isa_and_nonnull<ConstantOp>(step.getDefiningOp()))
           return failure();
         // If the upper-bound is constant or defined before the launch, we can
         // use it in the launch bounds directly. Otherwise try derive a bound.
-        bool boundIsPrecise = launchIndependent(upperBound) ||
-                              isa<ConstantOp>(upperBound.getDefiningOp());
+        bool boundIsPrecise =
+            launchIndependent(upperBound) ||
+            isa_and_nonnull<ConstantOp>(upperBound.getDefiningOp());
         {
           PatternRewriter::InsertionGuard guard(rewriter);
           rewriter.setInsertionPoint(launchOp);
           if (!boundIsPrecise) {
             upperBound = deriveStaticUpperBound(upperBound, rewriter);
-            if (!upperBound)
-              return failure();
+            if (!upperBound) {
+              return parallelOp.emitOpError()
+                     << "cannot derive loop-invariant upper bound for number "
+                        "of iterations";
+            }
           }
           // Compute the number of iterations needed. We compute this as an
           // affine expression ceilDiv (upperBound - lowerBound) step. We use
@@ -654,8 +660,8 @@
           AffineMap stepMap =
               AffineMap::get(0, 3,
                              ((rewriter.getAffineSymbolExpr(0) -
-                              rewriter.getAffineSymbolExpr(1)).ceilDiv(
-                                  rewriter.getAffineSymbolExpr(2))));
+                               rewriter.getAffineSymbolExpr(1))
+                                  .ceilDiv(rewriter.getAffineSymbolExpr(2))));
           Value launchBound = rewriter.create<AffineApplyOp>(
               loc, annotation.boundMap.compose(stepMap),
               ValueRange{
@@ -664,7 +670,12 @@
                   ensureLaunchIndependent(
                       cloningMap.lookupOrDefault(lowerBound)),
                   ensureLaunchIndependent(cloningMap.lookupOrDefault(step))});
-          launchOp.setOperand(annotation.processor, launchBound);
+          if (bounds.find(annotation.processor) != bounds.end()) {
+            return parallelOp.emitOpError()
+                   << "cannot redefine the bound for processor "
+                   << annotation.processor;
+          }
+          bounds[annotation.processor] = launchBound;
         }
         if (!boundIsPrecise) {
           // We are using an approximation, create a surrounding conditional.
@@ -746,9 +757,10 @@
   rewriter.setInsertionPointToStart(&launchOp.body().front());
 
   BlockAndValueMapping cloningMap;
+  llvm::DenseMap<int, Value> launchBounds;
   SmallVector<Operation *, 16> worklist;
   if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist,
-                                 rewriter)))
+                                 launchBounds, rewriter)))
     return matchFailure();
 
   // Whether we have seen any side-effects. Reset when leaving an inner scope.
@@ -770,8 +782,9 @@
       // A nested loop.parallel needs insertion of code to compute indices.
       // Insert that now. This will also update the worklist with the loops
       // body.
-      processParallelLoop(nestedParallel, launchOp, cloningMap, worklist,
-                          rewriter);
+      if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap,
+                                     worklist, launchBounds, rewriter)))
+        return matchFailure();
     } else if (op == launchOp.getOperation()) {
       // Found our sentinel value. We have finished the operations from one
       // nesting level, pop one level back up.
@@ -791,6 +804,11 @@
     }
   }
 
+  // The whole loop nest was converted successfully, so it is now safe to set
+  // the collected launch bounds on the launch operation.
+  for (auto bound : launchBounds)
+    launchOp.setOperand(std::get<0>(bound), std::get<1>(bound));
+
   rewriter.eraseOp(parallelOp);
   return matchSuccess();
 }
diff --git a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
--- a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file %s | FileCheck %s -dump-input-on-failure
+// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file -verify-diagnostics %s | FileCheck %s -dump-input-on-failure
 
 // 2-d parallel loop mapped to block.y and block.x
 
@@ -299,3 +299,55 @@
 // CHECK:           return
 // CHECK:         }
 // CHECK:       }
+
+// -----
+
+// Mapping to the same processor twice.
+
+func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
+                          %arg3 : index,
+                          %buf : memref<?x?xf32>,
+                          %res : memref<?x?xf32>) {
+  %four = constant 4 : index
+  // expected-error@+2 {{cannot redefine the bound for processor 1}}
+  // expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                          step (%four, %four)  {
+  } { mapping = [
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+    ] }
+  return
+}
+
+// -----
+
+// Loop with loop-variant upper bound.
+
+func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index,
+                                       %arg3 : index,
+                                       %buf : memref<?x?xf32>,
+                                       %res : memref<?x?xf32>) {
+  %zero = constant 0 : index
+  %one = constant 1 : index
+  %four = constant 4 : index
+  // expected-error@+1 {{failed to legalize operation 'loop.parallel'}}
+  loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                          step (%four, %four)  {
+    // expected-error@+1 {{cannot derive loop-invariant upper bound}}
+    loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
+                                            step (%one, %one)  {
+      %idx0 = addi %i0, %si0 : index
+      %idx1 = addi %i1, %si1 : index
+      %val = load %buf[%idx0, %idx1] : memref<?x?xf32>
+      store %val, %res[%idx1, %idx0] : memref<?x?xf32>
+    } { mapping = [
+        {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+        {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+      ] }
+  } { mapping = [
+      {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>},
+      {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}
+    ] }
+  return
+}