diff --git a/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h b/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h
--- a/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h
+++ b/mlir/include/mlir/Conversion/SCFToGPU/SCFToGPU.h
@@ -12,9 +12,10 @@
 
 namespace mlir {
 class AffineForOp;
+class ConversionTarget;
+struct LogicalResult;
 class MLIRContext;
 class OwningRewritePatternList;
-struct LogicalResult;
 class Value;
 
 namespace scf {
@@ -44,6 +45,10 @@
 void populateParallelLoopToGPUPatterns(OwningRewritePatternList &patterns,
                                        MLIRContext *ctx);
 
+/// Configures the rewrite target such that an `scf.parallel` operation is legal
+/// only if it carries no GPU mapping attribute (`gpu::getMappingAttrName()`).
+void configureParallelLoopToGPULegality(ConversionTarget &target);
+
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_SCFTOGPU_SCFTOGPU_H_
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPU.cpp
@@ -458,9 +458,10 @@
           if (!boundIsPrecise) {
             upperBound = deriveStaticUpperBound(upperBound, rewriter);
             if (!upperBound) {
-              return parallelOp.emitOpError()
-                     << "cannot derive loop-invariant upper bound for number "
-                        "of iterations";
+              return rewriter.notifyMatchFailure(
+                  parallelOp,
+                  "cannot derive loop-invariant upper bound for number of "
+                  "iterations");
             }
           }
           // Compute the number of iterations needed. We compute this as an
@@ -481,9 +482,9 @@
           // todo(herhut,ravishankarm): Update the behavior of setMappingAttr
           // when this condition is relaxed.
           if (bounds.find(processor) != bounds.end()) {
-            return parallelOp.emitOpError()
-                   << "cannot redefine the bound for processor "
-                   << static_cast<int64_t>(processor);
+            return rewriter.notifyMatchFailure(
+                parallelOp, "cannot redefine the bound for processor " +
+                                Twine(static_cast<int64_t>(processor)));
           }
           bounds[processor] = launchBound;
         }
@@ -565,6 +566,10 @@
 LogicalResult
 ParallelToGpuLaunchLowering::matchAndRewrite(ParallelOp parallelOp,
                                              PatternRewriter &rewriter) const {
+  // We can only transform starting at the outer-most loop. Launches inside of
+  // parallel loops are not supported.
+  if (auto parentLoop = parallelOp.getParentOfType<ParallelOp>())
+    return failure();
   // Create a launch operation. We start with bound one for all grid/block
   // sizes. Those will be refined later as we discover them from mappings.
   Location loc = parallelOp.getLoc();
@@ -640,3 +645,9 @@
                                              MLIRContext *ctx) {
   patterns.insert<ParallelToGpuLaunchLowering>(ctx);
 }
+
+void mlir::configureParallelLoopToGPULegality(ConversionTarget &target) {
+  target.addDynamicallyLegalOp<scf::ParallelOp>([](scf::ParallelOp loop) {
+    return !loop.getAttr(gpu::getMappingAttrName()); // Legal iff unmapped.
+  });
+}
diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
--- a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
+++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp
@@ -53,7 +53,7 @@
     target.addLegalDialect<AffineDialect>();
     target.addLegalDialect<gpu::GPUDialect>();
     target.addLegalDialect<scf::SCFDialect>();
-    target.addIllegalOp<scf::ParallelOp>();
+    configureParallelLoopToGPULegality(target);
     if (failed(applyPartialConversion(getOperation(), target,
                                       std::move(patterns))))
       signalPassFailure();
diff --git a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
--- a/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
+++ b/mlir/test/Conversion/SCFToGPU/parallel_loop.mlir
@@ -317,15 +317,13 @@
 
 // -----
 
-// Mapping to the same processor twice.
+// Mapping to the same processor twice. Cannot be mapped.
 
 func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index,
                           %arg3 : index,
                           %buf : memref<?x?xf32>,
                           %res : memref<?x?xf32>) {
   %four = constant 4 : index
-  // expected-error@+2 {{cannot redefine the bound for processor 1}}
-  // expected-error@+1 {{failed to legalize operation 'scf.parallel'}}
   scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                                           step (%four, %four)  {
   } { mapping = [
@@ -335,9 +333,12 @@
   return
 }
 
+// CHECK-LABEL: @parallel_double_map
+// CHECK: scf.parallel
+
 // -----
 
-// Loop with loop-variant upper bound.
+// Loop with loop-variant upper bound. Cannot be mapped.
 
 func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index,
                                        %arg3 : index,
@@ -346,10 +347,8 @@
   %zero = constant 0 : index
   %one = constant 1 : index
   %four = constant 4 : index
-  // expected-error@+1 {{failed to legalize operation 'scf.parallel'}}
   scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
                                           step (%four, %four)  {
-    // expected-error@+1 {{cannot derive loop-invariant upper bound}}
     scf.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1)
                                             step (%one, %one)  {
       %idx0 = addi %i0, %si0 : index
@@ -366,3 +365,25 @@
     ] }
   return
 }
+
+// CHECK-LABEL: @parallel_loop_loop_variant_bound
+// CHECK: scf.parallel
+// CHECK: scf.parallel
+
+// -----
+
+// Loop without annotations. Cannot be mapped.
+
+func @parallel_no_annotations(%arg0 : index, %arg1 : index, %arg2 : index,
+                              %arg3 : index,
+                              %buf : memref<?x?xf32>,
+                              %res : memref<?x?xf32>) {
+  %four = constant 4 : index
+  scf.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3)
+                                          step (%four, %four)  {
+  }
+  return
+}
+
+// CHECK-LABEL: @parallel_no_annotations
+// CHECK: scf.parallel