diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -39,17 +39,25 @@
     const ArrayRef<DeviceMappingAttrInterface> &threadMappingAttributes,
     const Optional<ArrayAttr> &foreachMapping,
     llvm::Optional<TransformOpInterface> transformOp) {
+  DiagnosedSilenceableFailure diag = DiagnosedSilenceableFailure::success();
   if (!foreachMapping.has_value())
     return transformOp->emitSilenceableError() << "mapping must be present";
-
+  DenseSet<DeviceMappingAttrInterface> seen;
   if (llvm::any_of(foreachMapping->getValue(),
                    [&](DeviceMappingAttrInterface map) {
+                     if (seen.find(map) != seen.end())
+                       diag = transformOp->emitDefiniteFailure()
+                              << map
+                              << " is duplicated, cannot map different "
+                                 "loops to the same processor";
+
+                     seen.insert(map);
                      return llvm::find(threadMappingAttributes, map) ==
                             threadMappingAttributes.end();
                    }))
-    return transformOp->emitDefiniteFailure()
+    diag = transformOp->emitDefiniteFailure()
            << "mapping must be one of " << threadMappingAttributes;
-  return DiagnosedSilenceableFailure::success();
+  return diag;
 }
 
 /// Determines if the size of the kernel configuration is supported by the GPU
diff --git a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
--- a/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
+++ b/mlir/test/Dialect/GPU/transform-gpu-failing.mlir
@@ -121,31 +121,6 @@
 
 // -----
 
-func.func @map_nested_foreach_to_threads_4d_loop(%x: memref<2x32x32x32xf32>, %y: memref<2x32x32x32xf32>, %stream : !gpu.async.token) -> memref<2x32x32x32xf32> {
-  %one = arith.constant 1 : index
-  %c2 = arith.constant 1 : index
-  %c32 = arith.constant 32 : index
-  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
-            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
-  {
-    scf.foreach_thread (%i, %j, %k, %l) in (%c2, %c32,%c32,%c32) {
-        %4 = memref.load %x[%i, %j, %k, %l] : memref<2x32x32x32xf32>        
-        memref.store %4, %y[%i, %j, %k, %l] : memref<2x32x32x32xf32>
-     }  { mapping = [#gpu.thread<y>, #gpu.thread<x>, #gpu.thread<z>, #gpu.thread<z>] }
-    gpu.terminator
-  }
-  return %y : memref<2x32x32x32xf32>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg0: !pdl.operation):
-  %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
-  // expected-error @below {{scf.foreach_thread with rank > 3 does not lower to gpu.thread_id}}
-  transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [128, 4, 1] }
-}
-
-// -----
-
 func.func @map_nested_foreach_to_threads_not_buffer(%x: tensor<32x32xf32>, %y: tensor<32x32xf32>, %z: tensor<32x32xf32>, %stream : !gpu.async.token) {
   %one = arith.constant 1 : index
   %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
@@ -273,3 +248,29 @@
 }
 
 // -----
+
+// Negative test: both loop dimensions are mapped to #gpu.thread<x>, so the duplicated mapping must be diagnosed.
+!type = memref<32x32xf32>
+func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type {
+  %c32 = arith.constant 32 : index
+  %one = arith.constant 1 : index
+  %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one)
+            threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one)
+  {
+    scf.foreach_thread (%i, %j) in (%c32, %c32) {
+        %4 = memref.load %x[%i, %j] : !type
+        %5 = memref.load %y[%i, %j] : !type
+        %6 = arith.mulf %4, %5 : f32
+        memref.store %6, %y[%i, %j] : !type
+     }  { mapping = [#gpu.thread<x>, #gpu.thread<x>] }
+    gpu.terminator
+  }
+  return %y : !type
+}
+
+transform.sequence failures(propagate) {
+^bb1(%arg0: !pdl.operation):
+  %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0
+  // expected-error @below {{#gpu.thread<x> is duplicated, cannot map different loops to the same processor}}
+  transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [32, 32]}
+}
\ No newline at end of file