diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp --- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp +++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp @@ -247,10 +247,13 @@ // Step 5. RAUW thread indices to thread ops. for (Value blockIdx : foreachThreadOp.getThreadIndices()) { - for (Operation *user : llvm::make_early_inc_range(blockIdx.getUsers())) { - rewriter.updateRootInPlace(user, [&]() { - user->replaceUsesOfWith(blockIdx, bvm.lookup(blockIdx)); - }); + Value val = bvm.lookup(blockIdx); + SmallVector uses; + for (OpOperand &use : blockIdx.getUses()) + uses.push_back(&use); + for (OpOperand *operand : uses) { + Operation *op = operand->getOwner(); + rewriter.updateRootInPlace(op, [&]() { operand->set(val); }); } } @@ -490,10 +493,13 @@ // Step 6. RAUW thread indices to thread ops. for (Value threadIdx : foreachThreadOp.getThreadIndices()) { - for (Operation *user : llvm::make_early_inc_range(threadIdx.getUsers())) { - rewriter.updateRootInPlace(user, [&]() { - user->replaceUsesOfWith(threadIdx, bvm.lookup(threadIdx)); - }); + Value val = bvm.lookup(threadIdx); + SmallVector uses; + for (OpOperand &use : threadIdx.getUses()) + uses.push_back(&use); + for (OpOperand *operand : uses) { + Operation *op = operand->getOwner(); + rewriter.updateRootInPlace(op, [&]() { operand->set(val); }); } } diff --git a/mlir/test/Dialect/GPU/transform-gpu.mlir b/mlir/test/Dialect/GPU/transform-gpu.mlir --- a/mlir/test/Dialect/GPU/transform-gpu.mlir +++ b/mlir/test/Dialect/GPU/transform-gpu.mlir @@ -162,3 +162,35 @@ %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [12, 9, 1], syncAfterDistribute = false } } + +// ----- + +!type = memref<32x32xf32> +// CHECK-LABEL: func.func @saxpy2d_singleloop( +// CHECK-SAME: %[[ARGX:[0-9a-z]+]]: memref<32x32xf32> +// CHECK-SAME: %[[ARGY:[0-9a-z]+]]: memref<32x32xf32> +func.func @saxpy2d_singleloop(%x: !type, %y: !type, %stream : !gpu.async.token) -> !type { + %c32 = arith.constant 32 : index + %one = arith.constant 1 : index + %name = gpu.launch async[%stream] blocks(%arg3, %arg4, %arg5) in (%arg9 = %one, %arg10 = %one, %arg11 = %one) + threads(%arg6, %arg7, %arg8) in (%arg12 = %one, %arg13 = %one, %arg14 = %one) + { +// CHECK: %[[TIDX:.*]] = gpu.thread_id x +// CHECK: memref.load %[[ARGX]][%[[TIDX]], %[[TIDX]]] +// CHECK: memref.load %[[ARGY]][%[[TIDX]], %[[TIDX]]] + scf.foreach_thread (%i) in (%c32) { + %4 = memref.load %x[%i, %i] : !type + %5 = memref.load %y[%i, %i] : !type + %6 = arith.mulf %4, %5 : f32 + memref.store %6, %y[%i, %i] : !type + } { mapping = [#gpu.thread] } + gpu.terminator + } + return %y : !type +} + +transform.sequence failures(propagate) { +^bb1(%arg0: !pdl.operation): + %funcop = transform.structured.match ops{["gpu.launch"]} in %arg0 + transform.gpu.map_nested_foreach_to_threads %funcop { blockDim = [32]} +}