diff --git a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp --- a/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp +++ b/mlir/lib/Conversion/LoopsToGPU/LoopsToGPU.cpp @@ -572,6 +572,7 @@ gpu::LaunchOp launchOp, BlockAndValueMapping &cloningMap, SmallVectorImpl &worklist, + DenseMap &bounds, PatternRewriter &rewriter) { // TODO(herhut): Verify that this is a valid GPU mapping. // processor ids: 0-2 block [x/y/z], 3-5 -> thread [x/y/z], 6-> sequential @@ -631,22 +632,27 @@ // conditional. If the lower-bound is constant or defined before the // launch, we can use it in the launch bounds. Otherwise fail. if (!launchIndependent(lowerBound) && - !isa(lowerBound.getDefiningOp())) + !isa_and_nonnull(lowerBound.getDefiningOp())) return failure(); // The step must also be constant or defined outside of the loop nest. - if (!launchIndependent(step) && !isa(step.getDefiningOp())) + if (!launchIndependent(step) && + !isa_and_nonnull(step.getDefiningOp())) return failure(); // If the upper-bound is constant or defined before the launch, we can // use it in the launch bounds directly. Otherwise try derive a bound. - bool boundIsPrecise = launchIndependent(upperBound) || - isa(upperBound.getDefiningOp()); + bool boundIsPrecise = + launchIndependent(upperBound) || + isa_and_nonnull(upperBound.getDefiningOp()); { PatternRewriter::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(launchOp); if (!boundIsPrecise) { upperBound = deriveStaticUpperBound(upperBound, rewriter); - if (!upperBound) - return failure(); + if (!upperBound) { + return parallelOp.emitOpError() + << "cannot derive loop-invariant upper bound for number " + "of iterations"; + } } // Compute the number of iterations needed. We compute this as an // affine expression ceilDiv (upperBound - lowerBound) step. We use @@ -654,8 +660,8 @@ AffineMap stepMap = AffineMap::get(0, 3, ((rewriter.getAffineSymbolExpr(0) - - rewriter.getAffineSymbolExpr(1)).ceilDiv( - rewriter.getAffineSymbolExpr(2)))); + rewriter.getAffineSymbolExpr(1)) + .ceilDiv(rewriter.getAffineSymbolExpr(2)))); Value launchBound = rewriter.create( loc, annotation.boundMap.compose(stepMap), ValueRange{ @@ -664,7 +670,12 @@ ensureLaunchIndependent( cloningMap.lookupOrDefault(lowerBound)), ensureLaunchIndependent(cloningMap.lookupOrDefault(step))}); - launchOp.setOperand(annotation.processor, launchBound); + if (bounds.find(annotation.processor) != bounds.end()) { + return parallelOp.emitOpError() + << "cannot redefine the bound for processor " + << annotation.processor; + } + bounds[annotation.processor] = launchBound; } if (!boundIsPrecise) { // We are using an approximation, create a surrounding conditional. @@ -746,9 +757,10 @@ rewriter.setInsertionPointToStart(&launchOp.body().front()); BlockAndValueMapping cloningMap; + llvm::DenseMap launchBounds; SmallVector worklist; if (failed(processParallelLoop(parallelOp, launchOp, cloningMap, worklist, - rewriter))) + launchBounds, rewriter))) return matchFailure(); // Whether we have seen any side-effects. Reset when leaving an inner scope. @@ -770,8 +782,9 @@ // A nested loop.parallel needs insertion of code to compute indices. // Insert that now. This will also update the worklist with the loops // body. - processParallelLoop(nestedParallel, launchOp, cloningMap, worklist, - rewriter); + if (failed(processParallelLoop(nestedParallel, launchOp, cloningMap, + worklist, launchBounds, rewriter))) + return matchFailure(); } else if (op == launchOp.getOperation()) { // Found our sentinel value. We have finished the operations from one // nesting level, pop one level back up. @@ -791,6 +804,11 @@ } } + // Now that we succeeded creating the launch operation, also update the + // bounds. + for (auto bound : launchBounds) + launchOp.setOperand(std::get<0>(bound), std::get<1>(bound)); + rewriter.eraseOp(parallelOp); return matchSuccess(); } diff --git a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir --- a/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir +++ b/mlir/test/Conversion/LoopsToGPU/parallel_loop.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file %s | FileCheck %s -dump-input-on-failure +// RUN: mlir-opt -convert-parallel-loops-to-gpu -split-input-file -verify-diagnostics %s | FileCheck %s -dump-input-on-failure // 2-d parallel loop mapped to block.y and block.x @@ -299,3 +299,55 @@ // CHECK: return // CHECK: } // CHECK: } + +// ----- + +// Mapping to the same processor twice. + +func @parallel_double_map(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, + %buf : memref, + %res : memref) { + %four = constant 4 : index + // expected-error@+2 {{cannot redefine the bound for processor 1}} + // expected-error@+1 {{failed to legalize operation 'loop.parallel'}} + loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%four, %four) { + } { mapping = [ + {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, + {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>} + ] } + return +} + +// ----- + +// Loop with loop-variant upper bound. + +func @parallel_loop_loop_variant_bound(%arg0 : index, %arg1 : index, %arg2 : index, + %arg3 : index, + %buf : memref, + %res : memref) { + %zero = constant 0 : index + %one = constant 1 : index + %four = constant 4 : index + // expected-error@+1 {{failed to legalize operation 'loop.parallel'}} + loop.parallel (%i0, %i1) = (%arg0, %arg1) to (%arg2, %arg3) + step (%four, %four) { + // expected-error@+1 {{cannot derive loop-invariant upper bound}} + loop.parallel (%si0, %si1) = (%zero, %zero) to (%i0, %i1) + step (%one, %one) { + %idx0 = addi %i0, %si0 : index + %idx1 = addi %i1, %si1 : index + %val = load %buf[%idx0, %idx1] : memref + store %val, %res[%idx1, %idx0] : memref + } { mapping = [ + {processor = 4, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, + {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>} + ] } + } { mapping = [ + {processor = 1, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>}, + {processor = 6, map = affine_map<(d0) -> (d0)>, bound = affine_map<(d0) -> (d0)>} + ] } + return +}