diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -526,6 +526,22 @@ let hasCanonicalizer = 1; } +def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>, + Arguments<(ins StrAttr:$format, + Variadic>:$args)> { + let summary = "Device-side printf, as in CUDA or OpenCL, for debugging"; + let description = [{ + `gpu.printf` takes a literal format string `format` and an arbitrary number of + scalar arguments that should be printed. + + The format string is a C-style printf string, subject to any restrictions + imposed by one's target platform. + }]; + let assemblyFormat = [{ + attr-dict ($args^ `:` type($args))? + }]; +} + def GPU_ReturnOp : GPU_Op<"return", [HasParent<"GPUFuncOp">, NoSideEffect, Terminator]>, Arguments<(ins Variadic:$operands)>, Results<(outs)> { diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h @@ -33,6 +33,21 @@ Identifier kernelAttributeName; }; +/// The lowering of gpu.printf to a call to an external printf() function +/// +/// This pass will add a declaration of printf() to the GPUModule if needed +/// and separate out the format strings into global constants. For some +/// runtimes, such as the AMD GPU, this is sufficient setup for printf(), as the +/// LLVM backend inserts the needed support code automatically. 
+struct GPUPrintfOpToLLVMCallLowering + : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(gpu::PrintfOp gpuPrintfOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + struct GPUReturnOpLowering : public ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; @@ -43,7 +58,6 @@ return success(); } }; - } // namespace mlir #endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_ diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -144,3 +144,68 @@ rewriter.eraseOp(gpuFuncOp); return success(); } + +LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite( + gpu::PrintfOp gpuPrintfOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + Location loc = gpuPrintfOp->getLoc(); + + mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8)); + mlir::Type i8Ptr = LLVM::LLVMPointerType::get(llvmI8); + mlir::Type llvmIndex = typeConverter->convertType(rewriter.getIndexType()); + + // Note: this is the GPUModule op, not the ModuleOp that surrounds it + // This ensures that global constants and declarations are placed within + // the device code, not the host code + auto moduleOp = gpuPrintfOp->getParentOfType(); + + LLVM::LLVMFuncOp printfDecl; + // Declare printf if it doesn't exist + if (!(printfDecl = moduleOp.lookupSymbol("printf"))) { + auto printfType = LLVM::LLVMFunctionType::get(rewriter.getI32Type(), + {i8Ptr}, /*isVarArg=*/true); + ConversionPatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(moduleOp.getBody()); + printfDecl = rewriter.create(loc, "printf", printfType, + LLVM::Linkage::External); + } + + // Create a global constant for the format string + unsigned stringNumber = 0; + std::string 
stringConstName; + do { + stringConstName = llvm::formatv("printfFormat_{0}", stringNumber++); + } while (moduleOp.lookupSymbol(stringConstName)); + + llvm::SmallString<20> formatString(adaptor.format().getValue()); + formatString.push_back('\0'); // Null terminate for C + auto globalType = + LLVM::LLVMArrayType::get(llvmI8, formatString.size_in_bytes()); + LLVM::GlobalOp global; + { + ConversionPatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPointToStart(moduleOp.getBody()); + global = rewriter.create( + loc, globalType, + /*isConstant=*/true, LLVM::Linkage::Internal, stringConstName, + rewriter.getStringAttr(formatString)); + } + + // Get a pointer to the format string's first element + Value globalPtr = rewriter.create(loc, global); + Value zero = rewriter.create( + loc, llvmIndex, rewriter.getIntegerAttr(llvmIndex, 0)); + Value stringStart = rewriter.create( + loc, i8Ptr, globalPtr, mlir::ValueRange({zero, zero})); + + // Construct arguments and function call + auto &&argsRange = adaptor.args(); + SmallVector printfArgs; + printfArgs.reserve(argsRange.size() + 1); + printfArgs.push_back(stringStart); + printfArgs.append(argsRange.begin(), argsRange.end()); + + rewriter.create(loc, printfDecl, printfArgs); + rewriter.eraseOp(gpuPrintfOp); + return success(); +} diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -111,7 +111,7 @@ ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>, GPUIndexIntrinsicOpLowering, - GPUReturnOpLowering>(converter); + GPUPrintfOpToLLVMCallLowering, GPUReturnOpLowering>(converter); patterns.add( converter, /*allocaAddrSpace=*/5, Identifier::get(ROCDL::ROCDLDialect::getKernelFuncAttrName(), diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir --- 
a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -386,3 +386,19 @@ gpu.return } } + +// ----- + +gpu.module @test_module { + // CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello, World\0A\00") + // CHECK: llvm.func @printf(!llvm.ptr, ...) -> i32 + // CHECK-LABEL: func @test_printf + gpu.func @test_printf() { + // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr> + // CHECK-NEXT: %[[IMM1:.*]] = llvm.mlir.constant(0 : i64) : i64 + // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][%[[IMM1]], %[[IMM1]]] : (!llvm.ptr>, i64, i64) -> !llvm.ptr + // CHECK-NEXT: %{{.*}} = llvm.call @printf(%[[IMM2]]) : (!llvm.ptr) -> i32 + gpu.printf { format = "Hello, World\n" } + gpu.return + } +} diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -109,6 +109,14 @@ gpu.return } + // CHECK-LABEL: gpu.func @printf_test + // CHECK: (%[[ARG0:.*]]: i32) + // CHECK: gpu.printf {format = "Value: %d"} %[[ARG0]] : i32 + gpu.func @printf_test(%arg0 : i32) { + gpu.printf {format = "Value: %d"} %arg0 : i32 + gpu.return + } + // CHECK-LABEL: @no_attribution_attrs // CHECK: attributes // CHECK: {