diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
--- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
+++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h
@@ -8,6 +8,7 @@
 #ifndef MLIR_CONVERSION_GPUTOROCDL_GPUTOROCDLPASS_H_
 #define MLIR_CONVERSION_GPUTOROCDL_GPUTOROCDLPASS_H_
 
+#include "mlir/Conversion/GPUToROCDL/Runtimes.h"
 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
 #include <memory>
 
@@ -25,8 +26,11 @@
 } // namespace gpu
 
 /// Collect a set of patterns to convert from the GPU dialect to ROCDL.
+/// If `runtime` is Unknown, gpu.printf will not be lowered.
+/// The resulting pattern set should be run over a gpu.module op.
 void populateGpuToROCDLConversionPatterns(LLVMTypeConverter &converter,
-                                          RewritePatternSet &patterns);
+                                          RewritePatternSet &patterns,
+                                          gpu::amd::Runtime runtime);
 
 /// Configure target to convert from the GPU dialect to ROCDL.
 void configureGpuToROCDLConversionLegality(ConversionTarget &target);
@@ -36,7 +40,8 @@
 /// is configurable.
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
 createLowerGpuOpsToROCDLOpsPass(
-    unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout);
+    unsigned indexBitwidth = kDeriveIndexBitwidthFromDataLayout,
+    gpu::amd::Runtime runtime = gpu::amd::Runtime::Unknown);
 
 } // namespace mlir
diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/Runtimes.h b/mlir/include/mlir/Conversion/GPUToROCDL/Runtimes.h
new file mode 100644
--- /dev/null
+++ b/mlir/include/mlir/Conversion/GPUToROCDL/Runtimes.h
@@ -0,0 +1,24 @@
+//===- Runtimes.h - Possible runtimes for AMD GPUs ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef MLIR_CONVERSION_GPUTOROCDL_RUNTIMES_H
+#define MLIR_CONVERSION_GPUTOROCDL_RUNTIMES_H
+
+namespace mlir {
+namespace gpu {
+namespace amd {
+/// Potential runtimes for AMD GPU kernels
+enum Runtime {
+  Unknown = 0,
+  HIP = 1,
+  OpenCL = 2,
+};
+} // end namespace amd
+} // end namespace gpu
+} // end namespace mlir
+
+#endif // MLIR_CONVERSION_GPUTOROCDL_RUNTIMES_H
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -203,7 +203,15 @@
   let options = [
     Option<"indexBitwidth", "index-bitwidth", "unsigned",
            /*default=kDeriveIndexBitwidthFromDataLayout*/"0",
-           "Bitwidth of the index type, 0 to use size of machine word">
+           "Bitwidth of the index type, 0 to use size of machine word">,
+    Option<"runtime", "runtime", "::mlir::gpu::amd::Runtime",
+           "::mlir::gpu::amd::Runtime::Unknown",
+           "Runtime code will be run on (default is Unknown, can also use HIP or OpenCL)",
+           [{::llvm::cl::values(
+               clEnumValN(::mlir::gpu::amd::Runtime::Unknown, "unknown", "Unknown (default)"),
+               clEnumValN(::mlir::gpu::amd::Runtime::HIP, "HIP", "HIP"),
+               clEnumValN(::mlir::gpu::amd::Runtime::OpenCL, "OpenCL", "OpenCL")
+              )}]>
   ];
 }
diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td
@@ -547,6 +547,22 @@
   let hasCanonicalizer = 1;
 }
 
+def GPU_PrintfOp : GPU_Op<"printf", [MemoryEffects<[MemWrite]>]>,
+  Arguments<(ins StrAttr:$format,
+                Variadic<AnyTypeOf<[AnyInteger, Index, AnyFloat]>>:$args)> {
+  let summary = "Device-side printf, as in CUDA or OpenCL, for debugging";
+  let description = [{
+    `gpu.printf` takes a literal format string `format` and an arbitrary number of
+    scalar arguments that should be printed.
+
+    The format string is a C-style printf string, subject to any restrictions
+    imposed by one's target platform.
+  }];
+  let assemblyFormat = [{
+    $format attr-dict ($args^ `:` type($args))?
+  }];
+}
+
 def GPU_ReturnOp : GPU_Op<"return", [HasParent<"GPUFuncOp">, NoSideEffect,
                                      Terminator]>,
     Arguments<(ins Variadic<AnyType>:$operands)>, Results<(outs)> {
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -33,6 +33,40 @@
   StringAttr kernelAttributeName;
 };
 
+/// The lowering of gpu.printf to a call to HIP hostcalls
+///
+/// Simplifies llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp, as we don't have
+/// to deal with %s (even if there were first-class strings in MLIR, they're not
+/// legal input to gpu.printf) or non-constant format strings
+struct GPUPrintfOpToHIPLowering : public ConvertOpToLLVMPattern<gpu::PrintfOp> {
+  using ConvertOpToLLVMPattern<gpu::PrintfOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
+/// The lowering of gpu.printf to a call to an external printf() function
+///
+/// This pass will add a declaration of printf() to the GPUModule if needed
+/// and separate out the format strings into global constants. For some
+/// runtimes, such as OpenCL on AMD, this is sufficient setup, as the compiler
+/// will lower printf calls to appropriate device-side code
+struct GPUPrintfOpToLLVMCallLowering
+    : public ConvertOpToLLVMPattern<gpu::PrintfOp> {
+  GPUPrintfOpToLLVMCallLowering(LLVMTypeConverter &converter,
+                                int addressSpace = 0)
+      : ConvertOpToLLVMPattern<gpu::PrintfOp>(converter),
+        addressSpace(addressSpace) {}
+
+  LogicalResult
+  matchAndRewrite(gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+
+private:
+  int addressSpace;
+};
+
 struct GPUReturnOpLowering : public ConvertOpToLLVMPattern<gpu::ReturnOp> {
   using ConvertOpToLLVMPattern<gpu::ReturnOp>::ConvertOpToLLVMPattern;
 
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "GPUOpsLowering.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/Builders.h"
 #include "llvm/Support/FormatVariadic.h"
@@ -144,3 +145,200 @@
   rewriter.eraseOp(gpuFuncOp);
   return success();
 }
+
+static const char formatStringPrefix[] = "printfFormat_";
+
+template <typename T>
+static LLVM::LLVMFuncOp getOrDefineFunction(T &moduleOp, const Location loc,
+                                            ConversionPatternRewriter &rewriter,
+                                            StringRef name,
+                                            LLVM::LLVMFunctionType type) {
+  LLVM::LLVMFuncOp ret;
+  if (!(ret = moduleOp.template lookupSymbol<LLVM::LLVMFuncOp>(name))) {
+    ConversionPatternRewriter::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPointToStart(moduleOp.getBody());
+    ret = rewriter.create<LLVM::LLVMFuncOp>(loc, name, type,
+                                            LLVM::Linkage::External);
+  }
+  return ret;
+}
+
+LogicalResult GPUPrintfOpToHIPLowering::matchAndRewrite(
+    gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  Location loc = gpuPrintfOp->getLoc();
+
+  mlir::Type llvmI8 = typeConverter->convertType(rewriter.getI8Type());
+  mlir::Type i8Ptr = LLVM::LLVMPointerType::get(llvmI8);
+  mlir::Type llvmIndex = typeConverter->convertType(rewriter.getIndexType());
+  mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type());
+  mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type());
+  // Note: this is the GPUModule op, not the ModuleOp that surrounds it
+  // This ensures that global constants and declarations are placed within
+  // the device code, not the host code
+  auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
+
+  auto ocklBegin =
+      getOrDefineFunction(moduleOp, loc, rewriter, "__ockl_printf_begin",
+                          LLVM::LLVMFunctionType::get(llvmI64, {llvmI64}));
+  LLVM::LLVMFuncOp ocklAppendArgs;
+  if (!adaptor.args().empty()) {
+    ocklAppendArgs = getOrDefineFunction(
+        moduleOp, loc, rewriter, "__ockl_printf_append_args",
+        LLVM::LLVMFunctionType::get(
+            llvmI64, {llvmI64, /*numArgs*/ llvmI32, llvmI64, llvmI64, llvmI64,
+                      llvmI64, llvmI64, llvmI64, llvmI64, /*isLast*/ llvmI32}));
+  }
+  auto ocklAppendStringN = getOrDefineFunction(
+      moduleOp, loc, rewriter, "__ockl_printf_append_string_n",
+      LLVM::LLVMFunctionType::get(
+          llvmI64,
+          {llvmI64, i8Ptr, /*length (bytes)*/ llvmI64, /*isLast*/ llvmI32}));
+
+  /// Start the printf hostcall
+  Value zeroI64 = rewriter.create<LLVM::ConstantOp>(
+      loc, llvmI64, rewriter.getI64IntegerAttr(0));
+  auto printfBeginCall = rewriter.create<LLVM::CallOp>(loc, ocklBegin, zeroI64);
+  Value printfDesc = printfBeginCall.getResult(0);
+
+  // Create a global constant for the format string
+  unsigned stringNumber = 0;
+  SmallString<16> stringConstName;
+  do {
+    stringConstName.clear();
+    (formatStringPrefix + Twine(stringNumber++)).toStringRef(stringConstName);
+  } while (moduleOp.lookupSymbol<LLVM::GlobalOp>(stringConstName));
+
+  llvm::SmallString<20> formatString(adaptor.format().getValue());
+  formatString.push_back('\0'); // Null terminate for C
+  size_t formatStringSize = formatString.size_in_bytes();
+
+  auto globalType = LLVM::LLVMArrayType::get(llvmI8, formatStringSize);
+  LLVM::GlobalOp global;
+  {
+    ConversionPatternRewriter::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPointToStart(moduleOp.getBody());
+    global = rewriter.create<LLVM::GlobalOp>(
+        loc, globalType,
+        /*isConstant=*/true, LLVM::Linkage::Internal, stringConstName,
+        rewriter.getStringAttr(formatString));
+  }
+
+  // Get a pointer to the format string's first element and pass it to printf()
+  Value globalPtr = rewriter.create<LLVM::AddressOfOp>(loc, global);
+  Value zero = rewriter.create<LLVM::ConstantOp>(
+      loc, llvmIndex, rewriter.getIntegerAttr(llvmIndex, 0));
+  Value stringStart = rewriter.create<LLVM::GEPOp>(
+      loc, i8Ptr, globalPtr, mlir::ValueRange({zero, zero}));
+  Value stringLen = rewriter.create<LLVM::ConstantOp>(
+      loc, llvmI64, rewriter.getI64IntegerAttr(formatStringSize));
+
+  Value oneI32 = rewriter.create<LLVM::ConstantOp>(
+      loc, llvmI32, rewriter.getI32IntegerAttr(1));
+  Value zeroI32 = rewriter.create<LLVM::ConstantOp>(
+      loc, llvmI32, rewriter.getI32IntegerAttr(0));
+
+  mlir::ValueRange appendFormatArgs = {printfDesc, stringStart, stringLen,
+                                       adaptor.args().empty() ? oneI32
+                                                              : zeroI32};
+  auto appendFormatCall =
+      rewriter.create<LLVM::CallOp>(loc, ocklAppendStringN, appendFormatArgs);
+  printfDesc = appendFormatCall.getResult(0);
+
+  // __ockl_printf_append_args takes 7 values per append call
+  constexpr size_t argsPerAppend = 7;
+  size_t nArgs = adaptor.args().size();
+  for (size_t group = 0; group < nArgs; group += argsPerAppend) {
+    size_t bound = std::min(group + argsPerAppend, nArgs);
+    size_t numArgsThisCall = bound - group;
+
+    SmallVector<mlir::Value, 2 + argsPerAppend + 1> arguments;
+    arguments.push_back(printfDesc);
+    arguments.push_back(rewriter.create<LLVM::ConstantOp>(
+        loc, llvmI32, rewriter.getI32IntegerAttr(numArgsThisCall)));
+    for (size_t i = group; i < bound; ++i) {
+      Value arg = adaptor.args()[i];
+      if (auto floatType = arg.getType().dyn_cast<FloatType>()) {
+        if (!floatType.isF64())
+          arg = rewriter.create<LLVM::FPExtOp>(
+              loc, typeConverter->convertType(rewriter.getF64Type()), arg);
+        arg = rewriter.create<LLVM::BitcastOp>(loc, llvmI64, arg);
+      }
+      if (arg.getType().getIntOrFloatBitWidth() != 64)
+        arg = rewriter.create<LLVM::ZExtOp>(loc, llvmI64, arg);
+
+      arguments.push_back(arg);
+    }
+    // Pad out to 7 arguments since the hostcall always needs 7
+    for (size_t extra = numArgsThisCall; extra < argsPerAppend; ++extra) {
+      arguments.push_back(zeroI64);
+    }
+
+    auto isLast = (bound == nArgs) ? oneI32 : zeroI32;
+    arguments.push_back(isLast);
+    auto call = rewriter.create<LLVM::CallOp>(loc, ocklAppendArgs, arguments);
+    printfDesc = call.getResult(0);
+  }
+  rewriter.eraseOp(gpuPrintfOp);
+  return success();
+}
+
+LogicalResult GPUPrintfOpToLLVMCallLowering::matchAndRewrite(
+    gpu::PrintfOp gpuPrintfOp, gpu::PrintfOpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  Location loc = gpuPrintfOp->getLoc();
+
+  mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8));
+  mlir::Type i8Ptr = LLVM::LLVMPointerType::get(llvmI8, addressSpace);
+  mlir::Type llvmIndex = typeConverter->convertType(rewriter.getIndexType());
+
+  // Note: this is the GPUModule op, not the ModuleOp that surrounds it
+  // This ensures that global constants and declarations are placed within
+  // the device code, not the host code
+  auto moduleOp = gpuPrintfOp->getParentOfType<gpu::GPUModuleOp>();
+
+  auto printfType = LLVM::LLVMFunctionType::get(rewriter.getI32Type(), {i8Ptr},
+                                                /*isVarArg=*/true);
+  LLVM::LLVMFuncOp printfDecl =
+      getOrDefineFunction(moduleOp, loc, rewriter, "printf", printfType);
+
+  // Create a global constant for the format string
+  unsigned stringNumber = 0;
+  SmallString<16> stringConstName;
+  do {
+    stringConstName.clear();
+    (formatStringPrefix + Twine(stringNumber++)).toStringRef(stringConstName);
+  } while (moduleOp.lookupSymbol<LLVM::GlobalOp>(stringConstName));
+
+  llvm::SmallString<20> formatString(adaptor.format().getValue());
+  formatString.push_back('\0'); // Null terminate for C
+  auto globalType =
+      LLVM::LLVMArrayType::get(llvmI8, formatString.size_in_bytes());
+  LLVM::GlobalOp global;
+  {
+    ConversionPatternRewriter::InsertionGuard guard(rewriter);
+    rewriter.setInsertionPointToStart(moduleOp.getBody());
+    global = rewriter.create<LLVM::GlobalOp>(
+        loc, globalType,
+        /*isConstant=*/true, LLVM::Linkage::Internal, stringConstName,
+        rewriter.getStringAttr(formatString), /*alignment=*/0, addressSpace);
+  }
+
+  // Get a pointer to the format string's first element
+  Value globalPtr = rewriter.create<LLVM::AddressOfOp>(loc, global);
+  Value zero = rewriter.create<LLVM::ConstantOp>(
+      loc, llvmIndex, rewriter.getIntegerAttr(llvmIndex, 0));
+  Value stringStart = rewriter.create<LLVM::GEPOp>(
+      loc, i8Ptr, globalPtr, mlir::ValueRange({zero, zero}));
+
+  // Construct arguments and function call
+  auto argsRange = adaptor.args();
+  SmallVector<Value, 4> printfArgs;
+  printfArgs.reserve(argsRange.size() + 1);
+  printfArgs.push_back(stringStart);
+  printfArgs.append(argsRange.begin(), argsRange.end());
+
+  rewriter.create<LLVM::CallOp>(loc, printfDecl, printfArgs);
+  rewriter.eraseOp(gpuPrintfOp);
+  return success();
+}
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -51,8 +51,9 @@
 struct LowerGpuOpsToROCDLOpsPass
     : public ConvertGpuOpsToROCDLOpsBase<LowerGpuOpsToROCDLOpsPass> {
   LowerGpuOpsToROCDLOpsPass() = default;
-  LowerGpuOpsToROCDLOpsPass(unsigned indexBitwidth) {
+  LowerGpuOpsToROCDLOpsPass(unsigned indexBitwidth, gpu::amd::Runtime runtime) {
     this->indexBitwidth = indexBitwidth;
+    this->runtime = runtime;
   }
 
   void runOnOperation() override {
@@ -79,7 +80,7 @@
     populateVectorToROCDLConversionPatterns(converter, llvmPatterns);
     populateStdToLLVMConversionPatterns(converter, llvmPatterns);
     populateMemRefToLLVMConversionPatterns(converter, llvmPatterns);
-    populateGpuToROCDLConversionPatterns(converter, llvmPatterns);
+    populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
     LLVMConversionTarget target(getContext());
     configureGpuToROCDLConversionLegality(target);
     if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
@@ -102,8 +103,11 @@
   target.addLegalOp<gpu::GPUModuleOp, gpu::ModuleEndOp, gpu::YieldOp>();
 }
 
-void mlir::populateGpuToROCDLConversionPatterns(LLVMTypeConverter &converter,
-                                                RewritePatternSet &patterns) {
+void mlir::populateGpuToROCDLConversionPatterns(
+    LLVMTypeConverter &converter, RewritePatternSet &patterns,
+    mlir::gpu::amd::Runtime runtime) {
+  using mlir::gpu::amd::Runtime;
+
   populateWithGenerated(patterns);
   patterns
       .add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
@@ -131,6 +135,13 @@
            GPUFuncOpLowering</*allocaAddrSpace=*/5>, GPUReturnOpLowering>(
           converter);
 
+  if (Runtime::HIP == runtime) {
+    patterns.add<GPUPrintfOpToHIPLowering>(converter);
+  } else if (Runtime::OpenCL == runtime) {
+    // Use address space = 4 to match the OpenCL definition of printf()
+    patterns.add<GPUPrintfOpToLLVMCallLowering>(converter, /*addressSpace=*/4);
+  }
+
   patterns.add<OpToFuncCallLowering<math::AbsOp>>(converter, "__ocml_fabs_f32",
                                                   "__ocml_fabs_f64");
   patterns.add<OpToFuncCallLowering<math::AtanOp>>(converter, "__ocml_atan_f32",
@@ -158,6 +169,7 @@
 }
 
 std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
-mlir::createLowerGpuOpsToROCDLOpsPass(unsigned indexBitwidth) {
-  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(indexBitwidth);
+mlir::createLowerGpuOpsToROCDLOpsPass(unsigned indexBitwidth,
+                                      gpu::amd::Runtime runtime) {
+  return std::make_unique<LowerGpuOpsToROCDLOpsPass>(indexBitwidth, runtime);
 }
diff --git a/mlir/lib/Conversion/PassDetail.h b/mlir/lib/Conversion/PassDetail.h
--- a/mlir/lib/Conversion/PassDetail.h
+++ b/mlir/lib/Conversion/PassDetail.h
@@ -11,6 +11,8 @@
 
 #include "mlir/Pass/Pass.h"
 
+#include "mlir/Conversion/GPUToROCDL/Runtimes.h"
+
 namespace mlir {
 class AffineDialect;
 class StandardOpsDialect;
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
@@ -306,6 +306,12 @@
       return nullptr;
     }
   }
+
+  // Set amdgpu_hostcall if host calls have been linked, as needed by newer LLVM
+  // FIXME: Is there a way to set this during printf() lowering that makes sense
+  if (ret->getFunction("__ockl_hostcall_internal"))
+    if (!ret->getModuleFlag("amdgpu_hostcall"))
+      ret->addModuleFlag(llvm::Module::Override, "amdgpu_hostcall", 1);
   return ret;
 }
 
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -71,11 +71,14 @@
 
     // For GPU kernels,
     // 1. Insert AMDGPU_KERNEL calling convention.
-    // 2. Insert amdgpu-flat-workgroup-size(1, 1024) attribute.
+    // 2. Insert amdgpu-flat-work-group-size(1, 256) attribute.
+    // 3. Insert amdgpu-implicitarg-num-bytes=56 (which must be set on OpenCL
+    //    and HIP kernels per Clang)
     llvm::Function *llvmFunc =
         moduleTranslation.lookupFunction(func.getName());
     llvmFunc->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
-    llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1, 1024");
+    llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1, 256");
+    llvmFunc->addFnAttr("amdgpu-implicitarg-num-bytes", "56");
   }
   return success();
 }
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir
@@ -0,0 +1,44 @@
+// RUN: mlir-opt %s -convert-gpu-to-rocdl=runtime=HIP -split-input-file | FileCheck %s
+
+gpu.module @test_module {
+  // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL0:[A-Za-z0-9_]+]]("Hello, world\0A\00")
+  // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL1:[A-Za-z0-9_]+]]("Hello: %d\0A\00")
+  // CHECK-DAG: llvm.func @__ockl_printf_append_args(i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64
+  // CHECK-DAG: llvm.func @__ockl_printf_append_string_n(i64, !llvm.ptr<i8>, i64, i32) -> i64
+  // CHECK-DAG: llvm.func @__ockl_printf_begin(i64) -> i64
+
+  // CHECK-LABEL: func @test_const_printf
+  gpu.func @test_const_printf() {
+    // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
+    // CHECK-NEXT: %[[DESC0:.*]] = llvm.call @__ockl_printf_begin(%0) : (i64) -> i64
+    // CHECK-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL0]] : !llvm.ptr<array<14 x i8>>
+    // CHECK-NEXT: %[[CST1:.*]] = llvm.mlir.constant(0 : i64) : i64
+    // CHECK-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][%[[CST1]], %[[CST1]]] : (!llvm.ptr<array<14 x i8>>, i64, i64) -> !llvm.ptr<i8>
+    // CHECK-NEXT: %[[FORMATLEN:.*]] = llvm.mlir.constant(14 : i64) : i64
+    // CHECK-NEXT: %[[ISLAST:.*]] = llvm.mlir.constant(1 : i32) : i32
+    // CHECK-NEXT: %[[ISNTLAST:.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISLAST]]) : (i64, !llvm.ptr<i8>, i64, i32) -> i64
+    gpu.printf "Hello, world\n"
+    gpu.return
+  }
+
+
+  // CHECK-LABEL: func @test_printf
+  // CHECK: (%[[ARG0:.*]]: i32)
+  gpu.func @test_printf(%arg0: i32) {
+    // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64
+    // CHECK-NEXT: %[[DESC0:.*]] = llvm.call @__ockl_printf_begin(%0) : (i64) -> i64
+    // CHECK-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL1]] : !llvm.ptr<array<11 x i8>>
+    // CHECK-NEXT: %[[CST1:.*]] = llvm.mlir.constant(0 : i64) : i64
+    // CHECK-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][%[[CST1]], %[[CST1]]] : (!llvm.ptr<array<11 x i8>>, i64, i64) -> !llvm.ptr<i8>
+    // CHECK-NEXT: %[[FORMATLEN:.*]] = llvm.mlir.constant(11 : i64) : i64
+    // CHECK-NEXT: %[[ISLAST:.*]] = llvm.mlir.constant(1 : i32) : i32
+    // CHECK-NEXT: %[[ISNTLAST:.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK-NEXT: %[[DESC1:.*]] = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISNTLAST]]) : (i64, !llvm.ptr<i8>, i64, i32) -> i64
+    // CHECK-NEXT: %[[NARGS1:.*]] = llvm.mlir.constant(1 : i32) : i32
+    // CHECK-NEXT: %[[ARG0_64:.*]] = llvm.zext %[[ARG0]] : i32 to i64
+    // CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_args(%[[DESC1]], %[[NARGS1]], %[[ARG0_64]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[ISLAST]]) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64
+    gpu.printf "Hello: %d\n" %arg0 : i32
+    gpu.return
+  }
+}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir
@@ -0,0 +1,16 @@
+// RUN: mlir-opt %s -convert-gpu-to-rocdl=runtime=OpenCL | FileCheck %s
+
+gpu.module @test_module {
+  // CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00") {addr_space = 4 : i32}
+  // CHECK: llvm.func @printf(!llvm.ptr<i8, 4>, ...) -> i32
+  // CHECK-LABEL: func @test_printf
+  // CHECK: (%[[ARG0:.*]]: i32)
+  gpu.func @test_printf(%arg0: i32) {
+    // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<array<11 x i8>, 4>
+    // CHECK-NEXT: %[[IMM1:.*]] = llvm.mlir.constant(0 : i64) : i64
+    // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][%[[IMM1]], %[[IMM1]]] : (!llvm.ptr<array<11 x i8>, 4>, i64, i64) -> !llvm.ptr<i8, 4>
+    // CHECK-NEXT: %{{.*}} = llvm.call @printf(%[[IMM2]], %[[ARG0]]) : (!llvm.ptr<i8, 4>, i32) -> i32
+    gpu.printf "Hello: %d\n" %arg0 : i32
+    gpu.return
+  }
+}
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -112,6 +112,14 @@
     gpu.return
   }
 
+  // CHECK-LABEL gpu.func @printf_test
+  // CHECK: (%[[ARG0:.*]]: i32)
+  // CHECK: gpu.printf "Value: %d" %[[ARG0]] : i32
+  gpu.func @printf_test(%arg0 : i32) {
+    gpu.printf "Value: %d" %arg0 : i32
+    gpu.return
+  }
+
   // CHECK-LABEL: gpu.func @no_attribution
   // CHECK: {
   gpu.func @no_attribution(%arg0: f32) {
diff --git a/mlir/test/Integration/GPU/ROCM/printf.mlir b/mlir/test/Integration/GPU/ROCM/printf.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/GPU/ROCM/printf.mlir
@@ -0,0 +1,29 @@
+// RUN: mlir-opt %s \
+// RUN:   -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-rocdl{index-bitwidth=32 runtime=HIP},gpu-to-hsaco{chip=%chip})' \
+// RUN:   -gpu-to-llvm \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_rocm_runtime%shlibext \
+// RUN:   --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
+// RUN:   --entry-point-result=void \
+// RUN: | FileCheck %s
+
+// CHECK: Hello from 0
+// CHECK: Hello from 1
+module attributes {gpu.container_module} {
+    gpu.module @kernels {
+        gpu.func @hello() kernel {
+            %0 = "gpu.thread_id"() {dimension="x"} : () -> (index)
+            gpu.printf "Hello from %d\n" %0 : index
+            gpu.return
+        }
+    }
+
+    func @main() {
+        %c2 = arith.constant 2 : index
+        %c1 = arith.constant 1 : index
+        gpu.launch_func @kernels::@hello
+            blocks in (%c1, %c1, %c1)
+            threads in (%c2, %c1, %c1)
+        return
+    }
+}