diff --git a/mlir/include/mlir/Dialect/GPU/Passes.h b/mlir/include/mlir/Dialect/GPU/Passes.h
--- a/mlir/include/mlir/Dialect/GPU/Passes.h
+++ b/mlir/include/mlir/Dialect/GPU/Passes.h
@@ -54,14 +54,23 @@
 protected:
   void getDependentDialects(DialectRegistry &registry) const override;
 
-private:
-  /// Creates the LLVM target machine to generate the ISA.
-  std::unique_ptr<llvm::TargetMachine> createTargetMachine();
+  /// Hook allowing the application of optimizations before codegen.
+  /// By default, does nothing.
+  virtual LogicalResult optimizeLlvm(llvm::Module &llvmModule,
+                                     llvm::TargetMachine &targetMachine);
 
   /// Translates the 'getOperation()' result to an LLVM module.
   virtual std::unique_ptr<llvm::Module>
   translateToLLVMIR(llvm::LLVMContext &llvmContext);
 
+private:
+  /// Creates the LLVM target machine to generate the ISA.
+  std::unique_ptr<llvm::TargetMachine> createTargetMachine();
+
+  /// Translates the module to ISA.
+  Optional<std::string> translateToISA(llvm::Module &llvmModule,
+                                       llvm::TargetMachine &targetMachine);
+
   /// Serializes the target ISA to binary form.
   virtual std::unique_ptr<std::vector<char>>
   serializeISA(const std::string &isa) = 0;
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -145,14 +145,14 @@
     message(STATUS "ROCm HIP version: ${HIP_VERSION}")
   endif()
 
-  target_compile_definitions(obj.MLIRGPUOps
+  target_compile_definitions(obj.MLIRGPUTransforms
     PRIVATE
     __HIP_PLATFORM_HCC__
     __ROCM_PATH__="${ROCM_PATH}"
     MLIR_GPU_TO_HSACO_PASS_ENABLE=1
   )
 
-  target_include_directories(obj.MLIRGPUOps
+  target_include_directories(obj.MLIRGPUTransforms
     PRIVATE
     ${MLIR_SOURCE_DIR}/../lld/include
     ${HIP_PATH}/include
@@ -162,6 +162,7 @@
   target_link_libraries(MLIRGPUOps
     PRIVATE
     lldELF
+    MLIRExecutionEngine
     MLIRROCDLToLLVMIRTranslation
   )
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToBlob.cpp
@@ -31,18 +31,28 @@
 gpu::SerializeToBlobPass::SerializeToBlobPass(const SerializeToBlobPass &other)
     : OperationPass<gpu::GPUModuleOp>(other) {}
 
-static std::string translateToISA(llvm::Module &llvmModule,
-                                  llvm::TargetMachine &targetMachine) {
+Optional<std::string>
+gpu::SerializeToBlobPass::translateToISA(llvm::Module &llvmModule,
+                                         llvm::TargetMachine &targetMachine) {
   llvmModule.setDataLayout(targetMachine.createDataLayout());
 
+  if (failed(optimizeLlvm(llvmModule, targetMachine)))
+    return llvm::None;
+
   std::string targetISA;
   llvm::raw_string_ostream stream(targetISA);
-  llvm::buffer_ostream pstream(stream);
+
   llvm::legacy::PassManager codegenPasses;
-  targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
-                                    llvm::CGFT_AssemblyFile);
-  codegenPasses.run(llvmModule);
-  return targetISA;
+
+  { // Drop pstream after this to prevent the ISA from being stuck buffering
+    llvm::buffer_ostream pstream(stream);
+    if (targetMachine.addPassesToEmitFile(codegenPasses, pstream, nullptr,
+                                          llvm::CGFT_AssemblyFile))
+      return llvm::None;
+
+    codegenPasses.run(llvmModule);
+  }
+  return stream.str();
 }
 
 void gpu::SerializeToBlobPass::runOnOperation() {
@@ -58,7 +68,13 @@
   if (!targetMachine)
     return signalPassFailure();
 
-  std::string targetISA = translateToISA(*llvmModule, *targetMachine);
+  Optional<std::string> maybeTargetISA =
+      translateToISA(*llvmModule, *targetMachine);
+
+  if (!maybeTargetISA.hasValue())
+    return signalPassFailure();
+
+  std::string targetISA = std::move(maybeTargetISA.getValue());
 
   // Serialize the target ISA.
   std::unique_ptr<std::vector<char>> blob = serializeISA(targetISA);
@@ -71,6 +87,14 @@
   getOperation()->setAttr(gpuBinaryAnnotation, attr);
 }
 
+LogicalResult
+gpu::SerializeToBlobPass::optimizeLlvm(llvm::Module &llvmModule,
+                                       llvm::TargetMachine &targetMachine) {
+  // TODO: If serializeToCubin ends up defining optimizations, factor them
+  // into here from SerializeToHsaco
+  return success();
+}
+
 void gpu::SerializeToBlobPass::getDependentDialects(
     DialectRegistry &registry) const {
  registerLLVMDialectTranslation(registry);
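Note on the translateToISA change above: llvm::buffer_ostream forwards its contents to the wrapped stream only from its destructor, so the ISA string must not be consumed while the buffering stream is still alive. A minimal standalone sketch of the lifetime pattern the new scoping enforces (illustrative only, not part of the patch):

    #include "llvm/Support/raw_ostream.h"

    #include <string>

    // Sketch: buffer_ostream flushes into the wrapped stream only when it is
    // destroyed, so end its lifetime before reading the backing string.
    std::string emitThroughBuffer() {
      std::string out;
      llvm::raw_string_ostream stream(out);
      {
        llvm::buffer_ostream pstream(stream); // buffers all writes
        pstream << "...ISA text...";
      } // ~buffer_ostream() drains the buffer into `stream`
      return stream.str(); // safe: the buffer has been flushed
    }

    int main() {
      llvm::outs() << emitThroughBuffer() << "\n";
      return 0;
    }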
diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp
@@ -11,8 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 #include "mlir/Dialect/GPU/Passes.h"
+#include "mlir/IR/Location.h"
+#include "mlir/IR/MLIRContext.h"
 
 #if MLIR_GPU_TO_HSACO_PASS_ENABLE
+#include "mlir/ExecutionEngine/OptUtils.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Support/FileUtilities.h"
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
@@ -32,8 +35,11 @@
 #include "llvm/Support/FileUtilities.h"
 #include "llvm/Support/LineIterator.h"
 #include "llvm/Support/Program.h"
+#include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/WithColor.h"
+
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
 #include "lld/Common/Driver.h"
@@ -48,12 +54,24 @@
 class SerializeToHsacoPass
     : public PassWrapper<SerializeToHsacoPass, gpu::SerializeToBlobPass> {
 public:
-  SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features);
+  SerializeToHsacoPass(StringRef triple, StringRef arch, StringRef features,
+                       int optLevel);
+  SerializeToHsacoPass(const SerializeToHsacoPass &other);
   StringRef getArgument() const override { return "gpu-to-hsaco"; }
   StringRef getDescription() const override {
     return "Lower GPU kernel function to HSACO binary annotations";
   }
 
+protected:
+  Option<int> optLevel{
+      *this, "opt-level",
+      llvm::cl::desc("Optimization level for HSACO compilation"),
+      llvm::cl::init(2)};
+
+  /// Adds LLVM optimization passes.
+  LogicalResult optimizeLlvm(llvm::Module &llvmModule,
+                             llvm::TargetMachine &targetMachine) override;
+
 private:
   void getDependentDialects(DialectRegistry &registry) const override;
 
@@ -67,6 +85,8 @@
 };
 } // namespace
 
+SerializeToHsacoPass::SerializeToHsacoPass(const SerializeToHsacoPass &other)
+    : PassWrapper(other) {}
+
 static std::string getDefaultChip() {
   const char kDefaultChip[] = "gfx900";
@@ -132,10 +152,12 @@
 }
 
 SerializeToHsacoPass::SerializeToHsacoPass(StringRef triple, StringRef arch,
-                                           StringRef features) {
+                                           StringRef features, int optLevel) {
   maybeSetOption(this->triple, [&triple] { return triple.str(); });
   maybeSetOption(this->chip, [&arch] { return arch.str(); });
   maybeSetOption(this->features, [&features] { return features.str(); });
+  if (this->optLevel.getNumOccurrences() == 0)
+    this->optLevel.setValue(optLevel);
 }
 
 void SerializeToHsacoPass::getDependentDialects(
@@ -144,6 +166,30 @@
   gpu::SerializeToBlobPass::getDependentDialects(registry);
 }
 
+LogicalResult
+SerializeToHsacoPass::optimizeLlvm(llvm::Module &llvmModule,
+                                   llvm::TargetMachine &targetMachine) {
+  int optLevel = this->optLevel.getValue();
+  if (optLevel < 0 || optLevel > 3)
+    return getOperation().emitError()
+           << "Invalid HSA optimization level " << optLevel << "\n";
+
+  targetMachine.setOptLevel(static_cast<llvm::CodeGenOpt::Level>(optLevel));
+
+  auto transformer =
+      makeOptimizingTransformer(optLevel, /*sizeLevel=*/0, &targetMachine);
+  auto error = transformer(&llvmModule);
+  if (error) {
+    InFlightDiagnostic mlirError = getOperation()->emitError();
+    llvm::handleAllErrors(
+        std::move(error), [&mlirError](const llvm::ErrorInfoBase &ei) {
+          mlirError << "Could not optimize LLVM IR: " << ei.message() << "\n";
+        });
+    return mlirError;
+  }
+  return success();
+}
+
 std::unique_ptr<std::vector<char>>
 SerializeToHsacoPass::assembleIsa(const std::string &isa) {
   auto loc = getOperation().getLoc();
@@ -170,8 +216,11 @@
   std::unique_ptr<llvm::MCAsmInfo> mai(
       target->createMCAsmInfo(*mri, this->triple, mcOptions));
   mai->setRelaxELFRelocations(true);
+  std::unique_ptr<llvm::MCSubtargetInfo> sti(
+      target->createMCSubtargetInfo(this->triple, this->chip, this->features));
 
-  llvm::MCContext ctx(triple, mai.get(), mri.get(), &srcMgr, &mcOptions);
+  llvm::MCContext ctx(triple, mai.get(), mri.get(), sti.get(), &srcMgr,
+                      &mcOptions);
   std::unique_ptr<llvm::MCObjectFileInfo> mofi(target->createMCObjectFileInfo(
       ctx, /*PIC=*/false, /*LargeCodeModel=*/false));
   ctx.setObjectFileInfo(mofi.get());
@@ -182,8 +231,6 @@
   std::unique_ptr<llvm::MCStreamer> mcStreamer;
   std::unique_ptr<llvm::MCInstrInfo> mcii(target->createMCInstrInfo());
 
-  std::unique_ptr<llvm::MCSubtargetInfo> sti(
-      target->createMCSubtargetInfo(this->triple, this->chip, this->features));
   llvm::MCCodeEmitter *ce = target->createMCCodeEmitter(*mcii, *mri, ctx);
   llvm::MCAsmBackend *mab = target->createMCAsmBackend(*sti, *mri, mcOptions);
 
@@ -280,7 +327,7 @@
     LLVMInitializeAMDGPUTargetMC();
 
     return std::make_unique<SerializeToHsacoPass>("amdgcn-amd-amdhsa", "",
-                                                  "");
+                                                  "", 2);
   });
 }
 
 #else // MLIR_GPU_TO_HSACO_PASS_ENABLE
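The optimizeLlvm override above delegates to makeOptimizingTransformer from mlir/ExecutionEngine/OptUtils.h, which assembles the standard LLVM -O<N> pass pipeline; this is also why the GPU library now links against MLIRExecutionEngine. A minimal standalone sketch of the same mechanism, with diagnostics reduced to a bool for brevity (illustrative only, not part of the patch):

    #include "mlir/ExecutionEngine/OptUtils.h"

    #include "llvm/IR/Module.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Target/TargetMachine.h"

    // Sketch: run the standard -O<optLevel> pipeline that
    // makeOptimizingTransformer builds; returns false on failure.
    bool runStandardOptPipeline(llvm::Module &module,
                                llvm::TargetMachine &targetMachine,
                                int optLevel) {
      auto transformer = mlir::makeOptimizingTransformer(
          optLevel, /*sizeLevel=*/0, &targetMachine);
      if (llvm::Error error = transformer(&module)) {
        // Real code should surface the message, as optimizeLlvm does above.
        llvm::consumeError(std::move(error));
        return false;
      }
      return true;
    }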
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -202,8 +202,11 @@
       ${HIP_PATH}/include
       ${ROCM_PATH}/include
     )
+    set_property(TARGET mlir_rocm_runtime
+      PROPERTY INSTALL_RPATH_USE_LINK_PATH ON)
+
     target_link_libraries(mlir_rocm_runtime
-      PRIVATE
+      PUBLIC
       ${ROCM_RUNTIME_LIBRARY}
     )
   endif()
diff --git a/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
--- a/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
+++ b/mlir/test/Integration/GPU/ROCM/gpu-to-hsaco.mlir
@@ -11,10 +11,10 @@
 func @other_func(%arg0 : f32, %arg1 : memref<?xf32>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
-  %block_dim = dim %arg1, %c0 : memref<?xf32>
+  %block_dim = memref.dim %arg1, %c0 : memref<?xf32>
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
              threads(%tx, %ty, %tz) in (%block_x = %block_dim, %block_y = %c1, %block_z = %c1) {
-    store %arg0, %arg1[%tx] : memref<?xf32>
+    memref.store %arg0, %arg1[%tx] : memref<?xf32>
     gpu.terminator
   }
   return
@@ -22,12 +22,12 @@
 
 // CHECK: [1, 1, 1, 1, 1]
 func @main() {
-  %arg0 = alloc() : memref<5xf32>
+  %arg0 = memref.alloc() : memref<5xf32>
   %21 = arith.constant 5 : i32
-  %22 = memref_cast %arg0 : memref<5xf32> to memref<?xf32>
-  %cast = memref_cast %22 : memref<?xf32> to memref<*xf32>
+  %22 = memref.cast %arg0 : memref<5xf32> to memref<?xf32>
+  %cast = memref.cast %22 : memref<?xf32> to memref<*xf32>
   gpu.host_register %cast : memref<*xf32>
-  %23 = memref_cast %22 : memref<?xf32> to memref<*xf32>
+  %23 = memref.cast %22 : memref<?xf32> to memref<*xf32>
   call @print_memref_f32(%23) : (memref<*xf32>) -> ()
   %24 = arith.constant 1.0 : f32
   %25 = call @mgpuMemGetDeviceMemRef1dFloat(%22) : (memref<?xf32>) -> (memref<?xf32>)
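For completeness, the registration shown earlier keeps the previous behavior by defaulting to optimization level 2, while the new opt-level pass option allows overriding it. A hypothetical sketch of scheduling the pass programmatically at a different level; the factory name createGpuSerializeToHsacoPass and its extended signature are assumptions here, mirroring the four-argument constructor above rather than code from this patch:

    #include "mlir/Dialect/GPU/GPUDialect.h"
    #include "mlir/Dialect/GPU/Passes.h"
    #include "mlir/Pass/PassManager.h"

    // Hypothetical usage sketch (not part of this patch): serialize GPU
    // modules to HSACO at -O3. Assumes createGpuSerializeToHsacoPass was
    // extended to forward the new optLevel parameter.
    void buildHsacoPipeline(mlir::PassManager &pm) {
      pm.addNestedPass<mlir::gpu::GPUModuleOp>(
          mlir::createGpuSerializeToHsacoPass("amdgcn-amd-amdhsa", "gfx900",
                                              /*features=*/"", /*optLevel=*/3));
    }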
diff --git a/mlir/test/Integration/GPU/ROCM/two-modules.mlir b/mlir/test/Integration/GPU/ROCM/two-modules.mlir
--- a/mlir/test/Integration/GPU/ROCM/two-modules.mlir
+++ b/mlir/test/Integration/GPU/ROCM/two-modules.mlir
@@ -10,24 +10,24 @@
 // CHECK: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
 func @main() {
-  %arg = alloc() : memref<13xi32>
-  %dst = memref_cast %arg : memref<13xi32> to memref<?xi32>
+  %arg = memref.alloc() : memref<13xi32>
+  %dst = memref.cast %arg : memref<13xi32> to memref<?xi32>
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
-  %sx = dim %dst, %c0 : memref<?xi32>
-  %cast_dst = memref_cast %dst : memref<?xi32> to memref<*xi32>
+  %sx = memref.dim %dst, %c0 : memref<?xi32>
+  %cast_dst = memref.cast %dst : memref<?xi32> to memref<*xi32>
   gpu.host_register %cast_dst : memref<*xi32>
   %dst_device = call @mgpuMemGetDeviceMemRef1dInt32(%dst) : (memref<?xi32>) -> (memref<?xi32>)
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %c1, %block_z = %c1) {
     %t0 = arith.index_cast %tx : index to i32
-    store %t0, %dst_device[%tx] : memref<?xi32>
+    memref.store %t0, %dst_device[%tx] : memref<?xi32>
     gpu.terminator
   }
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
             threads(%tx, %ty, %tz) in (%block_x = %sx, %block_y = %c1, %block_z = %c1) {
     %t0 = arith.index_cast %tx : index to i32
-    store %t0, %dst_device[%tx] : memref<?xi32>
+    memref.store %t0, %dst_device[%tx] : memref<?xi32>
     gpu.terminator
   }
   call @print_memref_i32(%cast_dst) : (memref<*xi32>) -> ()
diff --git a/mlir/test/Integration/GPU/ROCM/vecadd.mlir b/mlir/test/Integration/GPU/ROCM/vecadd.mlir
--- a/mlir/test/Integration/GPU/ROCM/vecadd.mlir
+++ b/mlir/test/Integration/GPU/ROCM/vecadd.mlir
@@ -12,13 +12,13 @@
 func @vecadd(%arg0 : memref<?xf32>, %arg1 : memref<?xf32>, %arg2 : memref<?xf32>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
-  %block_dim = dim %arg0, %c0 : memref<?xf32>
+  %block_dim = memref.dim %arg0, %c0 : memref<?xf32>
   gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
              threads(%tx, %ty, %tz) in (%block_x = %block_dim, %block_y = %c1, %block_z = %c1) {
-    %a = load %arg0[%tx] : memref<?xf32>
-    %b = load %arg1[%tx] : memref<?xf32>
+    %a = memref.load %arg0[%tx] : memref<?xf32>
+    %b = memref.load %arg1[%tx] : memref<?xf32>
     %c = arith.addf %a, %b : f32
-    store %c, %arg2[%tx] : memref<?xf32>
+    memref.store %c, %arg2[%tx] : memref<?xf32>
     gpu.terminator
   }
   return
@@ -30,19 +30,19 @@
   %c1 = arith.constant 1 : index
   %c5 = arith.constant 5 : index
   %cf1dot23 = arith.constant 1.23 : f32
-  %0 = alloc() : memref<5xf32>
-  %1 = alloc() : memref<5xf32>
-  %2 = alloc() : memref<5xf32>
-  %3 = memref_cast %0 : memref<5xf32> to memref<?xf32>
-  %4 = memref_cast %1 : memref<5xf32> to memref<?xf32>
-  %5 = memref_cast %2 : memref<5xf32> to memref<?xf32>
+  %0 = memref.alloc() : memref<5xf32>
+  %1 = memref.alloc() : memref<5xf32>
+  %2 = memref.alloc() : memref<5xf32>
+  %3 = memref.cast %0 : memref<5xf32> to memref<?xf32>
+  %4 = memref.cast %1 : memref<5xf32> to memref<?xf32>
+  %5 = memref.cast %2 : memref<5xf32> to memref<?xf32>
   scf.for %i = %c0 to %c5 step %c1 {
-    store %cf1dot23, %3[%i] : memref<?xf32>
-    store %cf1dot23, %4[%i] : memref<?xf32>
+    memref.store %cf1dot23, %3[%i] : memref<?xf32>
+    memref.store %cf1dot23, %4[%i] : memref<?xf32>
   }
-  %6 = memref_cast %3 : memref<?xf32> to memref<*xf32>
-  %7 = memref_cast %4 : memref<?xf32> to memref<*xf32>
-  %8 = memref_cast %5 : memref<?xf32> to memref<*xf32>
+  %6 = memref.cast %3 : memref<?xf32> to memref<*xf32>
+  %7 = memref.cast %4 : memref<?xf32> to memref<*xf32>
+  %8 = memref.cast %5 : memref<?xf32> to memref<*xf32>
   gpu.host_register %6 : memref<*xf32>
   gpu.host_register %7 : memref<*xf32>
   gpu.host_register %8 : memref<*xf32>
diff --git a/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
--- a/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
+++ b/mlir/test/Integration/GPU/ROCM/vector-transferops.mlir
@@ -59,19 +59,19 @@
   %cf1 = arith.constant 1.0 : f32
   %cf1dot23 = arith.constant 1.23 : f32
 
-  %arg0 = alloc() : memref<4xf32>
-  %arg1 = alloc() : memref<4xf32>
+  %arg0 = memref.alloc() : memref<4xf32>
+  %arg1 = memref.alloc() : memref<4xf32>
 
-  %22 = memref_cast %arg0 : memref<4xf32> to memref<?xf32>
-  %23 = memref_cast %arg1 : memref<4xf32> to memref<?xf32>
+  %22 = memref.cast %arg0 : memref<4xf32> to memref<?xf32>
+  %23 = memref.cast %arg1 : memref<4xf32> to memref<?xf32>
 
   scf.for %i = %c0 to %c4 step %c1 {
-    store %cf1dot23, %22[%i] : memref<?xf32>
-    store %cf1dot23, %23[%i] : memref<?xf32>
+    memref.store %cf1dot23, %22[%i] : memref<?xf32>
+    memref.store %cf1dot23, %23[%i] : memref<?xf32>
   }
 
-  %cast0 = memref_cast %22 : memref<?xf32> to memref<*xf32>
-  %cast1 = memref_cast %23 : memref<?xf32> to memref<*xf32>
+  %cast0 = memref.cast %22 : memref<?xf32> to memref<*xf32>
+  %cast1 = memref.cast %23 : memref<?xf32> to memref<*xf32>
 
   gpu.host_register %cast0 : memref<*xf32>
   gpu.host_register %cast1 : memref<*xf32>