diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationAttrs.td
@@ -39,4 +39,27 @@
 def GPUObjectArrayAttr :
   TypedArrayAttrBase;
 
+//===----------------------------------------------------------------------===//
+// GPU offloading LLVM translation handler attributes.
+//===----------------------------------------------------------------------===//
+
+def GPU_SelectObjectAttr : GPU_Attr<"SelectObject", "select_object"> {
+  let description = [{
+    This GPU offloading handler selects a single GPU object for embedding. The
+    object is selected based on the `target` parameter, this parameter can be
+    either a number -i.e. selects the ith-target, or the target itself -i.e.
+    searches for the specified target in the object array.
+
+    The first object in a `gpu.binary` operation is selected if no target is
+    specified.
+  }];
+  let parameters = (ins
+    OptionalParameter<"Attribute", "Target to select for embedding.">:$target
+  );
+  let assemblyFormat = [{
+    (`<` $target^ `>`)?
+  }];
+  let genVerifyDecl = 1;
+}
+
 #endif // GPU_COMPILATION_ATTRS
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -180,6 +180,10 @@
 using SparseDnTensorHandleType = SparseHandleType;
 using SparseSpMatHandleType = SparseHandleType;
 
+/// Registers the offloading LLVM translation external interface models with
+/// `registry`. TODO: Remove this function.
+void registerOffloadingLLVMTranslationInterfacesExternalModels(
+    mlir::DialectRegistry &registry);
 } // namespace gpu
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1100,8 +1100,12 @@
 
     Examples:
     ```
+      // Selects the first object.
       gpu.binary @myobject [#gpu.object<...>, #gpu.object<...>]
+      // Uses the `#foo.my_handler` for handling the binary during translation.
       gpu.binary @myobject <#foo.my_handler> [#gpu.object<...>, #gpu.object<...>]
+      // Selects the object with the `#rocdl.target` target attribute.
+      gpu.binary @myobject <#gpu.select_object<#rocdl.target>> [#gpu.object<...>, #gpu.object<#rocdl.target, ...>]
     ```
   }];
   let builders = [
@@ -1114,7 +1118,7 @@
   ];
   let skipDefaultBuilders = 1;
   let assemblyFormat = [{
-    $sym_name (`<` $offloadingHandler ^ `>`)? attr-dict $objects
+    $sym_name custom<OffloadingHandler>($offloadingHandler) attr-dict $objects
   }];
 }
 
diff --git a/mlir/include/mlir/InitAllDialects.h b/mlir/include/mlir/InitAllDialects.h
--- a/mlir/include/mlir/InitAllDialects.h
+++ b/mlir/include/mlir/InitAllDialects.h
@@ -155,6 +155,7 @@
   bufferization::func_ext::registerBufferizableOpInterfaceExternalModels(
       registry);
   builtin::registerCastOpInterfaceExternalModels(registry);
+  gpu::registerOffloadingLLVMTranslationInterfacesExternalModels(registry);
   linalg::registerBufferizableOpInterfaceExternalModels(registry);
   linalg::registerTilingInterfaceExternalModels(registry);
   linalg::registerValueBoundsOpInterfaceExternalModels(registry);
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -208,6 +208,7 @@
 #include "mlir/Dialect/GPU/IR/GPUOpsAttributes.cpp.inc"
       >();
   addInterfaces();
+  declarePromisedInterface();
 }
 
 static std::string getSparseHandleKeyword(SparseHandleKind kind) {
@@ 
-1645,7 +1646,13 @@
   result.attributes.push_back(builder.getNamedAttr(
       SymbolTable::getSymbolAttrName(), builder.getStringAttr(name)));
   properties.objects = objects;
-  properties.offloadingHandler = offloadingHandler;
+  if (offloadingHandler) {
+    properties.offloadingHandler = offloadingHandler;
+  } else {
+    auto offloadingHandler = builder.getAttr<SelectObjectAttr>(nullptr);
+    properties.offloadingHandler =
+        dyn_cast<OffloadingLLVMTranslationAttrInterface>(offloadingHandler);
+  }
 }
 
 void BinaryOp::build(OpBuilder &builder, OperationState &result, StringRef name,
@@ -1655,6 +1662,25 @@
                 objects.size() > 0 ? builder.getArrayAttr(objects) : ArrayAttr());
 }
 
+static ParseResult parseOffloadingHandler(OpAsmParser &parser,
+                                          Attribute &offloadingHandler) {
+  if (succeeded(parser.parseOptionalLess())) {
+    if (parser.parseAttribute(offloadingHandler))
+      return failure();
+    if (parser.parseGreater())
+      return failure();
+  }
+  if (!offloadingHandler)
+    offloadingHandler = parser.getBuilder().getAttr<SelectObjectAttr>(nullptr);
+  return success();
+}
+
+static void printOffloadingHandler(OpAsmPrinter &printer, Operation *op,
+                                   Attribute offloadingHandler) {
+  if (offloadingHandler != SelectObjectAttr::get(op->getContext(), nullptr))
+    printer << '<' << offloadingHandler << '>';
+}
+
 //===----------------------------------------------------------------------===//
 // GPUMemcpyOp
 //===----------------------------------------------------------------------===//
@@ -1926,6 +1952,27 @@
   results.add(context);
 }
 
+//===----------------------------------------------------------------------===//
+// GPU select object attribute
+//===----------------------------------------------------------------------===//
+
+LogicalResult
+gpu::SelectObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
+                              Attribute target) {
+  // `target` may be null, a non-negative integer index or a GPU target attr.
+  if (target) {
+    if (auto intAttr = mlir::dyn_cast<IntegerAttr>(target)) {
+      if (intAttr.getInt() < 0) {
+        return emitError() << "The object index must be non-negative.";
+      }
+    } else if (!(::mlir::isa<TargetAttrInterface>(target))) {
+      return emitError()
+             << "The target attribute must be a GPU Target attribute.";
+    }
+  }
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // GPU target options
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt b/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt
--- a/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_translation_library(MLIRGPUToLLVMIRTranslation
   GPUToLLVMIRTranslation.cpp
+  SelectObjectAttr.cpp
 
   LINK_COMPONENTS
   Core
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp
--- a/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp
@@ -36,6 +36,7 @@
   registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
     dialect->addInterfaces();
   });
+  gpu::registerOffloadingLLVMTranslationInterfacesExternalModels(registry);
 }
 
 void mlir::registerGPUDialectTranslation(MLIRContext &context) {
diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/SelectObjectAttr.cpp
@@ -0,0 +1,370 @@
+//===- SelectObjectAttr.cpp - Implements the SelectObject attribute -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the `OffloadingLLVMTranslationAttrInterface` for the +// `SelectObject` attribute. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/IR/GPUDialect.h" + +#include "mlir/Target/LLVMIR/Export.h" +#include "mlir/Target/LLVMIR/ModuleTranslation.h" + +#include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/FormatVariadic.h" + +using namespace mlir; + +namespace { +// Implementation of the `OffloadingLLVMTranslationAttrInterface` model. +class SelectObjectAttrImpl + : public gpu::OffloadingLLVMTranslationAttrInterface::FallbackModel< + SelectObjectAttrImpl> { +public: + // Translates a `gpu.binary`, embedding the binary into a host LLVM module as + // global binary string. + LogicalResult embedBinary(Attribute attribute, Operation *operation, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) const; + + // Translates a `gpu.launch_func` to a sequence of LLVM instructions resulting + // in a kernel launch call. + LogicalResult launchKernel(Attribute attribute, + Operation *launchFuncOperation, + Operation *binaryOperation, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) const; +}; +// Returns an identifier for the global string holding the binary. 
+std::string getBinaryIdentifier(StringRef binaryName) {
+  return binaryName.str() + "_bin_cst";
+}
+} // namespace
+
+void mlir::gpu::registerOffloadingLLVMTranslationInterfacesExternalModels(
+    DialectRegistry &registry) {
+  registry.addExtension(+[](MLIRContext *ctx, gpu::GPUDialect *dialect) {
+    SelectObjectAttr::attachInterface<SelectObjectAttrImpl>(*ctx);
+  });
+}
+
+LogicalResult SelectObjectAttrImpl::embedBinary(
+    Attribute attribute, Operation *operation, llvm::IRBuilderBase &builder,
+    LLVM::ModuleTranslation &moduleTranslation) const {
+  assert(operation && "The binary operation must be non null.");
+  if (!operation)
+    return failure();
+
+  auto op = mlir::dyn_cast<gpu::BinaryOp>(operation);
+  if (!op) {
+    operation->emitError("Operation must be a GPU binary.");
+    return failure();
+  }
+
+  ArrayRef<Attribute> objects = op.getObjectsAttr().getValue();
+
+  // Obtain the index of the object to select.
+  int64_t index = -1;
+  if (Attribute target = cast<gpu::SelectObjectAttr>(attribute).getTarget()) {
+    // If the target attribute is a number it is the index. Otherwise compare
+    // the attribute to every target inside the object array to find the index.
+    if (auto indexAttr = mlir::dyn_cast<IntegerAttr>(target)) {
+      index = indexAttr.getInt();
+    } else {
+      for (auto [i, attr] : llvm::enumerate(objects)) {
+        // The object array is expected to hold only `#gpu.object` attributes.
+        auto obj = mlir::cast<gpu::ObjectAttr>(attr);
+        if (obj.getTarget() == target) {
+          index = i;
+        }
+      }
+    }
+  } else {
+    // If the target attribute is null then it's selecting the first object in
+    // the object array.
+    index = 0;
+  }
+
+  if (index < 0 || index >= static_cast<int64_t>(objects.size())) {
+    op->emitError("The requested target object couldn't be found.");
+    return failure();
+  }
+  auto object = mlir::cast<gpu::ObjectAttr>(objects[index]);
+
+  llvm::Module *module = moduleTranslation.getLLVMModule();
+
+  // Embed the object as a global string.
+  llvm::Constant *binary = llvm::ConstantDataArray::getString(
+      builder.getContext(), object.getObject().getValue(), false);
+  llvm::GlobalVariable *serializedObj =
+      new llvm::GlobalVariable(*module, binary->getType(), true,
+                               llvm::GlobalValue::LinkageTypes::InternalLinkage,
+                               binary, getBinaryIdentifier(op.getName()));
+  serializedObj->setAlignment(llvm::MaybeAlign(8));
+  serializedObj->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
+  return success();
+}
+
+namespace llvm {
+namespace {
+class LaunchKernel {
+public:
+  LaunchKernel(Module &module, IRBuilderBase &builder,
+               mlir::LLVM::ModuleTranslation &moduleTranslation);
+  // Get the kernel launch callee.
+  FunctionCallee getKernelLaunchFn();
+
+  // Get the module function callee.
+  FunctionCallee getModuleFunctionFn();
+
+  // Get the module load callee.
+  FunctionCallee getModuleLoadFn();
+
+  // Get the module unload callee.
+  FunctionCallee getModuleUnloadFn();
+
+  // Get the stream create callee.
+  FunctionCallee getStreamCreateFn();
+
+  // Get the stream destroy callee.
+  FunctionCallee getStreamDestroyFn();
+
+  // Get the stream sync callee.
+  FunctionCallee getStreamSyncFn();
+
+  // Get or create the function name global string.
+  Value *getOrCreateFunctionName(StringRef moduleName, StringRef kernelName);
+
+  // Create the void* kernel array for passing the arguments.
+  Value *createKernelArgArray(mlir::gpu::LaunchFuncOp op);
+
+  // Create the full kernel launch.
+ mlir::LogicalResult createKernelLaunch(mlir::gpu::LaunchFuncOp op); + +private: + Module &module; + IRBuilderBase &builder; + mlir::LLVM::ModuleTranslation &moduleTranslation; + Type *i32Ty{}; + Type *voidTy{}; + Type *intPtrTy{}; + PointerType *ptrTy{}; +}; +} // namespace +} // namespace llvm + +LogicalResult SelectObjectAttrImpl::launchKernel( + Attribute attribute, Operation *launchFuncOperation, + Operation *binaryOperation, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) const { + + assert(launchFuncOperation && "The launch func operation must be non null."); + if (!launchFuncOperation) + return failure(); + + auto launchFuncOp = mlir::dyn_cast(launchFuncOperation); + if (!launchFuncOp) { + launchFuncOperation->emitError("Operation must be a GPU launch func Op."); + return failure(); + } + + return llvm::LaunchKernel(*moduleTranslation.getLLVMModule(), builder, + moduleTranslation) + .createKernelLaunch(launchFuncOp); +} + +llvm::LaunchKernel::LaunchKernel( + Module &module, IRBuilderBase &builder, + mlir::LLVM::ModuleTranslation &moduleTranslation) + : module(module), builder(builder), moduleTranslation(moduleTranslation) { + i32Ty = builder.getInt32Ty(); + ptrTy = builder.getPtrTy(0); + voidTy = builder.getVoidTy(); + intPtrTy = builder.getIntPtrTy(module.getDataLayout()); +} + +llvm::FunctionCallee llvm::LaunchKernel::getKernelLaunchFn() { + return module.getOrInsertFunction( + "mgpuLaunchKernel", + FunctionType::get( + voidTy, + ArrayRef({ptrTy, intPtrTy, intPtrTy, intPtrTy, intPtrTy, + intPtrTy, intPtrTy, i32Ty, ptrTy, ptrTy, ptrTy}), + false)); +} + +llvm::FunctionCallee llvm::LaunchKernel::getModuleFunctionFn() { + return module.getOrInsertFunction( + "mgpuModuleGetFunction", + FunctionType::get(ptrTy, ArrayRef({ptrTy, ptrTy}), false)); +} + +llvm::FunctionCallee llvm::LaunchKernel::getModuleLoadFn() { + return module.getOrInsertFunction( + "mgpuModuleLoad", + FunctionType::get(ptrTy, ArrayRef({ptrTy}), false)); +} + 
+llvm::FunctionCallee llvm::LaunchKernel::getModuleUnloadFn() { + return module.getOrInsertFunction( + "mgpuModuleUnload", + FunctionType::get(voidTy, ArrayRef({ptrTy}), false)); +} + +llvm::FunctionCallee llvm::LaunchKernel::getStreamCreateFn() { + return module.getOrInsertFunction("mgpuStreamCreate", + FunctionType::get(ptrTy, false)); +} + +llvm::FunctionCallee llvm::LaunchKernel::getStreamDestroyFn() { + return module.getOrInsertFunction( + "mgpuStreamDestroy", + FunctionType::get(voidTy, ArrayRef({ptrTy}), false)); +} + +llvm::FunctionCallee llvm::LaunchKernel::getStreamSyncFn() { + return module.getOrInsertFunction( + "mgpuStreamSynchronize", + FunctionType::get(voidTy, ArrayRef({ptrTy}), false)); +} + +// Generates an LLVM IR dialect global that contains the name of the given +// kernel function as a C string, and returns a pointer to its beginning. +llvm::Value *llvm::LaunchKernel::getOrCreateFunctionName(StringRef moduleName, + StringRef kernelName) { + std::string globalName = + std::string(formatv("{0}_{1}_kernel_name", moduleName, kernelName)); + + if (GlobalVariable *gv = module.getGlobalVariable(globalName)) + return gv; + + return builder.CreateGlobalString(kernelName, globalName); +} + +// Creates a struct containing all kernel parameters on the stack and returns +// an array of type-erased pointers to the fields of the struct. The array can +// then be passed to the CUDA / ROCm (HIP) kernel launch calls. +// The generated code is essentially as follows: +// +// %struct = alloca(sizeof(struct { Parameters... 
})) +// %array = alloca(NumParameters * sizeof(void *)) +// for (i : [0, NumParameters)) +// %fieldPtr = llvm.getelementptr %struct[0, i] +// llvm.store parameters[i], %fieldPtr +// %elementPtr = llvm.getelementptr %array[i] +// llvm.store %fieldPtr, %elementPtr +// return %array +llvm::Value * +llvm::LaunchKernel::createKernelArgArray(mlir::gpu::LaunchFuncOp op) { + SmallVector args = + moduleTranslation.lookupValues(op.getKernelOperands()); + SmallVector structTypes(args.size(), nullptr); + + for (auto [i, arg] : llvm::enumerate(args)) + structTypes[i] = arg->getType(); + + Type *structTy = StructType::create(module.getContext(), structTypes); + Value *argStruct = builder.CreateAlloca(structTy, 0u); + Value *argArray = builder.CreateAlloca( + ptrTy, ConstantInt::get(intPtrTy, structTypes.size())); + + for (auto [i, arg] : enumerate(args)) { + Value *structMember = builder.CreateStructGEP(structTy, argStruct, i); + builder.CreateStore(arg, structMember); + Value *arrayMember = builder.CreateConstGEP1_32(ptrTy, argArray, i); + builder.CreateStore(structMember, arrayMember); + } + return argArray; +} + +// Emits LLVM IR to launch a kernel function: +// %0 = call %binarygetter +// %1 = call %moduleLoad(%0) +// %2 = +// %3 = call %moduleGetFunction(%1, %2) +// %4 = call %streamCreate() +// %5 = +// call %launchKernel(%3, , 0, %4, %5, nullptr) +// call %streamSynchronize(%4) +// call %streamDestroy(%4) +// call %moduleUnload(%1) +mlir::LogicalResult +llvm::LaunchKernel::createKernelLaunch(mlir::gpu::LaunchFuncOp op) { + auto llvmValue = [&](mlir::Value value) -> Value * { + Value *v = moduleTranslation.lookupValue(value); + assert(v && "Value has not been translated."); + return v; + }; + + // Get grid dimensions. + mlir::gpu::KernelDim3 grid = op.getGridSizeOperandValues(); + Value *gx = llvmValue(grid.x), *gy = llvmValue(grid.y), + *gz = llvmValue(grid.z); + + // Get block dimensions. 
+ mlir::gpu::KernelDim3 block = op.getBlockSizeOperandValues(); + Value *bx = llvmValue(block.x), *by = llvmValue(block.y), + *bz = llvmValue(block.z); + + // Get dynamic shared memory size. + Value *dynamicMemorySize = nullptr; + if (mlir::Value dynSz = op.getDynamicSharedMemorySize()) + dynamicMemorySize = llvmValue(dynSz); + else + dynamicMemorySize = ConstantInt::get(i32Ty, 0); + + // Create the argument array. + Value *argArray = createKernelArgArray(op); + + // Load the kernel module. + StringRef moduleName = op.getKernelModuleName().getValue(); + std::string binaryIdentifier = getBinaryIdentifier(moduleName); + Value *binary = module.getGlobalVariable(binaryIdentifier, true); + if (!binary) + return op.emitError() << "Couldn't find the binary: " << binaryIdentifier; + Value *moduleObject = builder.CreateCall(getModuleLoadFn(), {binary}); + + // Load the kernel function. + Value *moduleFunction = builder.CreateCall( + getModuleFunctionFn(), + {moduleObject, + getOrCreateFunctionName(moduleName, op.getKernelName().getValue())}); + + // Get the stream to use for execution. If there's no async object then create + // a stream to make a synchronous kernel launch. + Value *stream = nullptr; + bool handleStream = false; + if (mlir::Value asyncObject = op.getAsyncObject()) { + stream = llvmValue(asyncObject); + } else { + handleStream = true; + stream = builder.CreateCall(getStreamCreateFn(), {}); + } + + // Create the launch call. + Value *nullPtr = ConstantPointerNull::get(ptrTy); + builder.CreateCall( + getKernelLaunchFn(), + ArrayRef({moduleFunction, gx, gy, gz, bx, by, bz, + dynamicMemorySize, stream, argArray, nullPtr})); + + // Sync & destroy the stream, for synchronous launches. + if (handleStream) { + builder.CreateCall(getStreamSyncFn(), {stream}); + builder.CreateCall(getStreamDestroyFn(), {stream}); + } + + // Unload the kernel module. 
+ builder.CreateCall(getModuleUnloadFn(), {moduleObject}); + + return success(); +} diff --git a/mlir/test/Dialect/GPU/invalid.mlir b/mlir/test/Dialect/GPU/invalid.mlir --- a/mlir/test/Dialect/GPU/invalid.mlir +++ b/mlir/test/Dialect/GPU/invalid.mlir @@ -637,6 +637,6 @@ // ----- module { - // expected-error @+1 {{custom op 'gpu.binary' invalid kind of attribute specified}} + // expected-error @+1 {{'gpu.binary' op attribute 'offloadingHandler' failed to satisfy constraint: OffloadingLLVMTranslationAttrInterface instance}} gpu.binary @binary <1> [#gpu.object<#nvvm.target, "">] } diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -121,6 +121,12 @@ } } + gpu.binary @binary_1 [#gpu.object<#nvvm.target, "">] + + gpu.binary @binary_2 <#gpu.select_object<#rocdl.target>> [#gpu.object<#nvvm.target, "">, #gpu.object<#rocdl.target, "">] + + gpu.binary @binary_3 <#gpu.select_object<1>> [#gpu.object<#nvvm.target, "">, #gpu.object<#rocdl.target, "">] + func.func private @two_value_generator() -> (f32, memref) func.func @foo() { @@ -150,6 +156,9 @@ // CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i32 args(%{{.*}} : f32, %{{.*}} : memref) gpu.launch_func @kernels::@kernel_1 blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) : i32 args(%0 : f32, %1 : memref) + // CHECK: gpu.launch_func @binary_1::@kernel blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) : i32 args(%{{.*}} : f32, %{{.*}} : memref) + gpu.launch_func @binary_1::@kernel blocks in (%c0, %c0, %c0) threads in (%c0, %c0, %c0) : i32 args(%0 : f32, %1 : memref) + // CHECK: %[[VALUES:.*]]:2 = call %values:2 = func.call @two_value_generator() : () -> (f32, memref) // CHECK: gpu.launch_func @kernels::@kernel_1 {{.*}} args(%[[VALUES]]#0 : f32, %[[VALUES]]#1 : memref)