Changeset View
Changeset View
Standalone View
Standalone View
mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp
Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines | |||||
/// * mcuModuleLoad -- loads the module given the cubin data | /// * mcuModuleLoad -- loads the module given the cubin data | ||||
/// * mcuModuleGetFunction -- gets a handle to the actual kernel function | /// * mcuModuleGetFunction -- gets a handle to the actual kernel function | ||||
/// * mcuGetStreamHelper -- initializes a new CUDA stream | /// * mcuGetStreamHelper -- initializes a new CUDA stream | ||||
/// * mcuLaunchKernel -- launches the kernel on a stream | /// * mcuLaunchKernel -- launches the kernel on a stream | ||||
/// * mcuStreamSynchronize -- waits for operations on the stream to finish | /// * mcuStreamSynchronize -- waits for operations on the stream to finish | ||||
/// | /// | ||||
/// Intermediate data structures are allocated on the stack. | /// Intermediate data structures are allocated on the stack. | ||||
class GpuLaunchFuncToCudaCallsPass | class GpuLaunchFuncToCudaCallsPass | ||||
: public ModulePass<GpuLaunchFuncToCudaCallsPass> { | : public OperationPass<GpuLaunchFuncToCudaCallsPass, ModuleOp> { | ||||
private: | private: | ||||
/// Include the generated pass utilities. | /// Include the generated pass utilities. | ||||
#define GEN_PASS_ConvertGpuLaunchFuncToCudaCalls | #define GEN_PASS_ConvertGpuLaunchFuncToCudaCalls | ||||
#include "mlir/Conversion/Passes.h.inc" | #include "mlir/Conversion/Passes.h.inc" | ||||
/// Returns the LLVM dialect pointer stored in the `llvmDialect` member. The
/// member is assigned in the pass entry point, so the value is only meaningful
/// once the pass has started running.
LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; } | LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; } | ||||
llvm::LLVMContext &getLLVMContext() { | llvm::LLVMContext &getLLVMContext() { | ||||
▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines | void addParamToList(OpBuilder &builder, Location loc, Value param, Value list, | ||||
unsigned pos, Value one); | unsigned pos, Value one); | ||||
Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder); | Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder); | ||||
Value generateKernelNameConstant(StringRef name, Location loc, | Value generateKernelNameConstant(StringRef name, Location loc, | ||||
OpBuilder &builder); | OpBuilder &builder); | ||||
void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp); | void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp); | ||||
public: | public: | ||||
// Run the dialect converter on the module. | // Run the dialect converter on the module. | ||||
// Pass entry point. The left column is the old ModulePass hook (runOnModule /
// getModule()); the right column is the OperationPass<..., ModuleOp> hook
// (runOnOperation / getOperation()). The steps are the same on both sides:
//   1. cache the LLVM dialect and the LLVM types used by the helpers,
//   2. lower every gpu.launch_func found under the module,
//   3. erase the GPU kernel modules.
void runOnModule() override { | void runOnOperation() override { | ||||
// Cache the LLVMDialect for the current module. | // Cache the LLVMDialect for the current module. | ||||
llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>(); | llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>(); | ||||
// Cache the used LLVM types. | // Cache the used LLVM types. | ||||
initializeCachedTypes(); | initializeCachedTypes(); | ||||
// Visit each launch op; translateGpuLaunchCalls emits the replacement calls
// and, on its success path, erases the launch op itself. On failure it
// signals pass failure and returns, but the walk still continues.
getModule().walk([this](mlir::gpu::LaunchFuncOp op) { | getOperation().walk( | ||||
translateGpuLaunchCalls(op); | [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); }); | ||||
}); | |||||
// GPU kernel modules are no longer necessary since we have a global | // GPU kernel modules are no longer necessary since we have a global | ||||
// constant with the CUBIN data. | // constant with the CUBIN data. | ||||
// make_early_inc_range advances the iterator before the loop body runs, so
// erasing `m` does not invalidate the iteration.
// NOTE(review): this erase runs even if a launch translation signalled
// failure above — confirm that is intended.
for (auto m : | for (auto m : | ||||
llvm::make_early_inc_range(getModule().getOps<gpu::GPUModuleOp>())) | llvm::make_early_inc_range(getOperation().getOps<gpu::GPUModuleOp>())) | ||||
m.erase(); | m.erase(); | ||||
} | } | ||||
private: | private: | ||||
LLVM::LLVMDialect *llvmDialect; | LLVM::LLVMDialect *llvmDialect; | ||||
LLVM::LLVMType llvmVoidType; | LLVM::LLVMType llvmVoidType; | ||||
LLVM::LLVMType llvmPointerType; | LLVM::LLVMType llvmPointerType; | ||||
LLVM::LLVMType llvmPointerPointerType; | LLVM::LLVMType llvmPointerPointerType; | ||||
LLVM::LLVMType llvmInt8Type; | LLVM::LLVMType llvmInt8Type; | ||||
LLVM::LLVMType llvmInt32Type; | LLVM::LLVMType llvmInt32Type; | ||||
LLVM::LLVMType llvmInt64Type; | LLVM::LLVMType llvmInt64Type; | ||||
LLVM::LLVMType llvmIntPtrType; | LLVM::LLVMType llvmIntPtrType; | ||||
}; | }; | ||||
} // anonymous namespace | } // anonymous namespace | ||||
// Adds declarations for the needed helper functions from the CUDA wrapper. | // Adds declarations for the needed helper functions from the CUDA wrapper. | ||||
// The types in comments give the actual types expected/returned but the API | // The types in comments give the actual types expected/returned but the API | ||||
// uses void pointers. This is fine as they have the same linkage in C. | // uses void pointers. This is fine as they have the same linkage in C. | ||||
void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) { | void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) { | ||||
ModuleOp module = getModule(); | ModuleOp module = getOperation(); | ||||
OpBuilder builder(module.getBody()->getTerminator()); | OpBuilder builder(module.getBody()->getTerminator()); | ||||
if (!module.lookupSymbol(cuModuleLoadName)) { | if (!module.lookupSymbol(cuModuleLoadName)) { | ||||
builder.create<LLVM::LLVMFuncOp>( | builder.create<LLVM::LLVMFuncOp>( | ||||
loc, cuModuleLoadName, | loc, cuModuleLoadName, | ||||
LLVM::LLVMType::getFunctionTy( | LLVM::LLVMType::getFunctionTy( | ||||
getCUResultType(), | getCUResultType(), | ||||
{ | { | ||||
getPointerPointerType(), /* CUmodule *module */ | getPointerPointerType(), /* CUmodule *module */ | ||||
▲ Show 20 Lines • Show All 214 Lines • ▼ Show 20 Lines | void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls( | ||||
OpBuilder builder(launchOp); | OpBuilder builder(launchOp); | ||||
Location loc = launchOp.getLoc(); | Location loc = launchOp.getLoc(); | ||||
declareCudaFunctions(loc); | declareCudaFunctions(loc); | ||||
auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(), | auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(), | ||||
builder.getI32IntegerAttr(0)); | builder.getI32IntegerAttr(0)); | ||||
// Create an LLVM global with CUBIN extracted from the kernel annotation and | // Create an LLVM global with CUBIN extracted from the kernel annotation and | ||||
// obtain a pointer to the first byte in it. | // obtain a pointer to the first byte in it. | ||||
auto kernelModule = getModule().lookupSymbol<gpu::GPUModuleOp>( | auto kernelModule = getOperation().lookupSymbol<gpu::GPUModuleOp>( | ||||
launchOp.getKernelModuleName()); | launchOp.getKernelModuleName()); | ||||
assert(kernelModule && "expected a kernel module"); | assert(kernelModule && "expected a kernel module"); | ||||
auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation); | auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation); | ||||
if (!cubinAttr) { | if (!cubinAttr) { | ||||
kernelModule.emitOpError() | kernelModule.emitOpError() | ||||
<< "missing " << kCubinAnnotation << " attribute"; | << "missing " << kCubinAnnotation << " attribute"; | ||||
return signalPassFailure(); | return signalPassFailure(); | ||||
} | } | ||||
SmallString<128> nameBuffer(kernelModule.getName()); | SmallString<128> nameBuffer(kernelModule.getName()); | ||||
nameBuffer.append(kCubinStorageSuffix); | nameBuffer.append(kCubinStorageSuffix); | ||||
Value data = LLVM::createGlobalString( | Value data = LLVM::createGlobalString( | ||||
loc, builder, nameBuffer.str(), cubinAttr.getValue(), | loc, builder, nameBuffer.str(), cubinAttr.getValue(), | ||||
LLVM::Linkage::Internal, getLLVMDialect()); | LLVM::Linkage::Internal, getLLVMDialect()); | ||||
// Emit the load module call to load the module data. Error checking is done | // Emit the load module call to load the module data. Error checking is done | ||||
// in the called helper function. | // in the called helper function. | ||||
auto cuModule = allocatePointer(builder, loc); | auto cuModule = allocatePointer(builder, loc); | ||||
auto cuModuleLoad = | auto cuModuleLoad = | ||||
getModule().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleLoadName); | getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleLoadName); | ||||
builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()}, | builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()}, | ||||
builder.getSymbolRefAttr(cuModuleLoad), | builder.getSymbolRefAttr(cuModuleLoad), | ||||
ArrayRef<Value>{cuModule, data}); | ArrayRef<Value>{cuModule, data}); | ||||
// Get the function from the module. The name corresponds to the name of | // Get the function from the module. The name corresponds to the name of | ||||
// the kernel function. | // the kernel function. | ||||
auto cuOwningModuleRef = | auto cuOwningModuleRef = | ||||
builder.create<LLVM::LoadOp>(loc, getPointerType(), cuModule); | builder.create<LLVM::LoadOp>(loc, getPointerType(), cuModule); | ||||
auto kernelName = generateKernelNameConstant(launchOp.kernel(), loc, builder); | auto kernelName = generateKernelNameConstant(launchOp.kernel(), loc, builder); | ||||
auto cuFunction = allocatePointer(builder, loc); | auto cuFunction = allocatePointer(builder, loc); | ||||
auto cuModuleGetFunction = | auto cuModuleGetFunction = | ||||
getModule().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleGetFunctionName); | getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleGetFunctionName); | ||||
builder.create<LLVM::CallOp>( | builder.create<LLVM::CallOp>( | ||||
loc, ArrayRef<Type>{getCUResultType()}, | loc, ArrayRef<Type>{getCUResultType()}, | ||||
builder.getSymbolRefAttr(cuModuleGetFunction), | builder.getSymbolRefAttr(cuModuleGetFunction), | ||||
ArrayRef<Value>{cuFunction, cuOwningModuleRef, kernelName}); | ArrayRef<Value>{cuFunction, cuOwningModuleRef, kernelName}); | ||||
// Grab the global stream needed for execution. | // Grab the global stream needed for execution. | ||||
auto cuGetStreamHelper = | auto cuGetStreamHelper = | ||||
getModule().lookupSymbol<LLVM::LLVMFuncOp>(cuGetStreamHelperName); | getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuGetStreamHelperName); | ||||
auto cuStream = builder.create<LLVM::CallOp>( | auto cuStream = builder.create<LLVM::CallOp>( | ||||
loc, ArrayRef<Type>{getPointerType()}, | loc, ArrayRef<Type>{getPointerType()}, | ||||
builder.getSymbolRefAttr(cuGetStreamHelper), ArrayRef<Value>{}); | builder.getSymbolRefAttr(cuGetStreamHelper), ArrayRef<Value>{}); | ||||
// Invoke the function with required arguments. | // Invoke the function with required arguments. | ||||
auto cuLaunchKernel = | auto cuLaunchKernel = | ||||
getModule().lookupSymbol<LLVM::LLVMFuncOp>(cuLaunchKernelName); | getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuLaunchKernelName); | ||||
auto cuFunctionRef = | auto cuFunctionRef = | ||||
builder.create<LLVM::LoadOp>(loc, getPointerType(), cuFunction); | builder.create<LLVM::LoadOp>(loc, getPointerType(), cuFunction); | ||||
auto paramsArray = setupParamsArray(launchOp, builder); | auto paramsArray = setupParamsArray(launchOp, builder); | ||||
if (!paramsArray) { | if (!paramsArray) { | ||||
launchOp.emitOpError() << "cannot pass given parameters to the kernel"; | launchOp.emitOpError() << "cannot pass given parameters to the kernel"; | ||||
return signalPassFailure(); | return signalPassFailure(); | ||||
} | } | ||||
auto nullpointer = | auto nullpointer = | ||||
builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero); | builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero); | ||||
builder.create<LLVM::CallOp>( | builder.create<LLVM::CallOp>( | ||||
loc, ArrayRef<Type>{getCUResultType()}, | loc, ArrayRef<Type>{getCUResultType()}, | ||||
builder.getSymbolRefAttr(cuLaunchKernel), | builder.getSymbolRefAttr(cuLaunchKernel), | ||||
ArrayRef<Value>{cuFunctionRef, launchOp.getOperand(0), | ArrayRef<Value>{cuFunctionRef, launchOp.getOperand(0), | ||||
launchOp.getOperand(1), launchOp.getOperand(2), | launchOp.getOperand(1), launchOp.getOperand(2), | ||||
launchOp.getOperand(3), launchOp.getOperand(4), | launchOp.getOperand(3), launchOp.getOperand(4), | ||||
launchOp.getOperand(5), zero, /* sharedMemBytes */ | launchOp.getOperand(5), zero, /* sharedMemBytes */ | ||||
cuStream.getResult(0), /* stream */ | cuStream.getResult(0), /* stream */ | ||||
paramsArray, /* kernel params */ | paramsArray, /* kernel params */ | ||||
nullpointer /* extra */}); | nullpointer /* extra */}); | ||||
// Sync on the stream to make it synchronous. | // Sync on the stream to make it synchronous. | ||||
auto cuStreamSync = | auto cuStreamSync = | ||||
getModule().lookupSymbol<LLVM::LLVMFuncOp>(cuStreamSynchronizeName); | getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuStreamSynchronizeName); | ||||
builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()}, | builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()}, | ||||
builder.getSymbolRefAttr(cuStreamSync), | builder.getSymbolRefAttr(cuStreamSync), | ||||
ArrayRef<Value>(cuStream.getResult(0))); | ArrayRef<Value>(cuStream.getResult(0))); | ||||
launchOp.erase(); | launchOp.erase(); | ||||
} | } | ||||
// Factory for the pass: returns a newly constructed
// GpuLaunchFuncToCudaCallsPass owned by a unique_ptr typed as a pass over
// ModuleOp. Both diff columns are identical, i.e. the public signature is
// unchanged by the ModulePass -> OperationPass migration.
std::unique_ptr<mlir::OpPassBase<mlir::ModuleOp>> | std::unique_ptr<mlir::OpPassBase<mlir::ModuleOp>> | ||||
mlir::createConvertGpuLaunchFuncToCudaCallsPass() { | mlir::createConvertGpuLaunchFuncToCudaCallsPass() { | ||||
return std::make_unique<GpuLaunchFuncToCudaCallsPass>(); | return std::make_unique<GpuLaunchFuncToCudaCallsPass>(); | ||||
} | } | ||||