diff --git a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h --- a/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h +++ b/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h @@ -392,7 +392,7 @@ LLVM::LLVMType getVoidType() const; /// Get the MLIR type wrapping the LLVM i8* type. - LLVM::LLVMType getVoidPtrType() const; + LLVM::LLVMType getVoidPtrType(unsigned addrspace = 0) const; /// Create an LLVM dialect operation defining the given index constant. Value createIndexConstant(ConversionPatternRewriter &builder, Location loc, diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h --- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h +++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h @@ -18,7 +18,9 @@ #include "mlir/IR/FunctionSupport.h" #include "mlir/IR/OpDefinition.h" #include "mlir/IR/OpImplementation.h" +#include "mlir/IR/StandardTypes.h" #include "mlir/IR/SymbolTable.h" +#include "mlir/IR/Types.h" #include "mlir/Interfaces/SideEffects.h" namespace mlir { @@ -34,6 +36,24 @@ Value z; }; +enum GpuTypes { + Chain = Type::FIRST_GPU_TYPE, + LAST_USED_GPU_TYPE = Chain, +}; + +class ChainType : public Type::TypeBase { +public: + // Used for generic hooks in TypeBase. + using Base::Base; + /// Construction hook. + static ChainType get(MLIRContext *context) { + /// Custom, uniq'ed construction in the MLIRContext. + return Base::get(context, GpuTypes::Chain); + } + /// Used to implement llvm-style cast. + static bool kindof(unsigned kind) { return kind == GpuTypes::Chain; } +}; + #include "mlir/Dialect/GPU/GPUOpsDialect.h.inc" #define GET_OP_CLASSES diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -23,6 +23,10 @@ Or<[AnySignlessInteger.predicate, Index.predicate, LLVMInt.predicate]>, "integer, index or LLVM dialect equivalent">; +def GPU_ChainType : DialectType< + GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::ChainType>()">, "chain">, + BuildableType<"mlir::gpu::ChainType::get($_builder.getContext())">; + //===----------------------------------------------------------------------===// // GPU Dialect operations. //===----------------------------------------------------------------------===// @@ -245,13 +249,13 @@ let parser = [{ return parseGPUFuncOp(parser, result); }]; } -def GPU_LaunchFuncOp : GPU_Op<"launch_func">, - Arguments<(ins IntLikeOrLLVMInt:$gridSizeX, IntLikeOrLLVMInt:$gridSizeY, +def GPU_LaunchFuncOp : GPU_Op<"launch_func", [AttrSizedOperandSegments]>, + Arguments<(ins Variadic:$chains, IntLikeOrLLVMInt:$gridSizeX, IntLikeOrLLVMInt:$gridSizeY, IntLikeOrLLVMInt:$gridSizeZ, IntLikeOrLLVMInt:$blockSizeX, IntLikeOrLLVMInt:$blockSizeY, IntLikeOrLLVMInt:$blockSizeZ, Variadic:$operands)>, - Results<(outs)> { - let summary = "Launches a function as a GPU kerneel"; + Results<(outs GPU_ChainType)> { + let summary = "Launches a function as a GPU kernel"; let description = [{ Launch a kernel function on the specified grid of thread blocks. 
@@ -323,11 +327,11 @@ let skipDefaultBuilders = 1; let builders = [ - OpBuilder<"Builder *builder, OperationState &result, GPUFuncOp kernelFunc, " + OpBuilder<"Builder *builder, OperationState &result, ValueRange chains, GPUFuncOp kernelFunc, " "Value gridSizeX, Value gridSizeY, Value gridSizeZ, " "Value blockSizeX, Value blockSizeY, Value blockSizeZ, " "ValueRange kernelOperands">, - OpBuilder<"Builder *builder, OperationState &result, GPUFuncOp kernelFunc, " + OpBuilder<"Builder *builder, OperationState &result, ValueRange chains, GPUFuncOp kernelFunc, " "KernelDim3 gridSize, KernelDim3 blockSize, " "ValueRange kernelOperands"> ]; @@ -352,6 +356,8 @@ /// Get the SSA values passed as operands to specify the block size. KernelDim3 getBlockSizeOperandValues(); + OperandRange getChains(); + /// The number of launch configuration operands, placed at the leading /// positions of the operand list. static constexpr unsigned kNumConfigOperands = 6; @@ -373,9 +379,11 @@ } def GPU_LaunchOp : GPU_Op<"launch">, - Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, + Arguments<(ins Variadic:$chains, + Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ)>, - Results<(outs)> { + // Variadic (0 or 1) to be backwards compatible. + Results<(outs Variadic)> { let summary = "GPU kernel launch operation"; let description = [{ @@ -446,7 +454,10 @@ let builders = [ OpBuilder<"Builder *builder, OperationState &result, Value gridSizeX," "Value gridSizeY, Value gridSizeZ, Value blockSizeX," - "Value blockSizeY, Value blockSizeZ"> + "Value blockSizeY, Value blockSizeZ">, + OpBuilder<"Builder *builder, OperationState &result, ValueRange chains, " + "Value gridSizeX, Value gridSizeY, Value gridSizeZ, " + "Value blockSizeX, Value blockSizeY, Value blockSizeZ"> ]; let extraClassDeclaration = [{ @@ -464,6 +475,8 @@ /// Get the SSA values passed as operands to specify the block size. KernelDim3 getBlockSizeOperandValues(); + OperandRange getChains(); + static StringRef getBlocksKeyword() { return "blocks"; } static StringRef getThreadsKeyword() { return "threads"; } @@ -641,6 +654,46 @@ let printer = [{ p << getOperationName(); }]; } +def GPU_MemcpyOp : GPU_Op<"memcpy"> { + let summary = "GPU memory copy operation"; + let description = [{ + The `memcpy` operation copies a region of memory from GPU to host or vice + versa. + }]; + + // TODO(csigg): src and dst should be AnyUnrankedMemRef. + let arguments = (ins Variadic:$chains, AnyType:$dst, + AnyType:$src, OptionalAttr:$element_size); + let results = (outs GPU_ChainType); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, ValueRange chains, Value dst, " # + "Value src", [{ + result.addOperands(chains); + result.addOperands({dst, src}); + result.types.push_back(ChainType::get(builder->getContext())); + }]>]; + + let extraClassDeclaration = [{ + }]; + + let assemblyFormat = [{ + (`[` $chains^ `]`)? `(` $dst`,` $src `)` attr-dict `:` type($dst)`,` type($src) + }]; +} + +def GPU_WaitOp : GPU_Op<"wait"> { + let summary = "GPU async wait operation"; + let description = [{ + The `wait` operation blocks on a list of chains. 
+ }]; + + let arguments = (ins Variadic:$chains); + let results = (outs); + + let assemblyFormat = "`[` $chains `]` attr-dict"; +} + def GPU_GPUModuleOp : GPU_Op<"module", [ IsolatedFromAbove, SymbolTable, Symbol, SingleBlockImplicitTerminator<"ModuleEndOp"> diff --git a/mlir/include/mlir/IR/DialectSymbolRegistry.def b/mlir/include/mlir/IR/DialectSymbolRegistry.def --- a/mlir/include/mlir/IR/DialectSymbolRegistry.def +++ b/mlir/include/mlir/IR/DialectSymbolRegistry.def @@ -25,6 +25,7 @@ DEFINE_SYM_KIND_RANGE(SPIRV) // SPIR-V dialect DEFINE_SYM_KIND_RANGE(XLA_HLO) // XLA HLO dialect DEFINE_SYM_KIND_RANGE(SHAPE) // Shape dialect +DEFINE_SYM_KIND_RANGE(GPU) // GPU dialect // The following ranges are reserved for experimenting with MLIR dialects in a // private context without having to register them here. diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp --- a/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Module.h" #include "llvm/Support/Error.h" +#include "llvm/Support/Mutex.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Target/TargetMachine.h" @@ -57,6 +58,11 @@ void runOnOperation() override { gpu::GPUModuleOp module = getOperation(); + auto llvmDialect = + module.getContext()->getRegisteredDialect(); + llvm::sys::SmartScopedLock scopedLock( + llvmDialect->getLLVMContextMutex()); + // Make sure the NVPTX target is initialized. LLVMInitializeNVPTXTarget(); LLVMInitializeNVPTXTargetInfo(); diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp --- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp +++ b/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp @@ -41,6 +41,10 @@ static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper"; static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize"; static constexpr const char *kMcuMemHostRegister = "mcuMemHostRegister"; +static constexpr const char *cuGraphAddKernelNodeName = "mcuGraphAddKernelNode"; +static constexpr const char *cuGraphAddMemcpyNodeName = "mcuGraphAddMemcpyNode"; +static constexpr const char *cuGraphExecuteName = "mcuGraphExecute"; +static constexpr const char *cuGetGraphHelperName = "mcuGetGraphHelper"; static constexpr const char *kCubinAnnotation = "nvvm.cubin"; static constexpr const char *kCubinStorageSuffix = "_cubin_cst"; @@ -79,6 +83,17 @@ llvmInt64Type = LLVM::LLVMType::getInt64Ty(llvmDialect); llvmIntPtrType = LLVM::LLVMType::getIntNTy( llvmDialect, module.getDataLayout().getPointerSizeInBits()); + llvmKernelNodeParamsType = LLVM::LLVMType::getStructTy( + llvmDialect, + { + getPointerType(), // CUfunction + getInt32Type(), getInt32Type(), getInt32Type(), // gridDim + getInt32Type(), getInt32Type(), getInt32Type(), // blockDim + getInt32Type(), // sharedMemBytes + getPointerPointerType(), // kernelParams + getPointerPointerType() // extras + }, + false); } LLVM::LLVMType getVoidType() { return llvmVoidType; } @@ -104,6 +119,8 @@ return getInt32Type(); } + LLVM::LLVMType getKernelNodeParamsType() { return llvmKernelNodeParamsType; } + // Allocate a void pointer on the stack. 
Value allocatePointer(OpBuilder &builder, Location loc) { auto one = builder.create(loc, getInt32Type(), @@ -119,6 +136,8 @@ Value generateKernelNameConstant(StringRef moduleName, StringRef name, Location loc, OpBuilder &builder); void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp); + void translateGpuMemcpyOp(mlir::gpu::MemcpyOp memcpyOp); + void translateGpuWaitOp(mlir::gpu::WaitOp waitOp); public: // Run the dialect converter on the module. @@ -131,6 +150,11 @@ getOperation().walk( [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); }); + getOperation().walk( + [this](mlir::gpu::MemcpyOp op) { translateGpuMemcpyOp(op); }); + getOperation().walk( + [this](mlir::gpu::WaitOp op) { translateGpuWaitOp(op); }); + // GPU kernel modules are no longer necessary since we have a global // constant with the CUBIN data. for (auto m : @@ -147,6 +171,7 @@ LLVM::LLVMType llvmInt32Type; LLVM::LLVMType llvmInt64Type; LLVM::LLVMType llvmIntPtrType; + LLVM::LLVMType llvmKernelNodeParamsType; }; } // anonymous namespace @@ -230,6 +255,58 @@ }, /*isVarArg=*/false)); } + if (!module.lookupSymbol(cuGraphAddKernelNodeName)) { + builder.create( + loc, cuGraphAddKernelNodeName, + LLVM::LLVMType::getFunctionTy( + getCUResultType(), + { + getPointerPointerType(), /* CUgraphNode* graph_node */ + getPointerType(), /* CUgraph graph */ + getPointerPointerType(), /* const CUgraphNode* dependencies */ + getInt32Type(), /* size_t numDependencies */ + getKernelNodeParamsType() + .getPointerTo(), /* CUDA_KERNEL_NODE_PARAMS* params */ + }, + /*isVarArg=*/false)); + } + if (!module.lookupSymbol(cuGraphAddMemcpyNodeName)) { + auto memrefTy = LLVM::LLVMType::getStructTy( + llvmDialect, {getInt64Type(), getPointerType()}); + builder.create( + loc, cuGraphAddMemcpyNodeName, + LLVM::LLVMType::getFunctionTy( + getCUResultType(), + { + getPointerPointerType(), /* CUgraphNode* graph_node */ + getPointerType(), /* CUgraph graph */ + getPointerPointerType(), /* const CUgraphNode* dependencies */ + getInt32Type(), /* size_t numDependencies */ + memrefTy, /* dst */ + memrefTy, /* src */ + getInt64Type(), /* size_t element_size */ + }, + /*isVarArg=*/false)); + } + if (!module.lookupSymbol(cuGraphExecuteName)) { + builder.create( + loc, cuGraphExecuteName, + LLVM::LLVMType::getFunctionTy( + getCUResultType(), + { + getPointerType(), /* CUgraph graph */ + getPointerPointerType(), /* const CUgraphNode* dependencies */ + getInt32Type(), /* size_t numDependencies */ + }, + /*isVarArg=*/false)); + } + if (!module.lookupSymbol(cuGetGraphHelperName)) { + // Helper function to get the current CUDA stream. Uses void* instead of + // CUDAs opaque CUstream. + builder.create( + loc, cuGetGraphHelperName, + LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false)); + } } /// Emits the IR with the following structure: @@ -426,14 +503,14 @@ builder.getSymbolRefAttr(cuModuleGetFunction), ArrayRef{cuFunction, cuOwningModuleRef, kernelName}); // Grab the global stream needed for execution. - auto cuGetStreamHelper = - getOperation().lookupSymbol(cuGetStreamHelperName); - auto cuStream = builder.create( + auto cuGetGraphHelper = + getOperation().lookupSymbol(cuGetGraphHelperName); + auto cuGraph = builder.create( loc, ArrayRef{getPointerType()}, - builder.getSymbolRefAttr(cuGetStreamHelper), ArrayRef{}); + builder.getSymbolRefAttr(cuGetGraphHelper), ArrayRef{}); // Invoke the function with required arguments. 
- auto cuLaunchKernel = - getOperation().lookupSymbol(cuLaunchKernelName); + auto cuGraphAddKernelNode = + getOperation().lookupSymbol(cuGraphAddKernelNodeName); auto cuFunctionRef = builder.create(loc, getPointerType(), cuFunction); auto paramsArray = setupParamsArray(launchOp, builder); @@ -441,27 +518,142 @@ launchOp.emitOpError() << "cannot pass given parameters to the kernel"; return signalPassFailure(); } + + auto one = builder.create(loc, getInt32Type(), + builder.getI32IntegerAttr(1)); + auto kernelNodeParams = builder.create( + loc, getKernelNodeParamsType().getPointerTo(), one, /*alignement=*/0); + auto setParamsValue = [&](Value value, size_t i) { + auto index = builder.create(loc, getInt32Type(), + builder.getI32IntegerAttr(i)); + auto ptrTy = value.getType().cast().getPointerTo(); + auto gep = builder.create(loc, ptrTy, kernelNodeParams, + ArrayRef{zero, index}); + builder.create(loc, value, gep); + }; + + setParamsValue(cuFunctionRef, 0); + for (const auto &en : + llvm::enumerate(launchOp.getOperands() + .drop_front(launchOp.getChains().size()) + .take_front(launchOp.kNumConfigOperands))) { + auto truncOp = + builder.create(loc, getInt32Type(), en.value()); + setParamsValue(truncOp, 1 + en.index()); + } + setParamsValue(/*sharedMemSizeBytes=*/zero, 7); + setParamsValue(paramsArray, 8); auto nullpointer = builder.create(loc, getPointerPointerType(), zero); + setParamsValue(nullpointer, 9); + + auto chainsSize = builder.create( + loc, getInt32Type(), builder.getI32IntegerAttr(launchOp.chains().size())); + auto chainsArray = builder.create( + loc, getPointerPointerType(), chainsSize, /*alignment=*/0); + for (size_t i = 0; i < launchOp.chains().size(); ++i) { + auto index = builder.create(loc, getInt32Type(), + builder.getI32IntegerAttr(i)); + auto gep = builder.create(loc, getPointerPointerType(), + chainsArray, ArrayRef{index}); + builder.create(loc, launchOp.chains()[i], gep); + } + + auto nodePointer = allocatePointer(builder, loc); + builder.create( loc, ArrayRef{getCUResultType()}, - builder.getSymbolRefAttr(cuLaunchKernel), - ArrayRef{cuFunctionRef, launchOp.getOperand(0), - launchOp.getOperand(1), launchOp.getOperand(2), - launchOp.getOperand(3), launchOp.getOperand(4), - launchOp.getOperand(5), zero, /* sharedMemBytes */ - cuStream.getResult(0), /* stream */ - paramsArray, /* kernel params */ - nullpointer /* extra */}); - // Sync on the stream to make it synchronous. 
- auto cuStreamSync = - getOperation().lookupSymbol(cuStreamSynchronizeName); - builder.create(loc, ArrayRef{getCUResultType()}, - builder.getSymbolRefAttr(cuStreamSync), - ArrayRef(cuStream.getResult(0))); + builder.getSymbolRefAttr(cuGraphAddKernelNode), + ArrayRef{nodePointer, cuGraph.getResult(0), chainsArray, + chainsSize, kernelNodeParams}); + + Value nodeRef = + builder.create(loc, getPointerType(), nodePointer); + + launchOp.replaceAllUsesWith(nodeRef); launchOp.erase(); } +void GpuLaunchFuncToCudaCallsPass::translateGpuMemcpyOp( + mlir::gpu::MemcpyOp memcpyOp) { + OpBuilder builder(memcpyOp); + Location loc = memcpyOp.getLoc(); + declareCudaFunctions(loc); + + auto chainsSize = builder.create( + loc, getInt32Type(), builder.getI32IntegerAttr(memcpyOp.chains().size())); + auto chainsArray = builder.create( + loc, getPointerPointerType(), chainsSize, /*alignment=*/0); + for (size_t i = 0; i < memcpyOp.chains().size(); ++i) { + auto index = builder.create(loc, getInt32Type(), + builder.getI32IntegerAttr(i)); + auto gep = builder.create(loc, getPointerPointerType(), + chainsArray, ArrayRef{index}); + builder.create(loc, memcpyOp.chains()[i], gep); + } + + auto nodePointer = allocatePointer(builder, loc); + + auto cuGetGraphHelper = + getOperation().lookupSymbol(cuGetGraphHelperName); + auto cuGraph = builder.create( + loc, ArrayRef{getPointerType()}, + builder.getSymbolRefAttr(cuGetGraphHelper), ArrayRef{}); + + auto elementSize = builder.create( + loc, getInt64Type(), + builder.getI64IntegerAttr(memcpyOp.element_size()->getLimitedValue())); + + auto cuGraphAddMemcpyNode = + getOperation().lookupSymbol(cuGraphAddMemcpyNodeName); + + builder.create( + loc, ArrayRef{getCUResultType()}, + builder.getSymbolRefAttr(cuGraphAddMemcpyNode), + ArrayRef{nodePointer, cuGraph.getResult(0), chainsArray, + chainsSize, memcpyOp.dst(), memcpyOp.src(), elementSize}); + + Value nodeRef = + builder.create(loc, getPointerType(), nodePointer); + + memcpyOp.replaceAllUsesWith(nodeRef); + memcpyOp.erase(); +} + +void GpuLaunchFuncToCudaCallsPass::translateGpuWaitOp( + mlir::gpu::WaitOp waitOp) { + OpBuilder builder(waitOp); + Location loc = waitOp.getLoc(); + declareCudaFunctions(loc); + + auto chainsSize = builder.create( + loc, getInt32Type(), builder.getI32IntegerAttr(waitOp.chains().size())); + auto chainsArray = builder.create( + loc, getPointerPointerType(), chainsSize, /*alignment=*/0); + for (size_t i = 0; i < waitOp.chains().size(); ++i) { + auto index = builder.create(loc, getInt32Type(), + builder.getI32IntegerAttr(i)); + auto gep = builder.create(loc, getPointerPointerType(), + chainsArray, ArrayRef{index}); + builder.create(loc, waitOp.chains()[i], gep); + } + + auto cuGetGraphHelper = + getOperation().lookupSymbol(cuGetGraphHelperName); + auto cuGraph = builder.create( + loc, ArrayRef{getPointerType()}, + builder.getSymbolRefAttr(cuGetGraphHelper), ArrayRef{}); + + auto cuGraphExecute = + getOperation().lookupSymbol(cuGraphExecuteName); + + builder.create( + loc, ArrayRef{getCUResultType()}, + builder.getSymbolRefAttr(cuGraphExecute), + ArrayRef{cuGraph.getResult(0), chainsArray, chainsSize}); + waitOp.erase(); +} + std::unique_ptr> mlir::createConvertGpuLaunchFuncToCudaCallsPass() { return std::make_unique(); diff --git a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp --- a/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp +++ b/mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp @@ -725,8 +725,8 @@ return 
LLVM::LLVMType::getVoidTy(&getDialect()); } -LLVM::LLVMType ConvertToLLVMPattern::getVoidPtrType() const { - return LLVM::LLVMType::getInt8PtrTy(&getDialect()); +LLVM::LLVMType ConvertToLLVMPattern::getVoidPtrType(unsigned addrspace) const { + return LLVM::LLVMType::getInt8Ty(&getDialect()).getPointerTo(addrspace); } Value ConvertToLLVMPattern::createIndexConstant( @@ -1524,6 +1524,8 @@ // Insert the malloc/aligned_alloc declaration if it is not already present. auto allocFuncName = useAlignedAlloc ? "aligned_alloc" : "malloc"; + if (memRefType.getMemorySpace() == 1) + allocFuncName = "mcuMalloc"; auto module = allocOp.getParentOfType(); auto allocFunc = module.lookupSymbol(allocFuncName); if (!allocFunc) { @@ -1534,8 +1536,9 @@ callArgTypes.push_back(getIndexType()); allocFunc = moduleBuilder.create( rewriter.getUnknownLoc(), allocFuncName, - LLVM::LLVMType::getFunctionTy(getVoidPtrType(), callArgTypes, - /*isVarArg=*/false)); + LLVM::LLVMType::getFunctionTy( + getVoidPtrType(memRefType.getMemorySpace()), callArgTypes, + /*isVarArg=*/false)); } // Allocate the underlying buffer and store a pointer to it in the MemRef @@ -1570,8 +1573,9 @@ } auto allocFuncSymbol = rewriter.getSymbolRefAttr(allocFunc); allocatedBytePtr = rewriter - .create(loc, getVoidPtrType(), - allocFuncSymbol, callArgs) + .create( + loc, getVoidPtrType(memRefType.getMemorySpace()), + allocFuncSymbol, callArgs) .getResult(0); // For heap allocations, the allocated pointer is a cast of the byte pointer // to the type pointer. diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -14,12 +14,14 @@ #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/DialectImplementation.h" #include "mlir/IR/Function.h" #include "mlir/IR/FunctionImplementation.h" #include "mlir/IR/Module.h" #include "mlir/IR/OpImplementation.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/StandardTypes.h" +#include "mlir/IR/TypeUtilities.h" using namespace mlir; using namespace mlir::gpu; @@ -35,12 +37,38 @@ GPUDialect::GPUDialect(MLIRContext *context) : Dialect(getDialectNamespace(), context) { + addTypes(); addOperations< #define GET_OP_LIST #include "mlir/Dialect/GPU/GPUOps.cpp.inc" >(); } +Type GPUDialect::parseType(DialectAsmParser &parser) const { + // Parse the main keyword for the type. + StringRef keyword; + if (parser.parseKeyword(&keyword)) + return Type(); + MLIRContext *context = getContext(); + + // Handle 'chain' types. 
+ if (keyword == "chain") + return ChainType::get(context); + + parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword); + return Type(); +} + +void GPUDialect::printType(Type type, DialectAsmPrinter &os) const { + switch (type.getKind()) { + default: + llvm_unreachable("Unhandled gpu type"); + case GpuTypes::Chain: + os << "chain"; + break; + } +} + LogicalResult GPUDialect::verifyOperationAttribute(Operation *op, NamedAttribute attr) { if (!attr.second.isa() || @@ -201,9 +229,11 @@ // LaunchOp //===----------------------------------------------------------------------===// -void LaunchOp::build(Builder *builder, OperationState &result, Value gridSizeX, - Value gridSizeY, Value gridSizeZ, Value blockSizeX, - Value blockSizeY, Value blockSizeZ) { +void LaunchOp::build(Builder *builder, OperationState &result, + ValueRange chains, Value gridSizeX, Value gridSizeY, + Value gridSizeZ, Value blockSizeX, Value blockSizeY, + Value blockSizeZ) { + result.addOperands(chains); // Add grid and block sizes as op operands, followed by the data operands. result.addOperands( {gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ}); @@ -217,6 +247,12 @@ std::vector(kNumConfigRegionAttributes, builder->getIndexType())); kernelRegion->push_back(body); } +void LaunchOp::build(Builder *builder, OperationState &result, Value gridSizeX, + Value gridSizeY, Value gridSizeZ, Value blockSizeX, + Value blockSizeY, Value blockSizeZ) { + build(builder, result, {}, gridSizeX, gridSizeY, gridSizeZ, blockSizeX, + blockSizeY, blockSizeZ); +} KernelDim3 LaunchOp::getBlockIds() { assert(!body().getBlocks().empty() && "FuncOp body must not be empty."); @@ -243,11 +279,20 @@ } KernelDim3 LaunchOp::getGridSizeOperandValues() { - return KernelDim3{getOperand(0), getOperand(1), getOperand(2)}; + auto operands = getOperands().drop_front(getChains().size()); + return KernelDim3{operands[0], operands[1], operands[2]}; } KernelDim3 LaunchOp::getBlockSizeOperandValues() { - return KernelDim3{getOperand(3), getOperand(4), getOperand(5)}; + auto operands = getOperands().drop_front(getChains().size()); + return KernelDim3{operands[3], operands[4], operands[5]}; +} + +OperandRange LaunchOp::getChains() { + auto operands = getOperands(); + return {operands.begin(), llvm::find_if_not(operands, [](const Value &op) { + return op.getType().isa(); + })}; } static LogicalResult verify(LaunchOp op) { @@ -256,8 +301,9 @@ // for block/thread identifiers and grid/block sizes. if (!op.body().empty()) { Block &entryBlock = op.body().front(); - if (entryBlock.getNumArguments() != - LaunchOp::kNumConfigOperands + op.getNumOperands()) + if (entryBlock.getNumArguments() - LaunchOp::kNumConfigRegionAttributes != + op.getNumOperands() - op.getChains().size() - + LaunchOp::kNumConfigOperands) return op.emitOpError("unexpected number of region arguments"); } @@ -286,22 +332,27 @@ // (%size-x = %ssa-use, %size-y = %ssa-use, %size-z = %ssa-use) // where %size-* and %iter-* will correspond to the body region arguments. 
static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size, - ValueRange operands, KernelDim3 ids) { + KernelDim3 operands, KernelDim3 ids) { p << '(' << ids.x << ", " << ids.y << ", " << ids.z << ") in ("; - p << size.x << " = " << operands[0] << ", "; - p << size.y << " = " << operands[1] << ", "; - p << size.z << " = " << operands[2] << ')'; + p << size.x << " = " << operands.x << ", "; + p << size.y << " = " << operands.y << ", "; + p << size.z << " = " << operands.z << ')'; } static void printLaunchOp(OpAsmPrinter &p, LaunchOp op) { - ValueRange operands = op.getOperands(); - // Print the launch configuration. - p << LaunchOp::getOperationName() << ' ' << op.getBlocksKeyword(); - printSizeAssignment(p, op.getGridSize(), operands.take_front(3), + p << LaunchOp::getOperationName(); + // Note: only print [] if op returns a chain for backwards compatibility. + if (!op.getResults().empty()) { + p << '['; + p.printOperands(op.getChains()); + p << ']'; + } + p << ' ' << op.getBlocksKeyword(); + printSizeAssignment(p, op.getGridSize(), op.getGridSizeOperandValues(), op.getBlockIds()); p << ' ' << op.getThreadsKeyword(); - printSizeAssignment(p, op.getBlockSize(), operands.slice(3, 3), + printSizeAssignment(p, op.getBlockSize(), op.getBlockSizeOperandValues(), op.getThreadIds()); p.printRegion(op.body(), /*printEntryBlockArgs=*/false); @@ -344,6 +395,17 @@ // region attr-dict? // ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)` static ParseResult parseLaunchOp(OpAsmParser &parser, OperationState &result) { + // Note: only return chain if it contains '[]' for backwards compatibility. + if (!parser.parseOptionalLSquare()) { + SmallVector chains; + auto chainTy = ChainType::get(parser.getBuilder().getContext()); + if (parser.parseOperandList(chains) || + parser.resolveOperands(chains, chainTy, result.operands) || + parser.parseOptionalRSquare() || + parser.addTypeToList(chainTy, result.types)) + return failure(); + } + // Sizes of the grid and block. SmallVector sizes( LaunchOp::kNumConfigOperands); @@ -390,9 +452,12 @@ //===----------------------------------------------------------------------===// void LaunchFuncOp::build(Builder *builder, OperationState &result, - GPUFuncOp kernelFunc, Value gridSizeX, Value gridSizeY, - Value gridSizeZ, Value blockSizeX, Value blockSizeY, - Value blockSizeZ, ValueRange kernelOperands) { + ValueRange chains, GPUFuncOp kernelFunc, + Value gridSizeX, Value gridSizeY, Value gridSizeZ, + Value blockSizeX, Value blockSizeY, Value blockSizeZ, + ValueRange kernelOperands) { + result.types.push_back(ChainType::get(builder->getContext())); + result.addOperands(chains); // Add grid and block sizes as op operands, followed by the data operands. 
result.addOperands( {gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ}); @@ -402,12 +467,18 @@ auto kernelModule = kernelFunc.getParentOfType(); result.addAttribute(getKernelModuleAttrName(), builder->getSymbolRefAttr(kernelModule.getName())); + SmallVector operandSegmentSizes(8, 1); + operandSegmentSizes.front() = static_cast(chains.size()); + operandSegmentSizes.back() = static_cast(kernelOperands.size()); + result.addAttribute(getOperandSegmentSizeAttr(), + builder->getI32VectorAttr(operandSegmentSizes)); } void LaunchFuncOp::build(Builder *builder, OperationState &result, - GPUFuncOp kernelFunc, KernelDim3 gridSize, - KernelDim3 blockSize, ValueRange kernelOperands) { - build(builder, result, kernelFunc, gridSize.x, gridSize.y, gridSize.z, + ValueRange chains, GPUFuncOp kernelFunc, + KernelDim3 gridSize, KernelDim3 blockSize, + ValueRange kernelOperands) { + build(builder, result, chains, kernelFunc, gridSize.x, gridSize.y, gridSize.z, blockSize.x, blockSize.y, blockSize.z, kernelOperands); } @@ -416,7 +487,8 @@ } unsigned LaunchFuncOp::getNumKernelOperands() { - return getNumOperands() - kNumConfigOperands; + // TODO(csigg): Use the attribute from AttrSizedOperandSegments? + return getNumOperands() - getChains().size() - kNumConfigOperands; } StringRef LaunchFuncOp::getKernelModuleName() { @@ -425,15 +497,24 @@ } Value LaunchFuncOp::getKernelOperand(unsigned i) { - return getOperation()->getOperand(i + kNumConfigOperands); + return getOperand(getChains().size() + kNumConfigOperands + i); } KernelDim3 LaunchFuncOp::getGridSizeOperandValues() { - return KernelDim3{getOperand(0), getOperand(1), getOperand(2)}; + auto operands = getOperands().drop_front(getChains().size()); + return KernelDim3{operands[0], operands[1], operands[2]}; } KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() { - return KernelDim3{getOperand(3), getOperand(4), getOperand(5)}; + auto operands = getOperands().drop_front(getChains().size()); + return KernelDim3{operands[3], operands[4], operands[5]}; +} + +OperandRange LaunchFuncOp::getChains() { + auto num_chains = + *getAttrOfType(getOperandSegmentSizeAttr()).begin(); + auto begin = getOperands().begin(); + return {begin, begin + num_chains.getLimitedValue()}; } static LogicalResult verify(LaunchFuncOp op) { diff --git a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp --- a/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -198,9 +198,11 @@ gpu::GPUFuncOp kernelFunc, ValueRange operands) { OpBuilder builder(launchOp); - builder.create( - launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(), - launchOp.getBlockSizeOperandValues(), operands); + auto launchFuncOp = builder.create( + launchOp.getLoc(), launchOp.getChains(), kernelFunc, + launchOp.getGridSizeOperandValues(), launchOp.getBlockSizeOperandValues(), + operands); + launchOp.replaceAllUsesWith(launchFuncOp.getOperation()); launchOp.erase(); } diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -497,8 +497,6 @@ /// definitions. LogicalResult ModuleTranslation::convertGlobals() { // Lock access to the llvm context. 
-  llvm::sys::SmartScopedLock<true> scopedLock(
-      llvmDialect->getLLVMContextMutex());
   for (auto op : getModuleBody(mlirModule).getOps<LLVM::GlobalOp>()) {
     llvm::Type *type = op.getType().getUnderlyingType();
     llvm::Constant *cst = llvm::UndefValue::get(type);
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s
+// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s --dump-input-on-failure
 
 module attributes {gpu.container_module} {
 
@@ -83,6 +83,18 @@
     return
   }
 
+  func @async() {
+    %size = constant 32 : index
+    // CHECK: %{{.*}} = gpu.alloc (%{{.*}}) : memref
+    %b0 = gpu.alloc (%size) : memref
+    // CHECK: %{{.*}}, %{{.*}} = gpu.alloc[] (%{{.*}}) : memref
+    %b1, %t1 = gpu.alloc[] (%size) : memref
+    // CHECK: %{{.*}}, %{{.*}} = gpu.alloc[%{{.*}}] (%{{.*}}) : memref
+    %b2, %t2 = gpu.alloc[%t1] (%size) : memref
+
+    return
+  }
+
   module @gpu_funcs attributes {gpu.kernel_module} {
     // CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32)
     // CHECK: workgroup
diff --git a/mlir/test/mlir-cuda-runner/async.mlir b/mlir/test/mlir-cuda-runner/async.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-cuda-runner/async.mlir
@@ -0,0 +1,50 @@
+// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+
+func @main() {
+  %one = constant 1 : index
+  %count = constant 4 : index
+  // b0 = iota (init)
+  %b0 = alloc(%count) : memref<?xi32, 1>
+  %t1 = gpu.launch[] blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %one, %block_z = %one) {
+    %val = index_cast %tx : index to i32
+    store %val, %b0[%tx] : memref<?xi32, 1>
+    gpu.terminator
+  }
+  // b1 = iota (init)
+  %b1 = alloc(%count) : memref<?xi32, 1>
+  %t2 = gpu.launch[] blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %one, %block_z = %one) {
+    %val = index_cast %tx : index to i32
+    store %val, %b1[%tx] : memref<?xi32, 1>
+    gpu.terminator
+  }
+  // b2 = b0 + b1 (join)
+  %b2 = alloc(%count) : memref<?xi32, 1>
+  %t3 = gpu.launch[%t1, %t2] blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %one, %block_z = %one) {
+    %v0 = load %b0[%tx] : memref<?xi32, 1>
+    %v1 = load %b1[%tx] : memref<?xi32, 1>
+    %sum = addi %v0, %v1 : i32
+    store %sum, %b2[%tx] : memref<?xi32, 1>
+    gpu.terminator
+  }
+  // copy b2 to h0 and h1 (fork)
+  %h0 = alloc(%count) : memref<?xi32>
+  %h1 = alloc(%count) : memref<?xi32>
+  %g0 = memref_cast %h0 : memref<?xi32> to memref<*xi32>
+  %g1 = memref_cast %h1 : memref<?xi32> to memref<*xi32>
+  %c2 = memref_cast %b2 : memref<?xi32, 1> to memref<*xi32, 1>
+  %t4 = gpu.memcpy[%t3] (%g0, %c2) { element_size = 4 } : memref<*xi32>, memref<*xi32, 1>
+  %t5 = gpu.memcpy[%t3] (%g1, %c2) { element_size = 4 } : memref<*xi32>, memref<*xi32, 1>
+  // wait for copies to complete (sync)
+  gpu.wait[%t4, %t5]
+  // print result
+  // CHECK: [0, 2, 4, 6]
+  call @print_memref_i32(%g0) : (memref<*xi32>) -> ()
+  // CHECK: [0, 2, 4, 6]
+  call @print_memref_i32(%g1) : (memref<*xi32>) -> ()
+  return
+}
+
+func @print_memref_i32(memref<*xi32>)
diff --git a/mlir/test/mlir-cuda-runner/simple.mlir b/mlir/test/mlir-cuda-runner/simple.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-cuda-runner/simple.mlir
@@ -0,0 +1,27 @@
+// RUN: mlir-cuda-runner %s
--shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s
+
+func @main() {
+  %one = constant 1 : index
+  %count = constant 4 : index
+  // b0 = iota (init)
+  %b0 = alloc(%count) : memref<?xi32, 1>
+  %t1 = gpu.launch[] blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
+             threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %one, %block_z = %one) {
+    %val = index_cast %tx : index to i32
+    store %val, %b0[%tx] : memref<?xi32, 1>
+    gpu.terminator
+  }
+  // copy b0 to h0
+  %h0 = alloc(%count) : memref<?xi32>
+  %g0 = memref_cast %h0 : memref<?xi32> to memref<*xi32>
+  %c0 = memref_cast %b0 : memref<?xi32, 1> to memref<*xi32, 1>
+  %t2 = gpu.memcpy[%t1] (%g0, %c0) { element_size = 4 } : memref<*xi32>, memref<*xi32, 1>
+  // wait for copy to complete (sync)
+  gpu.wait[%t2]
+  // print result
+  // CHECK: [0, 1, 2, 3]
+  call @print_memref_i32(%g0) : (memref<*xi32>) -> ()
+  return
+}
+
+func @print_memref_i32(memref<*xi32>)
diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
--- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
+++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp
@@ -21,7 +21,7 @@
 #include "cuda.h"
 
 namespace {
-int32_t reportErrorIfAny(CUresult result, const char *where) {
+CUresult reportErrorIfAny(CUresult result, const char *where) {
   if (result != CUDA_SUCCESS) {
     llvm::errs() << "CUDA failed with " << result << " in " << where << "\n";
   }
@@ -65,6 +65,72 @@
   return stream;
 }
 
+extern "C" CUgraph mcuGetGraphHelper() {
+  static auto result = [] {
+    CUgraph graph = nullptr;
+    reportErrorIfAny(cuGraphCreate(&graph, 0), "cuGraphCreate");
+    return graph;
+  }();
+  return result;
+}
+extern "C" CUresult mcuGraphExecute(CUgraph graph) {
+  CUgraphExec exec = nullptr;
+  reportErrorIfAny(cuGraphInstantiate(&exec, graph, /*phErrorNode=*/NULL,
+                                      /*logBuffer=*/NULL, /*bufferSize=*/0),
+                   "cuGraphInstantiate");
+  reportErrorIfAny(cuGraphLaunch(exec, nullptr), "cuGraphLaunch");
+  return reportErrorIfAny(cuGraphExecDestroy(exec), "cuGraphExecDestroy");
+}
+extern "C" CUresult mcuGraphAddKernelNode(CUgraphNode *node, CUgraph graph,
+                                          const CUgraphNode *deps,
+                                          int32_t num_deps,
+                                          CUDA_KERNEL_NODE_PARAMS *params) {
+  return reportErrorIfAny(
+      cuGraphAddKernelNode(node, graph, deps, num_deps, params),
+      "cuGraphAddKernelNode");
+}
+
+struct StridedMemRefDesc {
+  void *basePtr;
+  CUdeviceptr data;
+  int64_t offset;
+  int64_t sizes[4];
+};
+
+extern "C" CUresult
+mcuGraphAddMemcpyNode(CUgraphNode *node, CUgraph graph, const CUgraphNode *deps,
+                      int32_t num_deps, int64_t dst_rank,
+                      StridedMemRefDesc *dst_desc, int64_t src_rank,
+                      StridedMemRefDesc *src_desc, size_t element_size_bytes) {
+  CUcontext ctx;
+  reportErrorIfAny(cuCtxGetCurrent(&ctx), "cuCtxGetCurrent");
+  auto get_size = [](int64_t rank, StridedMemRefDesc *desc) {
+    const int64_t *begin = desc->sizes;
+    return std::accumulate(begin, begin + rank, 1, std::multiplies<int64_t>());
+  };
+  int64_t src_size = get_size(src_rank, src_desc);
+  assert(src_size == get_size(dst_rank, dst_desc));
+
+  CUDA_MEMCPY3D copy_params = {};
+  copy_params.srcMemoryType = CU_MEMORYTYPE_UNIFIED;
+  copy_params.srcDevice = src_desc->data;
+  copy_params.dstMemoryType = CU_MEMORYTYPE_UNIFIED;
+  copy_params.dstDevice = dst_desc->data;
+  copy_params.WidthInBytes = src_size * element_size_bytes;
+  copy_params.Height = copy_params.Depth = 1;
+
+  return reportErrorIfAny(
+      cuGraphAddMemcpyNode(node, graph, deps,
+                           num_deps, &copy_params, ctx),
+      "cuGraphAddMemcpyNode");
+}
+
+extern "C" void *mcuMalloc(size_t size) {
+  void *ptr;
+  reportErrorIfAny(cuMemAlloc(reinterpret_cast<CUdeviceptr *>(&ptr), size),
+                   "cuMemAlloc");
+  return ptr;
+}
+
 extern "C" int32_t mcuStreamSynchronize(void *stream) {
   return reportErrorIfAny(
       cuStreamSynchronize(reinterpret_cast<CUstream>(stream)), "StreamSync");
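
Note on the lowering model (illustration, not part of the patch): the chain values produced by gpu.launch_func and gpu.memcpy become CUgraphNode handles, and gpu.wait is what instantiates and launches the accumulated graph. The standalone C++ sketch below shows the underlying CUDA driver calls (cuGraphCreate, cuGraphAddKernelNode, cuGraphInstantiate, cuGraphLaunch) that the mcu* wrappers above forward to. The `function` and `kernelParams` arguments are placeholders standing in for what the lowering obtains via the existing cuModuleGetFunction call and setupParamsArray; the launch configuration mirrors the 1x1x1 grid / 4x1x1 block used in the tests.

#include "cuda.h"

// Minimal sketch of the chain -> CUDA graph mapping; error handling trimmed
// to early returns.
CUresult launchViaGraph(CUfunction function, void **kernelParams) {
  // mcuGetGraphHelper: a single graph is built up lazily.
  CUgraph graph;
  CUresult err = cuGraphCreate(&graph, /*flags=*/0);
  if (err != CUDA_SUCCESS)
    return err;

  // mcuGraphAddKernelNode: each gpu.launch_func adds one node; its chain
  // operands form the `dependencies` array, its chain result is `node`.
  CUDA_KERNEL_NODE_PARAMS params = {};
  params.func = function;
  params.gridDimX = params.gridDimY = params.gridDimZ = 1;
  params.blockDimX = 4;
  params.blockDimY = params.blockDimZ = 1;
  params.sharedMemBytes = 0;
  params.kernelParams = kernelParams;
  params.extra = nullptr;
  CUgraphNode node;
  err = cuGraphAddKernelNode(&node, graph, /*dependencies=*/nullptr,
                             /*numDependencies=*/0, &params);
  if (err != CUDA_SUCCESS)
    return err;

  // mcuGraphExecute (reached from gpu.wait): instantiate and launch the graph.
  CUgraphExec exec;
  err = cuGraphInstantiate(&exec, graph, /*phErrorNode=*/nullptr,
                           /*logBuffer=*/nullptr, /*bufferSize=*/0);
  if (err != CUDA_SUCCESS)
    return err;
  err = cuGraphLaunch(exec, /*hStream=*/nullptr);
  cuGraphExecDestroy(exec);
  cuGraphDestroy(graph);
  return err;
}

Because dependencies are expressed per node rather than through stream ordering, launches whose chain lists do not overlap (e.g. %t1 and %t2 in async.mlir) are free to run concurrently once the graph is launched.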