diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1167,6 +1167,39 @@
   let hasCanonicalizer = 1;
 }
 
+def GPU_AllocManagedOp : GPU_Op<"alloc_managed", []> {
+
+  let summary = "GPU managed memory allocation operation";
+  let description = [{
+    The `gpu.alloc_managed` operation allocates a region of memory that is
+    visible to both the GPU and the CPU. It is similar to the `memref.alloc`
+    op, but the allocated data is automatically migrated between the memory
+    spaces by the driver, via page faults or in software.
+
+    If the `attachHost` attribute is set, the memory cannot be accessed by
+    any stream on any device.
+
+    Example:
+
+    ```mlir
+    %memref = gpu.alloc_managed (%width) : memref<64x?xf32, 1>
+    ```
+  }];
+
+  let arguments = (ins Variadic<Index>:$dynamicSizes,
+                       OptionalAttr<UnitAttr>:$attachHost);
+  let results = (outs Res<AnyMemRef, "", [MemAlloc]>:$memref);
+
+  let extraClassDeclaration = [{
+    MemRefType getType() { return ::llvm::cast<MemRefType>(getMemref().getType()); }
+  }];
+
+  let assemblyFormat = [{
+    `(` $dynamicSizes `)` attr-dict `:` type($memref)
+  }];
+  let hasCanonicalizer = 1;
+  let hasVerifier = 1;
+}
+
 def GPU_DeallocOp : GPU_Op<"dealloc", [GPU_AsyncOpInterface]> {
 
   let summary = "GPU memory deallocation operation";
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -177,6 +177,11 @@
       llvmPointerType /* void * */,
       {llvmIntPtrType /* intptr_t sizeBytes */,
        llvmPointerType /* void *stream */}};
+  FunctionCallBuilder allocManagedCallBuilder = {
+      "mgpuMemAllocManaged",
+      llvmPointerType /* void * */,
+      {llvmIntPtrType /* intptr_t sizeBytes */,
+       llvmInt32Type /* unsigned flags */}};
   FunctionCallBuilder deallocCallBuilder = {
       "mgpuMemFree",
       llvmVoidType,
@@ -349,6 +354,20 @@
       ConversionPatternRewriter &rewriter) const override;
 };
 
+/// A rewrite pattern to convert gpu.alloc_managed operations into a GPU
+/// runtime call. Currently it supports CUDA and ROCm (HIP).
+class ConvertAllocManagedOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::AllocManagedOp> {
+public:
+  ConvertAllocManagedOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::AllocManagedOp>(typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::AllocManagedOp allocOp, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 /// A rewrite pattern to convert gpu.dealloc operations into a GPU runtime
 /// call. Currently it supports CUDA and ROCm (HIP).
 class ConvertDeallocOpToGpuRuntimeCallPattern
@@ -906,6 +925,49 @@
   return success();
 }
 
+LogicalResult ConvertAllocManagedOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::AllocManagedOp allocOp, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  MemRefType memRefType = allocOp.getType();
+
+  if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) ||
+      !isConvertibleAndHasIdentityMaps(memRefType))
+    return failure();
+
+  auto loc = allocOp.getLoc();
+
+  // Get shape of the memref as values: static sizes are constant
+  // values and dynamic sizes are passed to 'alloc' as operands.
+  SmallVector<Value, 4> shape;
+  SmallVector<Value, 4> strides;
+  Value sizeBytes;
+  getMemRefDescriptorSizes(loc, memRefType, adaptor.getDynamicSizes(),
+                           rewriter, shape, strides, sizeBytes);
+
+  // Allocate the underlying buffer and store a pointer to it in the MemRef
+  // descriptor.
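+  // The flags value is forwarded to the mgpuMemAllocManaged wrapper: 1
+  // corresponds to CU_MEM_ATTACH_GLOBAL / hipMemAttachGlobal and 2 to
+  // CU_MEM_ATTACH_HOST / hipMemAttachHost.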
+  Type elementPtrType = this->getElementPtrType(memRefType);
+  Value flags = rewriter.create<LLVM::ConstantOp>(
+      loc, llvmInt32Type,
+      rewriter.getI32IntegerAttr(allocOp.getAttachHost() ? 2 : 1));
+  Value allocatedPtr =
+      allocManagedCallBuilder.create(loc, rewriter, {sizeBytes, flags})
+          .getResult();
+  if (!getTypeConverter()->useOpaquePointers())
+    allocatedPtr =
+        rewriter.create<LLVM::BitcastOp>(loc, elementPtrType, allocatedPtr);
+
+  // No alignment.
+  Value alignedPtr = allocatedPtr;
+
+  // Create the MemRef descriptor.
+  auto memRefDescriptor = this->createMemRefDescriptor(
+      loc, memRefType, allocatedPtr, alignedPtr, shape, strides, rewriter);
+
+  rewriter.replaceOp(allocOp, {memRefDescriptor});
+
+  return success();
+}
+
 LogicalResult ConvertDeallocOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::DeallocOp deallocOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1786,6 +1848,7 @@
   addOpaquePointerConversion<gpu::AsyncTokenType>(converter);
 
   patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
+               ConvertAllocManagedOpToGpuRuntimeCallPattern,
                ConvertDeallocOpToGpuRuntimeCallPattern,
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -1744,6 +1744,25 @@
   return success();
 }
 
+LogicalResult AllocManagedOp::verify() {
+  auto memRefType = llvm::cast<MemRefType>(getMemref().getType());
+
+  if (static_cast<int64_t>(getDynamicSizes().size()) !=
+      memRefType.getNumDynamicDims())
+    return emitOpError("dimension operand count does not equal memref "
+                       "dynamic dimension count");
+
+  // The op takes no layout symbol operands, so the memref layout must not
+  // require any symbols.
+  unsigned numSymbols = 0;
+  if (!memRefType.getLayout().isIdentity())
+    numSymbols = memRefType.getLayout().getAffineMap().getNumSymbols();
+  if (numSymbols != 0)
+    return emitOpError("memref layouts with symbols are not supported");
+
+  return success();
+}
+
 namespace {
 
 /// Folding of memref.dim(gpu.alloc(%size), %idx) -> %size similar to
@@ -1765,6 +1784,11 @@
   results.add<SimplifyDimOfAllocOp>(context);
 }
 
+void AllocManagedOp::getCanonicalizationPatterns(RewritePatternSet &results,
+                                                 MLIRContext *context) {
+  results.add<SimplifyDimOfAllocOp>(context);
+}
+
 #include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc"
 #include "mlir/Dialect/GPU/IR/GPUOpsEnums.cpp.inc"
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -175,6 +175,13 @@
   return reinterpret_cast<void *>(ptr);
 }
 
+extern "C" void *mgpuMemAllocManaged(uint64_t sizeBytes, unsigned int flags) {
+  ScopedContext scopedContext;
+  CUdeviceptr sharedPtr;
+  CUDA_REPORT_IF_ERROR(cuMemAllocManaged(&sharedPtr, sizeBytes, flags));
+  return reinterpret_cast<void *>(sharedPtr);
+}
+
 extern "C" void mgpuMemFree(void *ptr, CUstream /*stream*/) {
   CUDA_REPORT_IF_ERROR(cuMemFree(reinterpret_cast<CUdeviceptr>(ptr)));
 }
diff --git a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/RocmRuntimeWrappers.cpp
@@ -105,6 +105,12 @@
   return ptr;
 }
 
+extern "C" void *mgpuMemAllocManaged(uint64_t sizeBytes, unsigned int flags) {
+  void *sharedPtr;
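+  // The hipMemAttachGlobal (0x1) and hipMemAttachHost (0x2) flag values match
+  // the corresponding CUDA CU_MEM_ATTACH_* flags, so the incoming flags can
+  // be passed through unchanged.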
+  HIP_REPORT_IF_ERROR(hipMallocManaged(&sharedPtr, sizeBytes, flags));
+  return sharedPtr;
+}
+
 extern "C" void mgpuMemFree(void *ptr, hipStream_t /*stream*/) {
   HIP_REPORT_IF_ERROR(hipFree(ptr));
 }
diff --git a/mlir/test/Integration/GPU/CUDA/managed.mlir b/mlir/test/Integration/GPU/CUDA/managed.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/GPU/CUDA/managed.mlir
@@ -0,0 +1,50 @@
+// RUN: mlir-opt %s \
+// RUN: | mlir-opt -gpu-kernel-outlining \
+// RUN: | mlir-opt -pass-pipeline='builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,gpu-to-cubin))' \
+// RUN: | mlir-opt -gpu-async-region -gpu-to-llvm \
+// RUN: | mlir-opt -async-to-async-runtime -async-runtime-ref-counting \
+// RUN: | mlir-opt -convert-async-to-llvm -convert-func-to-llvm \
+// RUN: | mlir-cpu-runner \
+// RUN:   --shared-libs=%mlir_cuda_runtime \
+// RUN:   --shared-libs=%mlir_async_runtime \
+// RUN:   --shared-libs=%mlir_runner_utils \
+// RUN:   --entry-point-result=void -O0 \
+// RUN: | FileCheck %s
+
+// CHECK: [42, 412]
+// CHECK: Hello from GPU data[0]=42
+// CHECK: Hello from GPU data[1]=412
+// CHECK: [42, 454]
+
+func.func @main() {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %count = arith.constant 2 : index
+
+  // Initialize the managed buffer on the host.
+  %sharedPtr = gpu.alloc_managed(%count) : memref<?xi32>
+  %h0_unranked = memref.cast %sharedPtr : memref<?xi32> to memref<*xi32>
+
+  %v0 = arith.constant 42 : i32
+  %v1 = arith.constant 412 : i32
+  memref.store %v0, %sharedPtr[%c0] : memref<?xi32>
+  memref.store %v1, %sharedPtr[%c1] : memref<?xi32>
+
+  call @printMemrefI32(%h0_unranked) : (memref<*xi32>) -> ()
+
+  gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
+             threads(%tx, %ty, %tz) in (%block_x = %c1, %block_y = %c1, %block_z = %c1) {
+    %v2 = memref.load %sharedPtr[%c0] : memref<?xi32>
+    %v3 = memref.load %sharedPtr[%c1] : memref<?xi32>
+    %sum = arith.addi %v2, %v3 : i32
+    gpu.printf "Hello from GPU data[%lld]=%d \n" %c0, %v2 : index, i32
+    gpu.printf "Hello from GPU data[%lld]=%d \n" %c1, %v3 : index, i32
+    memref.store %sum, %sharedPtr[%c1] : memref<?xi32>
+    gpu.terminator
+  }
+
+  call @printMemrefI32(%h0_unranked) : (memref<*xi32>) -> ()
+  return
+}
+func.func private @printMemrefI32(memref<*xi32>)
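
A minimal usage sketch of the host-attached form (not part of the patch; it assumes the unit attribute keeps its default `attachHost` spelling in the attribute dictionary):

```mlir
func.func @alloc_attach_host(%n: index) -> memref<?xf32> {
  // With the attribute present the lowering emits mgpuMemAllocManaged(size, 2),
  // i.e. CU_MEM_ATTACH_HOST / hipMemAttachHost; without it the flag is 1
  // (CU_MEM_ATTACH_GLOBAL / hipMemAttachGlobal).
  %m = gpu.alloc_managed(%n) {attachHost} : memref<?xf32>
  return %m : memref<?xf32>
}
```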