diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -919,15 +919,19 @@
     it does not block until the execution has finished on the device). In that
     case, it also returns a !gpu.async.token.
 
+    If the `host_shared` keyword is present, the memory will be allocated in
+    memory accessible both on the host and on the device.
+
     Example:
 
     ```mlir
-    %memref, %token = gpu.alloc async [%dep] (%width) : memref<64x?xf32, 1>
+    %memref, %token = gpu.alloc async [%dep] host_shared (%width) : memref<64x?xf32, 1>
     ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
-                   Variadic<Index>:$dynamicSizes, Variadic<Index>:$symbolOperands);
+                   Variadic<Index>:$dynamicSizes, Variadic<Index>:$symbolOperands,
+                   UnitAttr:$hostShared);
   let results = (outs Res<AnyMemRef, "", [MemAlloc]>:$memref,
                  Optional<GPU_AsyncToken>:$asyncToken);
 
@@ -936,7 +940,7 @@
   }];
 
   let assemblyFormat = [{
-    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) ` `
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) (` ` `host_shared` $hostShared^)? ` `
     `(` $dynamicSizes `)` (`` `[` $symbolOperands^ `]`)? attr-dict `:` type($memref)
   }];
 }
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -464,6 +464,11 @@
 LogicalResult ConvertAllocOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::AllocOp allocOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
+  // Bail out on host_shared allocations: this runtime lowering does not
+  // support them yet.
+  if (adaptor.getHostShared())
+    return rewriter.notifyMatchFailure(
+        allocOp, "host_shared allocation is not supported");
+
   MemRefType memRefType = allocOp.getType();
 
   if (failed(areAllLLVMTypes(allocOp, adaptor.getOperands(), rewriter)) ||
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -209,6 +209,11 @@
     // CHECK: gpu.dealloc async [%[[t1]]] %[[m1]] : memref<13xf32, 1>
     %t2 = gpu.dealloc async [%t1] %m1 : memref<13xf32, 1>
 
+    // CHECK: %[[m2:.*]] = gpu.alloc host_shared () : memref<13xf32, 1>
+    %m2 = gpu.alloc host_shared () : memref<13xf32, 1>
+    // CHECK: gpu.dealloc %[[m2]] : memref<13xf32, 1>
+    gpu.dealloc %m2 : memref<13xf32, 1>
+
     return
   }
 