diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -361,6 +361,9 @@ /*default=*/"gpu::getDefaultGpuBinaryAnnotation()", "Annotation attribute string for GPU binary" >, + Option<"useOpaquePointers", "use-opaque-pointers", "bool", + /*default=*/"false", "Generate LLVM IR using opaque pointers " + "instead of typed pointers">, ]; let dependentDialects = [ @@ -410,6 +413,9 @@ "Bitwidth of the index type, 0 to use size of machine word">, Option<"hasRedux", "has-redux", "bool", /*default=*/"false", "Target gpu supports redux">, + Option<"useOpaquePointers", "use-opaque-pointers", "bool", + /*default=*/"false", "Generate LLVM IR using opaque pointers " + "instead of typed pointers">, ]; } @@ -443,7 +449,10 @@ clEnumValN(::mlir::gpu::amd::Runtime::Unknown, "unknown", "Unknown (default)"), clEnumValN(::mlir::gpu::amd::Runtime::HIP, "HIP", "HIP"), clEnumValN(::mlir::gpu::amd::Runtime::OpenCL, "OpenCL", "OpenCL") - )}]> + )}]>, + Option<"useOpaquePointers", "use-opaque-pointers", "bool", + /*default=*/"false", "Generate LLVM IR using opaque pointers " + "instead of typed pointers">, ]; } diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.h @@ -208,7 +208,8 @@ /// global and use it to compute the address of the first character in the /// string (operations inserted at the builder insertion point). Value createGlobalString(Location loc, OpBuilder &builder, StringRef name, - StringRef value, Linkage linkage); + StringRef value, Linkage linkage, + bool useOpaquePointers); /// LLVM requires some operations to be inside of a Module operation. This /// function confirms that the Operation has the desired properties. diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp @@ -43,17 +43,10 @@ workgroupBuffers.push_back(globalOp); } - // Rewrite the original GPU function to an LLVM function. - auto convertedType = typeConverter->convertType(gpuFuncOp.getFunctionType()); - if (!convertedType) - return failure(); - auto funcType = - convertedType.template cast().getElementType(); - // Remap proper input types. TypeConverter::SignatureConversion signatureConversion( gpuFuncOp.front().getNumArguments()); - getTypeConverter()->convertFunctionSignature( + Type funcType = getTypeConverter()->convertFunctionSignature( gpuFuncOp.getFunctionType(), /*isVariadic=*/false, signatureConversion); // Create the new function operation. Only copy those attributes that are @@ -90,12 +83,18 @@ for (const auto &en : llvm::enumerate(workgroupBuffers)) { LLVM::GlobalOp global = en.value(); - Value address = rewriter.create(loc, global); + Value address = rewriter.create( + loc, + getTypeConverter()->getPointerType(global.getType(), + global.getAddrSpace()), + global.getSymNameAttr()); auto elementType = global.getType().cast().getElementType(); Value memory = rewriter.create( - loc, LLVM::LLVMPointerType::get(elementType, global.getAddrSpace()), - address, ArrayRef{0, 0}); + loc, + getTypeConverter()->getPointerType(elementType, + global.getAddrSpace()), + global.getType(), address, ArrayRef{0, 0}); // Build a memref descriptor pointing to the buffer to plug with the // existing memref infrastructure. This may use more registers than @@ -119,14 +118,14 @@ // Explicitly drop memory space when lowering private memory // attributions since NVVM models it as `alloca`s in the default // memory space and does not support `alloca`s with addrspace(5). - auto ptrType = LLVM::LLVMPointerType::get( - typeConverter->convertType(type.getElementType()) - .template cast(), - allocaAddrSpace); + Type elementType = typeConverter->convertType(type.getElementType()); + auto ptrType = + getTypeConverter()->getPointerType(elementType, allocaAddrSpace); Value numElements = rewriter.create( gpuFuncOp.getLoc(), int64Ty, type.getNumElements()); Value allocated = rewriter.create( - gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0); + gpuFuncOp.getLoc(), ptrType, elementType, numElements, + /*alignment=*/0); auto descr = MemRefDescriptor::fromStaticShape( rewriter, loc, *getTypeConverter(), type, allocated); signatureConversion.remapInput( @@ -206,7 +205,7 @@ Location loc = gpuPrintfOp->getLoc(); mlir::Type llvmI8 = typeConverter->convertType(rewriter.getI8Type()); - mlir::Type i8Ptr = LLVM::LLVMPointerType::get(llvmI8); + mlir::Type i8Ptr = getTypeConverter()->getPointerType(llvmI8); mlir::Type llvmI32 = typeConverter->convertType(rewriter.getI32Type()); mlir::Type llvmI64 = typeConverter->convertType(rewriter.getI64Type()); // Note: this is the GPUModule op, not the ModuleOp that surrounds it @@ -255,9 +254,12 @@ } // Get a pointer to the format string's first element and pass it to printf() - Value globalPtr = rewriter.create(loc, global); + Value globalPtr = rewriter.create( + loc, + getTypeConverter()->getPointerType(globalType, global.getAddrSpace()), + global.getSymNameAttr()); Value stringStart = rewriter.create( - loc, i8Ptr, globalPtr, ArrayRef{0, 0}); + loc, i8Ptr, globalType, globalPtr, ArrayRef{0, 0}); Value stringLen = rewriter.create(loc, llvmI64, formatStringSize); @@ -314,7 +316,7 @@ Location loc = gpuPrintfOp->getLoc(); mlir::Type llvmI8 = typeConverter->convertType(rewriter.getIntegerType(8)); - mlir::Type i8Ptr = LLVM::LLVMPointerType::get(llvmI8, addressSpace); + mlir::Type i8Ptr = getTypeConverter()->getPointerType(llvmI8, addressSpace); // Note: this is the GPUModule op, not the ModuleOp that surrounds it // This ensures that global constants and declarations are placed within @@ -344,9 +346,12 @@ } // Get a pointer to the format string's first element - Value globalPtr = rewriter.create(loc, global); + Value globalPtr = rewriter.create( + loc, + getTypeConverter()->getPointerType(globalType, global.getAddrSpace()), + global.getSymNameAttr()); Value stringStart = rewriter.create( - loc, i8Ptr, globalPtr, ArrayRef{0, 0}); + loc, i8Ptr, globalType, globalPtr, ArrayRef{0, 0}); // Construct arguments and function call auto argsRange = adaptor.getArgs(); diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -93,8 +93,9 @@ Type llvmVoidType = LLVM::LLVMVoidType::get(context); LLVM::LLVMPointerType llvmPointerType = - LLVM::LLVMPointerType::get(IntegerType::get(context, 8)); - Type llvmPointerPointerType = LLVM::LLVMPointerType::get(llvmPointerType); + this->getTypeConverter()->getPointerType(IntegerType::get(context, 8)); + Type llvmPointerPointerType = + this->getTypeConverter()->getPointerType(llvmPointerType); Type llvmInt8Type = IntegerType::get(context, 8); Type llvmInt32Type = IntegerType::get(context, 32); Type llvmInt64Type = IntegerType::get(context, 64); @@ -363,7 +364,10 @@ } // namespace void GpuToLLVMConversionPass::runOnOperation() { - LLVMTypeConverter converter(&getContext()); + LowerToLLVMOptions options(&getContext()); + options.useOpaquePointers = useOpaquePointers; + + LLVMTypeConverter converter(&getContext(), options); RewritePatternSet patterns(&getContext()); LLVMConversionTarget target(getContext()); @@ -472,8 +476,9 @@ auto stream = adaptor.getAsyncDependencies().front(); Value allocatedPtr = allocCallBuilder.create(loc, rewriter, {sizeBytes, stream}).getResult(); - allocatedPtr = - rewriter.create(loc, elementPtrType, allocatedPtr); + if (!getTypeConverter()->useOpaquePointers()) + allocatedPtr = + rewriter.create(loc, elementPtrType, allocatedPtr); // No alignment. Value alignedPtr = allocatedPtr; @@ -498,9 +503,10 @@ Value pointer = MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc); - auto casted = rewriter.create(loc, llvmPointerType, pointer); + if (!getTypeConverter()->useOpaquePointers()) + pointer = rewriter.create(loc, llvmPointerType, pointer); Value stream = adaptor.getAsyncDependencies().front(); - deallocCallBuilder.create(loc, rewriter, {casted, stream}); + deallocCallBuilder.create(loc, rewriter, {pointer, stream}); rewriter.replaceOp(deallocOp, {stream}); return success(); @@ -665,22 +671,25 @@ argumentTypes); auto one = builder.create(loc, llvmInt32Type, 1); auto structPtr = builder.create( - loc, LLVM::LLVMPointerType::get(structType), one, /*alignment=*/0); + loc, getTypeConverter()->getPointerType(structType), structType, one, + /*alignment=*/0); auto arraySize = builder.create(loc, llvmInt32Type, numArguments); - auto arrayPtr = builder.create(loc, llvmPointerPointerType, - arraySize, /*alignment=*/0); + auto arrayPtr = builder.create( + loc, llvmPointerPointerType, llvmPointerType, arraySize, /*alignment=*/0); for (const auto &en : llvm::enumerate(arguments)) { - auto fieldPtr = builder.create( - loc, LLVM::LLVMPointerType::get(argumentTypes[en.index()]), structPtr, + Value fieldPtr = builder.create( + loc, getTypeConverter()->getPointerType(argumentTypes[en.index()]), + argumentTypes[en.index()], structPtr, ArrayRef{0, en.index()}); builder.create(loc, en.value(), fieldPtr); - auto elementPtr = - builder.create(loc, llvmPointerPointerType, arrayPtr, - ArrayRef{en.index()}); - auto casted = - builder.create(loc, llvmPointerType, fieldPtr); - builder.create(loc, casted, elementPtr); + auto elementPtr = builder.create( + loc, llvmPointerPointerType, llvmPointerType, arrayPtr, + ArrayRef{en.index()}); + if (!getTypeConverter()->useOpaquePointers()) + fieldPtr = + builder.create(loc, llvmPointerType, fieldPtr); + builder.create(loc, fieldPtr, elementPtr); } return arrayPtr; } @@ -706,7 +715,7 @@ std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name)); return LLVM::createGlobalString( loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()), - LLVM::Linkage::Internal); + LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers()); } // Emits LLVM IR to launch a kernel function. Expects the module that contains @@ -761,9 +770,9 @@ SmallString<128> nameBuffer(kernelModule.getName()); nameBuffer.append(kGpuBinaryStorageSuffix); - Value data = - LLVM::createGlobalString(loc, rewriter, nameBuffer.str(), - binaryAttr.getValue(), LLVM::Linkage::Internal); + Value data = LLVM::createGlobalString( + loc, rewriter, nameBuffer.str(), binaryAttr.getValue(), + LLVM::Linkage::Internal, getTypeConverter()->useOpaquePointers()); auto module = moduleLoadCallBuilder.create(loc, rewriter, data); // Get the function from the module. The name corresponds to the name of @@ -820,6 +829,9 @@ destinationType.getAddressSpace()), sourcePtr); + if (typeConverter.useOpaquePointers()) + return sourcePtr; + return rewriter.create(loc, destinationType, sourcePtr); } @@ -840,8 +852,10 @@ Type elementPtrType = getElementPtrType(memRefType); Value nullPtr = rewriter.create(loc, elementPtrType); - Value gepPtr = - rewriter.create(loc, elementPtrType, nullPtr, numElements); + Value gepPtr = rewriter.create( + loc, elementPtrType, + typeConverter->convertType(memRefType.getElementType()), nullPtr, + numElements); auto sizeBytes = rewriter.create(loc, getIndexType(), gepPtr); @@ -908,10 +922,10 @@ RewritePatternSet &patterns, StringRef gpuBinaryAnnotation, bool kernelBarePtrCallConv) { - converter.addConversion( - [context = &converter.getContext()](gpu::AsyncTokenType type) -> Type { - return LLVM::LLVMPointerType::get(IntegerType::get(context, 8)); - }); + converter.addConversion([&converter](gpu::AsyncTokenType type) -> Type { + return converter.getPointerType( + IntegerType::get(&converter.getContext(), 8)); + }); patterns.add(m.getOperation()))); if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout) options.overrideIndexBitwidth(indexBitwidth); + options.useOpaquePointers = useOpaquePointers; // Apply in-dialect lowering. In-dialect lowering will replace // ops which need to be lowered further, which is not supported by a diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -106,6 +106,7 @@ ctx, DataLayout(cast(m.getOperation()))); if (indexBitwidth != kDeriveIndexBitwidthFromDataLayout) options.overrideIndexBitwidth(indexBitwidth); + options.useOpaquePointers = useOpaquePointers; if (useBarePtrCallConv) { options.useBarePtrCallConv = true; diff --git a/mlir/lib/Conversion/GPUToVulkan/ConvertLaunchFuncToVulkanCalls.cpp b/mlir/lib/Conversion/GPUToVulkan/ConvertLaunchFuncToVulkanCalls.cpp --- a/mlir/lib/Conversion/GPUToVulkan/ConvertLaunchFuncToVulkanCalls.cpp +++ b/mlir/lib/Conversion/GPUToVulkan/ConvertLaunchFuncToVulkanCalls.cpp @@ -367,7 +367,8 @@ std::string entryPointGlobalName = (name + "_spv_entry_point_name").str(); return LLVM::createGlobalString(loc, builder, entryPointGlobalName, - shaderName, LLVM::Linkage::Internal); + shaderName, LLVM::Linkage::Internal, + /*TODO:useOpaquePointers=*/false); } void VulkanLaunchFuncToVulkanCallsPass::translateVulkanLaunchCall( @@ -385,7 +386,7 @@ // that data to runtime call. Value ptrToSPIRVBinary = LLVM::createGlobalString( loc, builder, kSPIRVBinary, spirvAttributes.first.getValue(), - LLVM::Linkage::Internal); + LLVM::Linkage::Internal, /*TODO:useOpaquePointers=*/false); // Create LLVM constant for the size of SPIR-V binary shader. Value binarySize = builder.create( diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -3264,7 +3264,8 @@ Value mlir::LLVM::createGlobalString(Location loc, OpBuilder &builder, StringRef name, StringRef value, - LLVM::Linkage linkage) { + LLVM::Linkage linkage, + bool useOpaquePointers) { assert(builder.getInsertionBlock() && builder.getInsertionBlock()->getParentOp() && "expected builder to point to a block constrained in an op"); @@ -3280,11 +3281,20 @@ loc, type, /*isConstant=*/true, linkage, name, builder.getStringAttr(value), /*alignment=*/0); + LLVMPointerType resultType; + LLVMPointerType charPtr; + if (!useOpaquePointers) { + resultType = LLVMPointerType::get(type); + charPtr = LLVMPointerType::get(IntegerType::get(ctx, 8)); + } else { + resultType = charPtr = LLVMPointerType::get(ctx); + } + // Get the pointer to the first character in the global string. - Value globalPtr = builder.create(loc, global); - return builder.create( - loc, LLVM::LLVMPointerType::get(IntegerType::get(ctx, 8)), globalPtr, - ArrayRef{0, 0}); + Value globalPtr = builder.create(loc, resultType, + global.getSymNameAttr()); + return builder.create(loc, charPtr, type, globalPtr, + ArrayRef{0, 0}); } bool mlir::LLVM::satisfiesLLVMModule(Operation *op) { diff --git a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-alloc-to-gpu-runtime-calls.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s +// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s module attributes {gpu.container_module} { // CHECK-LABEL: llvm.func @main @@ -11,8 +11,7 @@ // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]]) %1, %2 = gpu.alloc async [%0] (%size) : memref // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0] - // CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]] - // CHECK: llvm.call @mgpuMemFree(%[[void_ptr]], %[[stream]]) + // CHECK: llvm.call @mgpuMemFree(%[[float_ptr]], %[[stream]]) %3 = gpu.dealloc async [%2] %1 : memref // CHECK: llvm.call @mgpuStreamSynchronize(%[[stream]]) // CHECK: llvm.call @mgpuStreamDestroy(%[[stream]]) diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" | FileCheck %s -// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco" | FileCheck %s --check-prefix=ROCDL +// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin use-opaque-pointers=1" | FileCheck %s +// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco use-opaque-pointers=1" | FileCheck %s --check-prefix=ROCDL module attributes {gpu.container_module} { @@ -33,7 +33,7 @@ // CHECK-DAG: [[C8:%.*]] = llvm.mlir.constant(8 : index) : i64 // CHECK: [[ADDRESSOF:%.*]] = llvm.mlir.addressof @[[GLOBAL]] // CHECK: [[BINARY:%.*]] = llvm.getelementptr [[ADDRESSOF]]{{\[}}0, 0] - // CHECK-SAME: -> !llvm.ptr + // CHECK-SAME: -> !llvm.ptr // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]]) // CHECK: [[FUNC:%.*]] = llvm.call @mgpuModuleGetFunction([[MODULE]], {{.*}}) @@ -41,9 +41,9 @@ // CHECK: [[STREAM:%.*]] = llvm.call @mgpuStreamCreate // CHECK: [[NUM_PARAMS:%.*]] = llvm.mlir.constant(6 : i32) : i32 - // CHECK-NEXT: [[PARAMS:%.*]] = llvm.alloca [[NUM_PARAMS]] x !llvm.ptr + // CHECK-NEXT: [[PARAMS:%.*]] = llvm.alloca [[NUM_PARAMS]] x !llvm.ptr - // CHECK: [[EXTRA_PARAMS:%.*]] = llvm.mlir.null : !llvm.ptr> + // CHECK: [[EXTRA_PARAMS:%.*]] = llvm.mlir.null : !llvm.ptr // CHECK: llvm.call @mgpuLaunchKernel([[FUNC]], [[C8]], [[C8]], [[C8]], // CHECK-SAME: [[C8]], [[C8]], [[C8]], [[C256]], [[STREAM]], diff --git a/mlir/test/Conversion/GPUCommon/lower-memcpy-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-memcpy-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-memcpy-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-memcpy-to-gpu-runtime-calls.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s +// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s module attributes {gpu.container_module} { @@ -8,10 +8,8 @@ %t0 = gpu.wait async // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint // CHECK-NOT: llvm.addrspacecast - // CHECK: %[[src:.*]] = llvm.bitcast // CHECK: %[[addr_cast:.*]] = llvm.addrspacecast - // CHECK: %[[dst:.*]] = llvm.bitcast %[[addr_cast]] - // CHECK: llvm.call @mgpuMemcpy(%[[dst]], %[[src]], %[[size_bytes]], %[[t0]]) + // CHECK: llvm.call @mgpuMemcpy(%[[addr_cast]], %{{.*}}, %[[size_bytes]], %[[t0]]) %t1 = gpu.memcpy async [%t0] %dst, %src : memref<7xf32, 1>, memref<7xf32> // CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]]) // CHECK: llvm.call @mgpuStreamDestroy(%[[t0]]) diff --git a/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs-typed-pointers.mlir b/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs-typed-pointers.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs-typed-pointers.mlir @@ -0,0 +1,15 @@ +// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-opaque-pointers=0' | FileCheck %s --check-prefixes=CHECK,ROCDL +// RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-opaque-pointers=0' | FileCheck %s --check-prefixes=CHECK,NVVM + +gpu.module @kernel { + gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, #gpu.address_space>) { + %c0 = arith.constant 0 : index + memref.store %arg0, %arg1[%c0] : memref<4xf32, #gpu.address_space> + gpu.return + } +} + +// CHECK-LABEL: llvm.func @private +// CHECK: llvm.store +// ROCDL-SAME: : !llvm.ptr +// NVVM-SAME: : !llvm.ptr diff --git a/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir b/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir --- a/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-memory-space-attrs.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl | FileCheck %s --check-prefixes=CHECK,ROCDL -// RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm | FileCheck %s --check-prefixes=CHECK,NVVM +// RUN: mlir-opt %s -split-input-file -convert-gpu-to-rocdl='use-opaque-pointers=1' | FileCheck %s --check-prefixes=CHECK,ROCDL +// RUN: mlir-opt %s -split-input-file -convert-gpu-to-nvvm='use-opaque-pointers=1' | FileCheck %s --check-prefixes=CHECK,NVVM gpu.module @kernel { gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, #gpu.address_space>) { @@ -11,8 +11,8 @@ // CHECK-LABEL: llvm.func @private // CHECK: llvm.store -// ROCDL-SAME: : !llvm.ptr -// NVVM-SAME: : !llvm.ptr +// ROCDL-SAME: : f32, !llvm.ptr<5> +// NVVM-SAME: : f32, !llvm.ptr // ----- @@ -27,7 +27,7 @@ // CHECK-LABEL: llvm.func @workgroup // CHECK: llvm.store -// CHECK-SAME: : !llvm.ptr +// CHECK-SAME: : f32, !llvm.ptr<3> // ----- @@ -42,7 +42,7 @@ // CHECK-LABEL: llvm.func @nested_memref // CHECK: llvm.load -// CHECK-SAME: : !llvm.ptr<{{.*}}, 1> +// CHECK-SAME: : !llvm.ptr<1> // CHECK: [[value:%.+]] = llvm.load -// CHECK-SAME: : !llvm.ptr +// CHECK-SAME: : !llvm.ptr<1> -> f32 // CHECK: llvm.return [[value]] diff --git a/mlir/test/Conversion/GPUCommon/lower-memset-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-memset-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-memset-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-memset-to-gpu-runtime-calls.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s +// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s module attributes {gpu.container_module} { @@ -7,10 +7,8 @@ // CHECK: %[[t0:.*]] = llvm.call @mgpuStreamCreate %t0 = gpu.wait async // CHECK: %[[size_bytes:.*]] = llvm.mlir.constant - // CHECK: %[[value:.*]] = llvm.bitcast // CHECK: %[[addr_cast:.*]] = llvm.addrspacecast - // CHECK: %[[dst:.*]] = llvm.bitcast %[[addr_cast]] - // CHECK: llvm.call @mgpuMemset32(%[[dst]], %[[value]], %[[size_bytes]], %[[t0]]) + // CHECK: llvm.call @mgpuMemset32(%[[addr_cast]], %{{.*}}, %[[size_bytes]], %[[t0]]) %t1 = gpu.memset async [%t0] %dst, %value : memref<7xf32, 1>, f32 // CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]]) // CHECK: llvm.call @mgpuStreamDestroy(%[[t0]]) diff --git a/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-wait-to-gpu-runtime-calls.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s +// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s module attributes {gpu.container_module} { diff --git a/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir b/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir --- a/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir +++ b/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir @@ -1,18 +1,18 @@ -// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck --check-prefix=NVVM %s -// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck --check-prefix=ROCDL %s +// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm='use-opaque-pointers=1' --split-input-file %s | FileCheck --check-prefix=NVVM %s +// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl='use-opaque-pointers=1' --split-input-file %s | FileCheck --check-prefix=ROCDL %s gpu.module @kernel { // NVVM-LABEL: llvm.func @private gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, #gpu.address_space>) { // Allocate private memory inside the function. // NVVM: %[[size:.*]] = llvm.mlir.constant(4 : i64) : i64 - // NVVM: %[[raw:.*]] = llvm.alloca %[[size]] x f32 : (i64) -> !llvm.ptr + // NVVM: %[[raw:.*]] = llvm.alloca %[[size]] x f32 : (i64) -> !llvm.ptr // ROCDL: %[[size:.*]] = llvm.mlir.constant(4 : i64) : i64 - // ROCDL: %[[raw:.*]] = llvm.alloca %[[size]] x f32 : (i64) -> !llvm.ptr + // ROCDL: %[[raw:.*]] = llvm.alloca %[[size]] x f32 : (i64) -> !llvm.ptr<5> // Populate the memref descriptor. - // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> // NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] // NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64 @@ -22,7 +22,7 @@ // NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64 // NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0] - // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<5>, ptr<5>, i64, array<1 x i64>, array<1 x i64>)> // ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] // ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64 @@ -67,16 +67,16 @@ // ROCDL-SAME: { gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, #gpu.address_space>) { // Get the address of the first element in the global array. - // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr, 3> + // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3> // NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0] - // NVVM-SAME: !llvm.ptr + // NVVM-SAME: !llvm.ptr<3> - // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr, 3> + // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3> // ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0] - // ROCDL-SAME: !llvm.ptr + // ROCDL-SAME: !llvm.ptr<3> // Populate the memref descriptor. - // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] // NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64 @@ -86,7 +86,7 @@ // NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64 // NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0] - // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<1 x i64>, array<1 x i64>)> + // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<1 x i64>, array<1 x i64>)> // ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] // ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64 @@ -128,16 +128,16 @@ // ROCDL-LABEL: llvm.func @workgroup3d gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, #gpu.address_space>) { // Get the address of the first element in the global array. - // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr, 3> + // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3> // NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0] - // NVVM-SAME: !llvm.ptr + // NVVM-SAME: !llvm.ptr<3> - // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr, 3> + // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[$buffer]] : !llvm.ptr<3> // ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][0, 0] - // ROCDL-SAME: !llvm.ptr + // ROCDL-SAME: !llvm.ptr<3> // Populate the memref descriptor. - // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<3 x i64>, array<3 x i64>)> // NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] // NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64 @@ -155,7 +155,7 @@ // NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : i64 // NVVM: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2] - // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm.struct<(ptr<3>, ptr<3>, i64, array<3 x i64>, array<3 x i64>)> // ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] // ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : i64 @@ -208,14 +208,14 @@ // Private buffers. // NVVM: %[[c3:.*]] = llvm.mlir.constant(3 : i64) - // NVVM: llvm.alloca %[[c3]] x f32 : (i64) -> !llvm.ptr + // NVVM: llvm.alloca %[[c3]] x f32 : (i64) -> !llvm.ptr // NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : i64) - // NVVM: llvm.alloca %[[c4]] x f32 : (i64) -> !llvm.ptr + // NVVM: llvm.alloca %[[c4]] x f32 : (i64) -> !llvm.ptr // ROCDL: %[[c3:.*]] = llvm.mlir.constant(3 : i64) - // ROCDL: llvm.alloca %[[c3]] x f32 : (i64) -> !llvm.ptr + // ROCDL: llvm.alloca %[[c3]] x f32 : (i64) -> !llvm.ptr<5> // ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : i64) - // ROCDL: llvm.alloca %[[c4]] x f32 : (i64) -> !llvm.ptr + // ROCDL: llvm.alloca %[[c4]] x f32 : (i64) -> !llvm.ptr<5> %c0 = arith.constant 0 : index memref.store %arg0, %arg1[%c0] : memref<1xf32, #gpu.address_space> diff --git a/mlir/test/Conversion/GPUCommon/transfer_write.mlir b/mlir/test/Conversion/GPUCommon/transfer_write.mlir --- a/mlir/test/Conversion/GPUCommon/transfer_write.mlir +++ b/mlir/test/Conversion/GPUCommon/transfer_write.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s --gpu-to-llvm | FileCheck %s +// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s func.func @warp_extract(%arg0: index, %arg1: memref<1024x1024xf32>, %arg2: index, %arg3: vector<1xf32>) { %c0 = arith.constant 0 : index diff --git a/mlir/test/Conversion/GPUCommon/typed-pointers.mlir b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/GPUCommon/typed-pointers.mlir @@ -0,0 +1,61 @@ +// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=0' --split-input-file | FileCheck %s + +module attributes {gpu.container_module} { + // CHECK-LABEL: llvm.func @main + // CHECK-SAME: %[[size:.*]]: i64 + func.func @main(%size : index) { + // CHECK: %[[stream:.*]] = llvm.call @mgpuStreamCreate() + %0 = gpu.wait async + // CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}}[%[[size]]] + // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint %[[gep]] + // CHECK: llvm.call @mgpuMemAlloc(%[[size_bytes]], %[[stream]]) + %1, %2 = gpu.alloc async [%0] (%size) : memref + // CHECK: %[[float_ptr:.*]] = llvm.extractvalue {{.*}}[0] + // CHECK: %[[void_ptr:.*]] = llvm.bitcast %[[float_ptr]] + // CHECK: llvm.call @mgpuMemFree(%[[void_ptr]], %[[stream]]) + %3 = gpu.dealloc async [%2] %1 : memref + // CHECK: llvm.call @mgpuStreamSynchronize(%[[stream]]) + // CHECK: llvm.call @mgpuStreamDestroy(%[[stream]]) + gpu.wait [%3] + return + } + + // CHECK: func @foo + func.func @foo(%dst : memref<7xf32, 1>, %src : memref<7xf32>) { + // CHECK: %[[t0:.*]] = llvm.call @mgpuStreamCreate + %t0 = gpu.wait async + // CHECK: %[[size_bytes:.*]] = llvm.ptrtoint + // CHECK-NOT: llvm.addrspacecast + // CHECK: %[[src:.*]] = llvm.bitcast + // CHECK: %[[addr_cast:.*]] = llvm.addrspacecast + // CHECK: %[[dst:.*]] = llvm.bitcast %[[addr_cast]] + // CHECK: llvm.call @mgpuMemcpy(%[[dst]], %[[src]], %[[size_bytes]], %[[t0]]) + %t1 = gpu.memcpy async [%t0] %dst, %src : memref<7xf32, 1>, memref<7xf32> + // CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]]) + // CHECK: llvm.call @mgpuStreamDestroy(%[[t0]]) + gpu.wait [%t1] + return + } +} + +// ----- + +module attributes {gpu.container_module} { + + // CHECK: func @foo + func.func @foo(%dst : memref<7xf32, 1>, %value : f32) { + // CHECK: %[[t0:.*]] = llvm.call @mgpuStreamCreate + %t0 = gpu.wait async + // CHECK: %[[size_bytes:.*]] = llvm.mlir.constant + // CHECK: %[[value:.*]] = llvm.bitcast + // CHECK: %[[addr_cast:.*]] = llvm.addrspacecast + // CHECK: %[[dst:.*]] = llvm.bitcast %[[addr_cast]] + // CHECK: llvm.call @mgpuMemset32(%[[dst]], %[[value]], %[[size_bytes]], %[[t0]]) + %t1 = gpu.memset async [%t0] %dst, %value : memref<7xf32, 1>, f32 + // CHECK: llvm.call @mgpuStreamSynchronize(%[[t0]]) + // CHECK: llvm.call @mgpuStreamDestroy(%[[t0]]) + gpu.wait [%t1] + return + } +} + diff --git a/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir b/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir --- a/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir +++ b/mlir/test/Conversion/GPUToCUDA/lower-nvvm-kernel-to-cubin.mlir @@ -2,7 +2,7 @@ // CHECK: gpu.module @foo attributes {gpu.binary = "CUBIN"} gpu.module @foo { - llvm.func @kernel(%arg0 : f32, %arg1 : !llvm.ptr) + llvm.func @kernel(%arg0 : f32, %arg1 : !llvm.ptr) // CHECK: attributes {gpu.kernel} attributes { gpu.kernel } { llvm.return diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir --- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1' -split-input-file | FileCheck %s -// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s +// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 use-opaque-pointers=1' -split-input-file | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 index-bitwidth=32 use-opaque-pointers=1' -split-input-file | FileCheck --check-prefix=CHECK32 %s gpu.module @test_module { // CHECK-LABEL: func @gpu_index_ops() diff --git a/mlir/test/Conversion/GPUToNVVM/typed-pointers.mlir b/mlir/test/Conversion/GPUToNVVM/typed-pointers.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/GPUToNVVM/typed-pointers.mlir @@ -0,0 +1,39 @@ +// RUN: mlir-opt --convert-gpu-to-nvvm="use-opaque-pointers=0" --split-input-file %s | FileCheck %s +// RUN: mlir-opt --convert-gpu-to-nvvm="index-bitwidth=32 use-opaque-pointers=0" --split-input-file %s | FileCheck --check-prefix=CHECK32 %s + +gpu.module @test_module { + + // CHECK-LABEL: func @gpu_wmma_load_op() -> + // CHECK-SAME: !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + // CHECK32-LABEL: func @gpu_wmma_load_op() -> + func.func @gpu_wmma_load_op() -> (!gpu.mma_matrix<16x16xf16, "AOp">) { + %wg = memref.alloca() {alignment = 32} : memref<32x32xf16, 3> + %i = arith.constant 16 : index + %j = arith.constant 16 : index + %0 = gpu.subgroup_mma_load_matrix %wg[%i, %j] {leadDimension = 32 : index, transpose} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp"> + // CHECK: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i64 + // CHECK: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] + // CHECK: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i64 + // CHECK: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i64 + // CHECK: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i64 + // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr, i64) -> !llvm.ptr + // CHECK: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 + // CHECK: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]] + // CHECK-SAME: {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + // CHECK: llvm.return %[[FRAG]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + + // CHECK32: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i32 + // CHECK32: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] + // CHECK32: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr, ptr, i32, array<2 x i32>, array<2 x i32>)> + // CHECK32: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i32 + // CHECK32: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i32 + // CHECK32: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i32 + // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr, i32) -> !llvm.ptr + // CHECK32: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 + // CHECK32: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]] + // CHECK32-SAME: {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + // CHECK32: llvm.return %[[FRAG]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + return %0 : !gpu.mma_matrix<16x16xf16, "AOp"> + } +} diff --git a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir --- a/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/wmma-ops-to-nvvm.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s -// RUN: mlir-opt --convert-gpu-to-nvvm="index-bitwidth=32" --split-input-file %s | FileCheck --check-prefix=CHECK32 %s +// RUN: mlir-opt --convert-gpu-to-nvvm='use-opaque-pointers=1' --split-input-file %s | FileCheck %s +// RUN: mlir-opt --convert-gpu-to-nvvm="index-bitwidth=32 use-opaque-pointers=1" --split-input-file %s | FileCheck --check-prefix=CHECK32 %s gpu.module @test_module { @@ -13,26 +13,26 @@ %0 = gpu.subgroup_mma_load_matrix %wg[%i, %j] {leadDimension = 32 : index, transpose} : memref<32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp"> // CHECK: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i64 // CHECK: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] - // CHECK: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i64 // CHECK: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i64 // CHECK: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i64 - // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr, i64) -> !llvm.ptr + // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 // CHECK: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]] - // CHECK-SAME: {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + // CHECK-SAME: {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: llvm.return %[[FRAG]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK32: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i32 // CHECK32: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] - // CHECK32: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr, ptr, i32, array<2 x i32>, array<2 x i32>)> + // CHECK32: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<2 x i32>, array<2 x i32>)> // CHECK32: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK32: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i32 // CHECK32: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i32 - // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr, i32) -> !llvm.ptr + // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, f16 // CHECK32: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK32: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]] - // CHECK32-SAME: {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> + // CHECK32-SAME: {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK32: llvm.return %[[FRAG]] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> return %0 : !gpu.mma_matrix<16x16xf16, "AOp"> } @@ -52,26 +52,26 @@ %0 = gpu.subgroup_mma_load_matrix %wg[%i, %j] {leadDimension = 32 : index, transpose} : memref<32x32xi8, 3> -> !gpu.mma_matrix<16x16xsi8, "AOp"> // CHECK: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i64 // CHECK: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] - // CHECK: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i64 // CHECK: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i64 // CHECK: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i64 - // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr, i64) -> !llvm.ptr + // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, i8 // CHECK: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]] - // CHECK-SAME: {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(i32, i32)> + // CHECK-SAME: {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)> // CHECK: llvm.return %[[FRAG]] : !llvm.struct<(i32, i32)> // CHECK32: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i32 // CHECK32: %{{.*}} = llvm.insertvalue %{{.*}}, %{{.*}}[{{.*}}, {{.*}}] - // CHECK32: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr, ptr, i32, array<2 x i32>, array<2 x i32>)> + // CHECK32: %[[BASE:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<2 x i32>, array<2 x i32>)> // CHECK32: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK32: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i32 // CHECK32: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i32 - // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr, i32) -> !llvm.ptr + // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, i8 // CHECK32: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK32: %[[FRAG:.*]] = nvvm.wmma.load %[[ADDRESS]], %[[LDM32]] - // CHECK32-SAME: {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(i32, i32)> + // CHECK32-SAME: {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32)> // CHECK32: llvm.return %[[FRAG]] : !llvm.struct<(i32, i32)> return %0 : !gpu.mma_matrix<16x16xsi8, "AOp"> } @@ -96,14 +96,14 @@ // CHECK: %[[EL2:.*]] = llvm.extractvalue %[[D]][1] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[EL3:.*]] = llvm.extractvalue %[[D]][2] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[EL4:.*]] = llvm.extractvalue %[[D]][3] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> - // CHECK: %[[BASE:.*]] = llvm.extractvalue %17[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + // CHECK: %[[BASE:.*]] = llvm.extractvalue %17[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)> // CHECK: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i64 // CHECK: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i64 // CHECK: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i64 - // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr, i64) -> !llvm.ptr + // CHECK: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f16 // CHECK: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK: nvvm.wmma.store %[[ADDRESS]], %[[LDM32]], %[[EL1]], %[[EL2]], %[[EL3]], %[[EL4]] - // CHECK-SAME: {eltype = #nvvm.mma_type, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16> + // CHECK-SAME: {eltype = #nvvm.mma_type, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : !llvm.ptr<3>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16> // CHECK: llvm.return // CHECK32: %[[INX:.*]] = llvm.mlir.constant(16 : index) : i32 @@ -112,14 +112,14 @@ // CHECK32: %[[EL2:.*]] = llvm.extractvalue %[[D]][1] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK32: %[[EL3:.*]] = llvm.extractvalue %[[D]][2] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK32: %[[EL4:.*]] = llvm.extractvalue %[[D]][3] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> - // CHECK32: %[[BASE:.*]] = llvm.extractvalue %17[1] : !llvm.struct<(ptr, ptr, i32, array<2 x i32>, array<2 x i32>)> + // CHECK32: %[[BASE:.*]] = llvm.extractvalue %17[1] : !llvm.struct<(ptr<3>, ptr<3>, i32, array<2 x i32>, array<2 x i32>)> // CHECK32: %[[LDM:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK32: %[[LI:.*]] = llvm.mul %[[INX]], %[[LDM]] : i32 // CHECK32: %[[LIJ:.*]] = llvm.add %[[LI]], %[[INX]] : i32 - // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr, i32) -> !llvm.ptr + // CHECK32: %[[ADDRESS:.*]] = llvm.getelementptr %[[BASE]][%[[LIJ]]] : (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, f16 // CHECK32: %[[LDM32:.*]] = llvm.mlir.constant(32 : index) : i32 // CHECK32: nvvm.wmma.store %[[ADDRESS]], %[[LDM32]], %[[EL1]], %[[EL2]], %[[EL3]], %[[EL4]] - // CHECK32-SAME: {eltype = #nvvm.mma_type, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16> + // CHECK32-SAME: {eltype = #nvvm.mma_type, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : !llvm.ptr<3>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16> // CHECK32: llvm.return return } @@ -195,13 +195,13 @@ gpu.module @test_module { // CHECK-LABEL: func @gpu_wmma_mma_loop_op -// CHECK: %[[C:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> +// CHECK: %[[C:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: llvm.br ^bb1(%{{.*}}, %[[C]] : i64, !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>) // CHECK: ^bb1(%{{.*}}: i64, %[[ACC:.+]]: !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)>): // 2 preds: ^bb0, ^bb2 // CHECK: llvm.cond_br %{{.*}}, ^bb2, ^bb3 // CHECK: ^bb2: // pred: ^bb1 -// CHECK: %[[A:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> -// CHECK: %[[B:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> +// CHECK: %[[A:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> +// CHECK: %[[B:.+]] = nvvm.wmma.load %{{.*}}, %{{.*}} {eltype = #nvvm.mma_type, frag = #nvvm.mma_frag, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : (!llvm.ptr) -> !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[A0:.+]] = llvm.extractvalue %[[A]][0] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[A1:.+]] = llvm.extractvalue %[[A]][1] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[A2:.+]] = llvm.extractvalue %[[A]][2] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> @@ -229,7 +229,7 @@ // CHECK: %[[E1:.+]] = llvm.extractvalue %[[ACC]][1] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[E2:.+]] = llvm.extractvalue %[[ACC]][2] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> // CHECK: %[[E3:.+]] = llvm.extractvalue %[[ACC]][3] : !llvm.struct<(vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16>)> -// CHECK: nvvm.wmma.store %{{.*}}, %{{.*}}, %[[E0]], %[[E1]], %[[E2]], %[[E3]] {eltype = #nvvm.mma_type, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16> +// CHECK: nvvm.wmma.store %{{.*}}, %{{.*}}, %[[E0]], %[[E1]], %[[E2]], %[[E3]] {eltype = #nvvm.mma_type, k = 16 : i32, layout = #nvvm.mma_layout, m = 16 : i32, n = 16 : i32} : !llvm.ptr, vector<2xf16>, vector<2xf16>, vector<2xf16>, vector<2xf16> func.func @gpu_wmma_mma_loop_op(%arg0: memref<128x128xf16>, %arg1: memref<128x128xf16>, %arg2: memref<128x128xf16>) { %c0 = arith.constant 0 : index diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-hip.mlir @@ -1,22 +1,22 @@ -// RUN: mlir-opt %s -convert-gpu-to-rocdl=runtime=HIP -split-input-file | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=HIP use-opaque-pointers=1' -split-input-file | FileCheck %s gpu.module @test_module { // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL0:[A-Za-z0-9_]+]]("Hello, world\0A\00") // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL1:[A-Za-z0-9_]+]]("Hello: %d\0A\00") // CHECK-DAG: llvm.func @__ockl_printf_append_args(i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64 - // CHECK-DAG: llvm.func @__ockl_printf_append_string_n(i64, !llvm.ptr, i64, i32) -> i64 + // CHECK-DAG: llvm.func @__ockl_printf_append_string_n(i64, !llvm.ptr, i64, i32) -> i64 // CHECK-DAG: llvm.func @__ockl_printf_begin(i64) -> i64 // CHECK-LABEL: func @test_const_printf gpu.func @test_const_printf() { // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64 // CHECK-NEXT: %[[DESC0:.*]] = llvm.call @__ockl_printf_begin(%0) : (i64) -> i64 - // CHECK-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL0]] : !llvm.ptr> - // CHECK-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][0, 0] : (!llvm.ptr>) -> !llvm.ptr + // CHECK-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL0]] : !llvm.ptr + // CHECK-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<14 x i8> // CHECK-NEXT: %[[FORMATLEN:.*]] = llvm.mlir.constant(14 : i64) : i64 // CHECK-NEXT: %[[ISLAST:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-NEXT: %[[ISNTLAST:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISLAST]]) : (i64, !llvm.ptr, i64, i32) -> i64 + // CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISLAST]]) : (i64, !llvm.ptr, i64, i32) -> i64 gpu.printf "Hello, world\n" gpu.return } @@ -27,12 +27,12 @@ gpu.func @test_printf(%arg0: i32) { // CHECK: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64 // CHECK-NEXT: %[[DESC0:.*]] = llvm.call @__ockl_printf_begin(%0) : (i64) -> i64 - // CHECK-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL1]] : !llvm.ptr> - // CHECK-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][0, 0] : (!llvm.ptr>) -> !llvm.ptr + // CHECK-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL1]] : !llvm.ptr + // CHECK-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<11 x i8> // CHECK-NEXT: %[[FORMATLEN:.*]] = llvm.mlir.constant(11 : i64) : i64 // CHECK-NEXT: %[[ISLAST:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-NEXT: %[[ISNTLAST:.*]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK-NEXT: %[[DESC1:.*]] = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISNTLAST]]) : (i64, !llvm.ptr, i64, i32) -> i64 + // CHECK-NEXT: %[[DESC1:.*]] = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISNTLAST]]) : (i64, !llvm.ptr, i64, i32) -> i64 // CHECK-NEXT: %[[NARGS1:.*]] = llvm.mlir.constant(1 : i32) : i32 // CHECK-NEXT: %[[ARG0_64:.*]] = llvm.zext %[[ARG0]] : i32 to i64 // CHECK-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_args(%[[DESC1]], %[[NARGS1]], %[[ARG0_64]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[ISLAST]]) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64 diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl-opencl.mlir @@ -1,14 +1,14 @@ -// RUN: mlir-opt %s -convert-gpu-to-rocdl=runtime=OpenCL | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-rocdl='runtime=OpenCL use-opaque-pointers=1' | FileCheck %s gpu.module @test_module { // CHECK: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00") {addr_space = 4 : i32} - // CHECK: llvm.func @printf(!llvm.ptr, ...) -> i32 + // CHECK: llvm.func @printf(!llvm.ptr<4>, ...) -> i32 // CHECK-LABEL: func @test_printf // CHECK: (%[[ARG0:.*]]: i32) gpu.func @test_printf(%arg0: i32) { - // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr, 4> - // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr, 4>) -> !llvm.ptr - // CHECK-NEXT: %{{.*}} = llvm.call @printf(%[[IMM2]], %[[ARG0]]) : (!llvm.ptr, i32) -> i32 + // CHECK: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr<4> + // CHECK-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr<4>) -> !llvm.ptr<4>, !llvm.array<11 x i8> + // CHECK-NEXT: %{{.*}} = llvm.call @printf(%[[IMM2]], %[[ARG0]]) : (!llvm.ptr<4>, i32) -> i32 gpu.printf "Hello: %d\n" %arg0 : i32 gpu.return } diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s -// RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32' -split-input-file | FileCheck --check-prefix=CHECK32 %s +// RUN: mlir-opt %s -convert-gpu-to-rocdl='use-opaque-pointers=1' -split-input-file | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-rocdl='index-bitwidth=32 use-opaque-pointers=1' -split-input-file | FileCheck --check-prefix=CHECK32 %s gpu.module @test_module { // CHECK-LABEL: func @gpu_index_ops() @@ -461,7 +461,7 @@ } } -// ---- +// ----- gpu.module @module { // CHECK-LABEL: @spirv_exp diff --git a/mlir/test/Conversion/GPUToROCDL/memref.mlir b/mlir/test/Conversion/GPUToROCDL/memref.mlir --- a/mlir/test/Conversion/GPUToROCDL/memref.mlir +++ b/mlir/test/Conversion/GPUToROCDL/memref.mlir @@ -1,14 +1,14 @@ -// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-rocdl='use-opaque-pointers=1' -split-input-file | FileCheck %s // RUN: mlir-opt %s \ -// RUN: -convert-gpu-to-rocdl=use-bare-ptr-memref-call-conv=true \ +// RUN: -convert-gpu-to-rocdl='use-bare-ptr-memref-call-conv=true use-opaque-pointers=1' \ // RUN: -split-input-file \ // RUN: | FileCheck %s --check-prefix=BARE gpu.module @memref_conversions { // CHECK: llvm.func @kern - // CHECK-SAME: (%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64) + // CHECK-SAME: (%{{.*}}: !llvm.ptr, %{{.*}}: !llvm.ptr, %{{.*}}: i64, %{{.*}}: i64, %{{.*}}: i64) // BARE: llvm.func @kern - // BARE-SAME: (%{{.*}}: !llvm.ptr) + // BARE-SAME: (%{{.*}}: !llvm.ptr) gpu.func @kern(%arg0: memref<8xf32>) kernel { gpu.return } diff --git a/mlir/test/Conversion/GPUToROCDL/typed-pointers.mlir b/mlir/test/Conversion/GPUToROCDL/typed-pointers.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/GPUToROCDL/typed-pointers.mlir @@ -0,0 +1,34 @@ +// RUN: mlir-opt %s -convert-gpu-to-rocdl="runtime=HIP use-opaque-pointers=0" -split-input-file | FileCheck --check-prefixes=CHECK,HIP %s +// RUN: mlir-opt %s -convert-gpu-to-rocdl="runtime=OpenCL use-opaque-pointers=0" | FileCheck --check-prefixes=CHECK,OCL %s + +gpu.module @test_module { + // HIP-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL1:[A-Za-z0-9_]+]]("Hello: %d\0A\00") + // HIP-DAG: llvm.func @__ockl_printf_append_args(i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64 + // HIP-DAG: llvm.func @__ockl_printf_append_string_n(i64, !llvm.ptr, i64, i32) -> i64 + // HIP-DAG: llvm.func @__ockl_printf_begin(i64) -> i64 + + // OCL: llvm.mlir.global internal constant @[[$PRINT_GLOBAL:[A-Za-z0-9_]+]]("Hello: %d\0A\00") {addr_space = 4 : i32} + // OCL: llvm.func @printf(!llvm.ptr, ...) -> i32 + // CHECK-LABEL: func @test_printf + // CHECK: (%[[ARG0:.*]]: i32) + gpu.func @test_printf(%arg0: i32) { + // OCL: %[[IMM0:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL]] : !llvm.ptr, 4> + // OCL-NEXT: %[[IMM2:.*]] = llvm.getelementptr %[[IMM0]][0, 0] : (!llvm.ptr, 4>) -> !llvm.ptr + // OCL-NEXT: %{{.*}} = llvm.call @printf(%[[IMM2]], %[[ARG0]]) : (!llvm.ptr, i32) -> i32 + + // HIP: %[[CST0:.*]] = llvm.mlir.constant(0 : i64) : i64 + // HIP-NEXT: %[[DESC0:.*]] = llvm.call @__ockl_printf_begin(%0) : (i64) -> i64 + // HIP-NEXT: %[[FORMATSTR:.*]] = llvm.mlir.addressof @[[$PRINT_GLOBAL1]] : !llvm.ptr> + // HIP-NEXT: %[[FORMATSTART:.*]] = llvm.getelementptr %[[FORMATSTR]][0, 0] : (!llvm.ptr>) -> !llvm.ptr + // HIP-NEXT: %[[FORMATLEN:.*]] = llvm.mlir.constant(11 : i64) : i64 + // HIP-NEXT: %[[ISLAST:.*]] = llvm.mlir.constant(1 : i32) : i32 + // HIP-NEXT: %[[ISNTLAST:.*]] = llvm.mlir.constant(0 : i32) : i32 + // HIP-NEXT: %[[DESC1:.*]] = llvm.call @__ockl_printf_append_string_n(%[[DESC0]], %[[FORMATSTART]], %[[FORMATLEN]], %[[ISNTLAST]]) : (i64, !llvm.ptr, i64, i32) -> i64 + // HIP-NEXT: %[[NARGS1:.*]] = llvm.mlir.constant(1 : i32) : i32 + // HIP-NEXT: %[[ARG0_64:.*]] = llvm.zext %[[ARG0]] : i32 to i64 + // HIP-NEXT: %{{.*}} = llvm.call @__ockl_printf_append_args(%[[DESC1]], %[[NARGS1]], %[[ARG0_64]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[CST0]], %[[ISLAST]]) : (i64, i32, i64, i64, i64, i64, i64, i64, i64, i32) -> i64 + + gpu.printf "Hello: %d\n" %arg0 : i32 + gpu.return + } +} diff --git a/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir --- a/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir +++ b/mlir/test/Conversion/GPUToROCm/lower-rocdl-kernel-to-hsaco.mlir @@ -2,7 +2,7 @@ // CHECK: gpu.module @foo attributes {gpu.binary = "HSACO"} gpu.module @foo { - llvm.func @kernel(%arg0 : f32, %arg1 : !llvm.ptr) + llvm.func @kernel(%arg0 : f32, %arg1 : !llvm.ptr) // CHECK: attributes {gpu.kernel} attributes { gpu.kernel } { llvm.return