diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h --- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h +++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h @@ -11,11 +11,19 @@ #include namespace mlir { +class LLVMTypeConverter; +class OwningRewritePatternList; + +template +class OperationPass; namespace gpu { class GPUModuleOp; } // namespace gpu -template class OperationPass; + +/// Collect a set of patterns to convert from the GPU dialect to ROCDL. +void populateGpuToROCDLConversionPatterns(LLVMTypeConverter &converter, + OwningRewritePatternList &patterns); /// Creates a pass that lowers GPU dialect operations to ROCDL counterparts. std::unique_ptr> diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h new file mode 100644 --- /dev/null +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h @@ -0,0 +1,171 @@ +//===- GPUOpsLowering.h - GPU FuncOp / ReturnOp lowering -------*- C++ -*--===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_ +#define MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_ + +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" +#include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Builders.h" + +namespace mlir { + +template +struct GPUFuncOpLowering : ConvertToLLVMPattern { + explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter) + : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(), + typeConverter.getDialect()->getContext(), + typeConverter) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + assert(operands.empty() && "func op is not expected to have operands"); + auto gpuFuncOp = cast(op); + Location loc = gpuFuncOp.getLoc(); + + SmallVector workgroupBuffers; + workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions()); + for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) { + Value attribution = en.value(); + + auto type = attribution.getType().dyn_cast(); + assert(type && type.hasStaticShape() && "unexpected type in attribution"); + + uint64_t numElements = type.getNumElements(); + + auto elementType = typeConverter.convertType(type.getElementType()) + .template cast(); + auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements); + std::string name = std::string( + llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index())); + auto globalOp = rewriter.create( + gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false, + LLVM::Linkage::Internal, name, /*value=*/Attribute(), + gpu::GPUDialect::getWorkgroupAddressSpace()); + workgroupBuffers.push_back(globalOp); + } + + // Rewrite the original GPU function to an LLVM function. 
+ auto funcType = typeConverter.convertType(gpuFuncOp.getType()) + .template cast() + .getPointerElementTy(); + + // Remap proper input types. + TypeConverter::SignatureConversion signatureConversion( + gpuFuncOp.front().getNumArguments()); + typeConverter.convertFunctionSignature( + gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion); + + // Create the new function operation. Only copy those attributes that are + // not specific to function modeling. + SmallVector attributes; + for (const auto &attr : gpuFuncOp.getAttrs()) { + if (attr.first == SymbolTable::getSymbolAttrName() || + attr.first == impl::getTypeAttrName() || + attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName()) + continue; + attributes.push_back(attr); + } + auto llvmFuncOp = rewriter.create( + gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType, + LLVM::Linkage::External, attributes); + + { + // Insert operations that correspond to converted workgroup and private + // memory attributions to the body of the function. This must operate on + // the original function, before the body region is inlined in the new + // function to maintain the relation between block arguments and the + // parent operation that assigns their semantics. + OpBuilder::InsertionGuard guard(rewriter); + + // Rewrite workgroup memory attributions to addresses of global buffers. 
+ rewriter.setInsertionPointToStart(&gpuFuncOp.front()); + unsigned numProperArguments = gpuFuncOp.getNumArguments(); + auto i32Type = LLVM::LLVMType::getInt32Ty(typeConverter.getDialect()); + + Value zero = nullptr; + if (!workgroupBuffers.empty()) + zero = rewriter.create(loc, i32Type, + rewriter.getI32IntegerAttr(0)); + for (auto en : llvm::enumerate(workgroupBuffers)) { + LLVM::GlobalOp global = en.value(); + Value address = rewriter.create(loc, global); + auto elementType = global.getType().getArrayElementType(); + Value memory = rewriter.create( + loc, elementType.getPointerTo(global.addr_space().getZExtValue()), + address, ArrayRef{zero, zero}); + + // Build a memref descriptor pointing to the buffer to plug with the + // existing memref infrastructure. This may use more registers than + // otherwise necessary given that memref sizes are fixed, but we can try + // and canonicalize that away later. + Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()]; + auto type = attribution.getType().cast(); + auto descr = MemRefDescriptor::fromStaticShape( + rewriter, loc, typeConverter, type, memory); + signatureConversion.remapInput(numProperArguments + en.index(), descr); + } + + // Rewrite private memory attributions to alloca'ed buffers. + unsigned numWorkgroupAttributions = + gpuFuncOp.getNumWorkgroupAttributions(); + auto int64Ty = LLVM::LLVMType::getInt64Ty(typeConverter.getDialect()); + for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) { + Value attribution = en.value(); + auto type = attribution.getType().cast(); + assert(type && type.hasStaticShape() && + "unexpected type in attribution"); + + // Allocate in the `AllocaAddrSpace` address space. NVVM instantiates this + // with 0 since it models private memory as `alloca`s in the default memory + // space and does not support `alloca`s with addrspace(5); ROCDL uses 5. 
+ auto ptrType = typeConverter.convertType(type.getElementType()) + .template cast() + .getPointerTo(AllocaAddrSpace); + Value numElements = rewriter.create( + gpuFuncOp.getLoc(), int64Ty, + rewriter.getI64IntegerAttr(type.getNumElements())); + Value allocated = rewriter.create( + gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0); + auto descr = MemRefDescriptor::fromStaticShape( + rewriter, loc, typeConverter, type, allocated); + signatureConversion.remapInput( + numProperArguments + numWorkgroupAttributions + en.index(), descr); + } + } + + // Move the region to the new function, update the entry block signature. + rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(), + llvmFuncOp.end()); + rewriter.applySignatureConversion(&llvmFuncOp.getBody(), + signatureConversion); + + rewriter.eraseOp(gpuFuncOp); + return success(); + } +}; + +struct GPUReturnOpLowering : public ConvertToLLVMPattern { + GPUReturnOpLowering(LLVMTypeConverter &typeConverter) + : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(), + typeConverter.getDialect()->getContext(), + typeConverter) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp(op, operands); + return success(); + } +}; + +} // namespace mlir + +#endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_ diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -21,6 +21,7 @@ #include "mlir/Transforms/DialectConversion.h" #include "llvm/Support/FormatVariadic.h" +#include "../GPUCommon/GPUOpsLowering.h" #include "../GPUCommon/IndexIntrinsicsOpLowering.h" #include "../GPUCommon/OpToFuncCallLowering.h" #include "../PassDetail.h" @@ -88,155 +89,6 @@ } }; -struct GPUFuncOpLowering : ConvertToLLVMPattern { - explicit 
GPUFuncOpLowering(LLVMTypeConverter &typeConverter) - : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(), - typeConverter.getDialect()->getContext(), - typeConverter) {} - - LogicalResult - matchAndRewrite(Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - assert(operands.empty() && "func op is not expected to have operands"); - auto gpuFuncOp = cast(op); - Location loc = gpuFuncOp.getLoc(); - - SmallVector workgroupBuffers; - workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions()); - for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) { - Value attribution = en.value(); - - auto type = attribution.getType().dyn_cast(); - assert(type && type.hasStaticShape() && "unexpected type in attribution"); - - uint64_t numElements = type.getNumElements(); - - auto elementType = typeConverter.convertType(type.getElementType()) - .cast(); - auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements); - std::string name = std::string( - llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index())); - auto globalOp = rewriter.create( - gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false, - LLVM::Linkage::Internal, name, /*value=*/Attribute(), - gpu::GPUDialect::getWorkgroupAddressSpace()); - workgroupBuffers.push_back(globalOp); - } - - // Rewrite the original GPU function to an LLVM function. - auto funcType = typeConverter.convertType(gpuFuncOp.getType()) - .cast() - .getPointerElementTy(); - - // Remap proper input types. - TypeConverter::SignatureConversion signatureConversion( - gpuFuncOp.front().getNumArguments()); - typeConverter.convertFunctionSignature( - gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion); - - // Create the new function operation. Only copy those attributes that are - // not specific to function modeling. 
- SmallVector attributes; - for (const auto &attr : gpuFuncOp.getAttrs()) { - if (attr.first == SymbolTable::getSymbolAttrName() || - attr.first == impl::getTypeAttrName() || - attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName()) - continue; - attributes.push_back(attr); - } - auto llvmFuncOp = rewriter.create( - gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType, - LLVM::Linkage::External, attributes); - - { - // Insert operations that correspond to converted workgroup and private - // memory attributions to the body of the function. This must operate on - // the original function, before the body region is inlined in the new - // function to maintain the relation between block arguments and the - // parent operation that assigns their semantics. - OpBuilder::InsertionGuard guard(rewriter); - - // Rewrite workgroup memory attributions to addresses of global buffers. - rewriter.setInsertionPointToStart(&gpuFuncOp.front()); - unsigned numProperArguments = gpuFuncOp.getNumArguments(); - auto i32Type = LLVM::LLVMType::getInt32Ty(typeConverter.getDialect()); - - Value zero = nullptr; - if (!workgroupBuffers.empty()) - zero = rewriter.create(loc, i32Type, - rewriter.getI32IntegerAttr(0)); - for (auto en : llvm::enumerate(workgroupBuffers)) { - LLVM::GlobalOp global = en.value(); - Value address = rewriter.create(loc, global); - auto elementType = global.getType().getArrayElementType(); - Value memory = rewriter.create( - loc, elementType.getPointerTo(global.addr_space().getZExtValue()), - address, ArrayRef{zero, zero}); - - // Build a memref descriptor pointing to the buffer to plug with the - // existing memref infrastructure. This may use more registers than - // otherwise necessary given that memref sizes are fixed, but we can try - // and canonicalize that away later. 
- Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()]; - auto type = attribution.getType().cast(); - auto descr = MemRefDescriptor::fromStaticShape( - rewriter, loc, typeConverter, type, memory); - signatureConversion.remapInput(numProperArguments + en.index(), descr); - } - - // Rewrite private memory attributions to alloca'ed buffers. - unsigned numWorkgroupAttributions = - gpuFuncOp.getNumWorkgroupAttributions(); - auto int64Ty = LLVM::LLVMType::getInt64Ty(typeConverter.getDialect()); - for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) { - Value attribution = en.value(); - auto type = attribution.getType().cast(); - assert(type && type.hasStaticShape() && - "unexpected type in attribution"); - - // Explicitly drop memory space when lowering private memory - // attributions since NVVM models it as `alloca`s in the default - // memory space and does not support `alloca`s with addrspace(5). - auto ptrType = typeConverter.convertType(type.getElementType()) - .cast() - .getPointerTo(); - Value numElements = rewriter.create( - gpuFuncOp.getLoc(), int64Ty, - rewriter.getI64IntegerAttr(type.getNumElements())); - Value allocated = rewriter.create( - gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0); - auto descr = MemRefDescriptor::fromStaticShape( - rewriter, loc, typeConverter, type, allocated); - signatureConversion.remapInput( - numProperArguments + numWorkgroupAttributions + en.index(), descr); - } - } - - // Move the region to the new function, update the entry block signature. 
- rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(), - llvmFuncOp.end()); - rewriter.applySignatureConversion(&llvmFuncOp.getBody(), - signatureConversion); - - rewriter.eraseOp(gpuFuncOp); - return success(); - } -}; - -struct GPUReturnOpLowering : public ConvertToLLVMPattern { - GPUReturnOpLowering(LLVMTypeConverter &typeConverter) - : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(), - typeConverter.getDialect()->getContext(), - typeConverter) {} - - LogicalResult - matchAndRewrite(Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - rewriter.replaceOpWithNewOp(op, operands); - return success(); - } -}; - /// Import the GPU Ops to NVVM Patterns. #include "GPUToNVVM.cpp.inc" @@ -300,8 +152,11 @@ NVVM::BlockIdYOp, NVVM::BlockIdZOp>, GPUIndexIntrinsicOpLowering, - GPUShuffleOpLowering, GPUFuncOpLowering, GPUReturnOpLowering>( - converter); + GPUShuffleOpLowering, GPUReturnOpLowering, + // Explicitly drop memory space when lowering private memory + // attributions since NVVM models it as `alloca`s in the default + // memory space and does not support `alloca`s with addrspace(5). 
+ GPUFuncOpLowering<0>>(converter); patterns.insert>(converter, "__nv_fabsf", "__nv_fabs"); patterns.insert>(converter, "__nv_ceilf", diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -14,11 +14,16 @@ #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" +#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/GPU/Passes.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" +#include "mlir/Dialect/Vector/VectorOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/Support/FormatVariadic.h" +#include "../GPUCommon/GPUOpsLowering.h" #include "../GPUCommon/IndexIntrinsicsOpLowering.h" #include "../GPUCommon/OpToFuncCallLowering.h" #include "../PassDetail.h" @@ -38,41 +43,25 @@ void runOnOperation() override { gpu::GPUModuleOp m = getOperation(); - OwningRewritePatternList patterns; LLVMTypeConverter converter(m.getContext()); - populateStdToLLVMConversionPatterns(converter, patterns); - patterns.insert< - GPUIndexIntrinsicOpLowering, - GPUIndexIntrinsicOpLowering, - GPUIndexIntrinsicOpLowering, - GPUIndexIntrinsicOpLowering>( - converter); - patterns.insert>(converter, "__ocml_fabs_f32", - "__ocml_fabs_f64"); - patterns.insert>(converter, "__ocml_ceil_f32", - "__ocml_ceil_f64"); - patterns.insert>(converter, "__ocml_cos_f32", - "__ocml_cos_f64"); - patterns.insert>(converter, "__ocml_exp_f32", - "__ocml_exp_f64"); - patterns.insert>(converter, "__ocml_log_f32", - "__ocml_log_f64"); - patterns.insert>( - converter, "__ocml_log10_f32", "__ocml_log10_f64"); - patterns.insert>(converter, "__ocml_log2_f32", - "__ocml_log2_f64"); - patterns.insert>(converter, "__ocml_tanh_f32", - 
"__ocml_tanh_f64"); - ConversionTarget target(getContext()); - target.addLegalDialect(); + OwningRewritePatternList patterns; + + populateGpuRewritePatterns(m.getContext(), patterns); + applyPatternsAndFoldGreedily(m, patterns); + patterns.clear(); + + populateVectorToLLVMConversionPatterns(converter, patterns); + populateStdToLLVMConversionPatterns(converter, patterns); + populateGpuToROCDLConversionPatterns(converter, patterns); + LLVMConversionTarget target(getContext()); + target.addIllegalDialect(); target.addIllegalOp(); target.addIllegalOp(); + target.addLegalDialect(); + // TODO(whchung): Remove once we support replacing non-root ops. + target.addLegalOp(); if (failed(applyPartialConversion(m, target, patterns, &converter))) signalPassFailure(); } @@ -80,6 +69,36 @@ } // anonymous namespace +void mlir::populateGpuToROCDLConversionPatterns( + LLVMTypeConverter &converter, OwningRewritePatternList &patterns) { + patterns.insert< + GPUIndexIntrinsicOpLowering, + GPUIndexIntrinsicOpLowering, + GPUIndexIntrinsicOpLowering, + GPUIndexIntrinsicOpLowering, + GPUFuncOpLowering<5>, GPUReturnOpLowering>(converter); + patterns.insert>(converter, "__ocml_fabs_f32", + "__ocml_fabs_f64"); + patterns.insert>(converter, "__ocml_ceil_f32", + "__ocml_ceil_f64"); + patterns.insert>(converter, "__ocml_cos_f32", + "__ocml_cos_f64"); + patterns.insert>(converter, "__ocml_exp_f32", + "__ocml_exp_f64"); + patterns.insert>(converter, "__ocml_log_f32", + "__ocml_log_f64"); + patterns.insert>(converter, "__ocml_log10_f32", + "__ocml_log10_f64"); + patterns.insert>(converter, "__ocml_log2_f32", + "__ocml_log2_f64"); + patterns.insert>(converter, "__ocml_tanh_f32", + "__ocml_tanh_f64"); +} + std::unique_ptr> mlir::createLowerGpuOpsToROCDLOpsPass() { return std::make_unique(); diff --git a/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir b/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir new file mode 100644 --- /dev/null +++ 
b/mlir/test/Conversion/GPUCommon/memory-attrbution.mlir @@ -0,0 +1,231 @@ +// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck --check-prefix=NVVM %s +// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck --check-prefix=ROCDL %s + +gpu.module @kernel { + // NVVM-LABEL: llvm.func @private + gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) { + // Allocate private memory inside the function. + // NVVM: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64 + // NVVM: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float*"> + + // ROCDL: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64 + // ROCDL: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*"> + + // Populate the memref descriptor. + // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }"> + // NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] + // NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] + // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 + // NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2] + // NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64 + // NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0] + // NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 + // NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0] + + // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(5)*, float addrspace(5)*, i64, [1 x i64], [1 x i64] }"> + // ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] + // ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] + // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 + // ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2] + // ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : 
!llvm.i64 + // ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0] + // ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 + // ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0] + + // "Store" lowering should work just as any other memref, only check that + // we emit some core instructions. + // NVVM: llvm.extractvalue %[[descr6:.*]] + // NVVM: llvm.getelementptr + // NVVM: llvm.store + + // ROCDL: llvm.extractvalue %[[descr6:.*]] + // ROCDL: llvm.getelementptr + // ROCDL: llvm.store + %c0 = constant 0 : index + store %arg0, %arg1[%c0] : memref<4xf32, 5> + + "terminator"() : () -> () + } +} + +// ----- + +gpu.module @kernel { + // Workgroup buffers are allocated as globals. + // NVVM: llvm.mlir.global internal @[[buffer:.*]]() + // NVVM-SAME: addr_space = 3 + // NVVM-SAME: !llvm<"[4 x float]"> + + // ROCDL: llvm.mlir.global internal @[[buffer:.*]]() + // ROCDL-SAME: addr_space = 3 + // ROCDL-SAME: !llvm<"[4 x float]"> + + // NVVM-LABEL: llvm.func @workgroup + // NVVM-SAME: { + + // ROCDL-LABEL: llvm.func @workgroup + // ROCDL-SAME: { + gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, 3>) { + // Get the address of the first element in the global array. + // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32 + // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*"> + // NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]] + // NVVM-SAME: !llvm<"float addrspace(3)*"> + + // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32 + // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*"> + // ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]] + // ROCDL-SAME: !llvm<"float addrspace(3)*"> + + // Populate the memref descriptor. 
+ // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }"> + // NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] + // NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] + // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 + // NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2] + // NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64 + // NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0] + // NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 + // NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0] + + // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }"> + // ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] + // ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] + // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 + // ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2] + // ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64 + // ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0] + // ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 + // ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0] + + // "Store" lowering should work just as any other memref, only check that + // we emit some core instructions. + // NVVM: llvm.extractvalue %[[descr6:.*]] + // NVVM: llvm.getelementptr + // NVVM: llvm.store + + // ROCDL: llvm.extractvalue %[[descr6:.*]] + // ROCDL: llvm.getelementptr + // ROCDL: llvm.store + %c0 = constant 0 : index + store %arg0, %arg1[%c0] : memref<4xf32, 3> + + "terminator"() : () -> () + } +} + +// ----- + +gpu.module @kernel { + // Check that the total size was computed correctly. 
+ // NVVM: llvm.mlir.global internal @[[buffer:.*]]() + // NVVM-SAME: addr_space = 3 + // NVVM-SAME: !llvm<"[48 x float]"> + + // ROCDL: llvm.mlir.global internal @[[buffer:.*]]() + // ROCDL-SAME: addr_space = 3 + // ROCDL-SAME: !llvm<"[48 x float]"> + + // NVVM-LABEL: llvm.func @workgroup3d + // ROCDL-LABEL: llvm.func @workgroup3d + gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, 3>) { + // Get the address of the first element in the global array. + // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32 + // NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*"> + // NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]] + // NVVM-SAME: !llvm<"float addrspace(3)*"> + + // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32 + // ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*"> + // ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]] + // ROCDL-SAME: !llvm<"float addrspace(3)*"> + + // Populate the memref descriptor. 
+ // NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }"> + // NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] + // NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] + // NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 + // NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2] + // NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64 + // NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0] + // NVVM: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64 + // NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0] + // NVVM: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64 + // NVVM: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1] + // NVVM: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64 + // NVVM: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1] + // NVVM: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64 + // NVVM: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2] + // NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 + // NVVM: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2] + + // ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }"> + // ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] + // ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] + // ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 + // ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2] + // ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64 + // ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0] + // ROCDL: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64 + // ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0] + // ROCDL: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64 + // 
ROCDL: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1] + // ROCDL: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64 + // ROCDL: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1] + // ROCDL: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64 + // ROCDL: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2] + // ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 + // ROCDL: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2] + + %c0 = constant 0 : index + store %arg0, %arg1[%c0,%c0,%c0] : memref<4x2x6xf32, 3> + "terminator"() : () -> () + } +} + +// ----- + +gpu.module @kernel { + // Check that several buffers are defined. + // NVVM: llvm.mlir.global internal @[[buffer1:.*]]() + // NVVM-SAME: !llvm<"[1 x float]"> + // NVVM: llvm.mlir.global internal @[[buffer2:.*]]() + // NVVM-SAME: !llvm<"[2 x float]"> + + // ROCDL: llvm.mlir.global internal @[[buffer1:.*]]() + // ROCDL-SAME: !llvm<"[1 x float]"> + // ROCDL: llvm.mlir.global internal @[[buffer2:.*]]() + // ROCDL-SAME: !llvm<"[2 x float]"> + + // NVVM-LABEL: llvm.func @multiple + // ROCDL-LABEL: llvm.func @multiple + gpu.func @multiple(%arg0: f32) + workgroup(%arg1: memref<1xf32, 3>, %arg2: memref<2xf32, 3>) + private(%arg3: memref<3xf32, 5>, %arg4: memref<4xf32, 5>) { + + // Workgroup buffers. + // NVVM: llvm.mlir.addressof @[[buffer1]] + // NVVM: llvm.mlir.addressof @[[buffer2]] + + // ROCDL: llvm.mlir.addressof @[[buffer1]] + // ROCDL: llvm.mlir.addressof @[[buffer2]] + + // Private buffers. 
+ // NVVM: %[[c3:.*]] = llvm.mlir.constant(3 : i64) + // NVVM: llvm.alloca %[[c3]] x !llvm.float : (!llvm.i64) -> !llvm<"float*"> + // NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : i64) + // NVVM: llvm.alloca %[[c4]] x !llvm.float : (!llvm.i64) -> !llvm<"float*"> + + // ROCDL: %[[c3:.*]] = llvm.mlir.constant(3 : i64) + // ROCDL: llvm.alloca %[[c3]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*"> + // ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : i64) + // ROCDL: llvm.alloca %[[c4]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*"> + + %c0 = constant 0 : index + store %arg0, %arg1[%c0] : memref<1xf32, 3> + store %arg0, %arg2[%c0] : memref<2xf32, 3> + store %arg0, %arg3[%c0] : memref<3xf32, 5> + store %arg0, %arg4[%c0] : memref<4xf32, 5> + "terminator"() : () -> () + } +} diff --git a/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir b/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir deleted file mode 100644 --- a/mlir/test/Conversion/GPUToNVVM/memory-attrbution.mlir +++ /dev/null @@ -1,145 +0,0 @@ -// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s - -gpu.module @kernel { - // CHECK-LABEL: llvm.func @private - gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) { - // Allocate private memory inside the function. - // CHECK: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64 - // CHECK: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float*"> - - // Populate the memref descriptor. 
- // CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }"> - // CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] - // CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] - // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 - // CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2] - // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64 - // CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0] - // CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 - // CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0] - - // "Store" lowering should work just as any other memref, only check that - // we emit some core instructions. - // CHECK: llvm.extractvalue %[[descr6:.*]] - // CHECK: llvm.getelementptr - // CHECK: llvm.store - %c0 = constant 0 : index - store %arg0, %arg1[%c0] : memref<4xf32, 5> - - "terminator"() : () -> () - } -} - -// ----- - -gpu.module @kernel { - // Workgroup buffers are allocated as globals. - // CHECK: llvm.mlir.global internal @[[buffer:.*]]() - // CHECK-SAME: addr_space = 3 - // CHECK-SAME: !llvm<"[4 x float]"> - - // CHECK-LABEL: llvm.func @workgroup - // CHECK-SAME: { - gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, 3>) { - // Get the address of the first element in the global array. - // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32 - // CHECK: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*"> - // CHECK: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]] - // CHECK-SAME: !llvm<"float addrspace(3)*"> - - // Populate the memref descriptor. 
- // CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }"> - // CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] - // CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] - // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 - // CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2] - // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64 - // CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0] - // CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 - // CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0] - - // "Store" lowering should work just as any other memref, only check that - // we emit some core instructions. - // CHECK: llvm.extractvalue %[[descr6:.*]] - // CHECK: llvm.getelementptr - // CHECK: llvm.store - %c0 = constant 0 : index - store %arg0, %arg1[%c0] : memref<4xf32, 3> - - "terminator"() : () -> () - } -} - -// ----- - -gpu.module @kernel { - // Check that the total size was computed correctly. - // CHECK: llvm.mlir.global internal @[[buffer:.*]]() - // CHECK-SAME: addr_space = 3 - // CHECK-SAME: !llvm<"[48 x float]"> - - // CHECK-LABEL: llvm.func @workgroup3d - gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, 3>) { - // Get the address of the first element in the global array. - // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32 - // CHECK: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*"> - // CHECK: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]] - // CHECK-SAME: !llvm<"float addrspace(3)*"> - - // Populate the memref descriptor. 
- // CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }"> - // CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] - // CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] - // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 - // CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2] - // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64 - // CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0] - // CHECK: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64 - // CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0] - // CHECK: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64 - // CHECK: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1] - // CHECK: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64 - // CHECK: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1] - // CHECK: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64 - // CHECK: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2] - // CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 - // CHECK: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2] - - %c0 = constant 0 : index - store %arg0, %arg1[%c0,%c0,%c0] : memref<4x2x6xf32, 3> - "terminator"() : () -> () - } -} - -// ----- - -gpu.module @kernel { - // Check that several buffers are defined. - // CHECK: llvm.mlir.global internal @[[buffer1:.*]]() - // CHECK-SAME: !llvm<"[1 x float]"> - // CHECK: llvm.mlir.global internal @[[buffer2:.*]]() - // CHECK-SAME: !llvm<"[2 x float]"> - - // CHECK-LABEL: llvm.func @multiple - gpu.func @multiple(%arg0: f32) - workgroup(%arg1: memref<1xf32, 3>, %arg2: memref<2xf32, 3>) - private(%arg3: memref<3xf32, 5>, %arg4: memref<4xf32, 5>) { - - // Workgroup buffers. - // CHECK: llvm.mlir.addressof @[[buffer1]] - // CHECK: llvm.mlir.addressof @[[buffer2]] - - // Private buffers. 
- // CHECK: %[[c3:.*]] = llvm.mlir.constant(3 : i64) - // CHECK: llvm.alloca %[[c3]] x !llvm.float - // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : i64) - // CHECK: llvm.alloca %[[c4]] x !llvm.float - - %c0 = constant 0 : index - store %arg0, %arg1[%c0] : memref<1xf32, 3> - store %arg0, %arg2[%c0] : memref<2xf32, 3> - store %arg0, %arg3[%c0] : memref<3xf32, 5> - store %arg0, %arg4[%c0] : memref<4xf32, 5> - "terminator"() : () -> () - } -} diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -1,9 +1,10 @@ -// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s --dump-input-on-failure -gpu.module @kernel_module { +gpu.module @test_module { // CHECK-LABEL: func @gpu_index_ops() func @gpu_index_ops() - attributes { gpu.kernel } { + -> (index, index, index, index, index, index, + index, index, index, index, index, index) { // CHECK: rocdl.workitem.id.x : !llvm.i32 %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) // CHECK: rocdl.workitem.id.y : !llvm.i32 @@ -32,68 +33,71 @@ // CHECK: rocdl.grid.dim.z : !llvm.i32 %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) - std.return + std.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ, + %bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ + : index, index, index, index, index, index, + index, index, index, index, index, index } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_fabs_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_fabs_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_fabs - func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.absf %arg_f32 : f32 // CHECK: llvm.call 
@__ocml_fabs_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.absf %arg_f64 : f64 // CHECK: llvm.call @__ocml_fabs_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_ceil_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_ceil_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_ceil - func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.ceilf %arg_f32 : f32 // CHECK: llvm.call @__ocml_ceil_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.ceilf %arg_f64 : f64 // CHECK: llvm.call @__ocml_ceil_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_cos_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_cos_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_cos - func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.cos %arg_f32 : f32 // CHECK: llvm.call @__ocml_cos_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.cos %arg_f64 : f64 // CHECK: llvm.call @__ocml_cos_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_exp_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_exp_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_exp - func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %exp_f32 = std.exp %arg_f32 : f32 // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float - %result_f32 = std.exp %exp_f32 : f32 + %result32 = 
std.exp %exp_f32 : f32 // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.exp %arg_f64 : f64 // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } @@ -101,20 +105,20 @@ // ----- // Test that we handled properly operation with SymbolTable other than module op -gpu.module @kernel_module { +gpu.module @test_module { "test.symbol_scope"() ({ // CHECK: test.symbol_scope // CHECK: llvm.func @__ocml_exp_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_exp_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_exp - func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %exp_f32 = std.exp %arg_f32 : f32 // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float - %result_f32 = std.exp %exp_f32 : f32 + %result32 = std.exp %exp_f32 : f32 // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.exp %arg_f64 : f64 // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } "test.finish" () : () -> () }) : () -> () @@ -122,60 +126,60 @@ // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_log_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_log_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_log - func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.log %arg_f32 : f32 // CHECK: llvm.call @__ocml_log_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.log %arg_f64 : f64 // CHECK: llvm.call @__ocml_log_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func 
@__ocml_log10_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_log10_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_log10 - func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.log10 %arg_f32 : f32 // CHECK: llvm.call @__ocml_log10_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.log10 %arg_f64 : f64 // CHECK: llvm.call @__ocml_log10_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_log2_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_log2_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_log2 - func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.log2 %arg_f32 : f32 // CHECK: llvm.call @__ocml_log2_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.log2 %arg_f64 : f64 // CHECK: llvm.call @__ocml_log2_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_tanh_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_tanh_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_tanh - func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.tanh %arg_f32 : f32 // CHECK: llvm.call @__ocml_tanh_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.tanh %arg_f64 : f64 // CHECK: llvm.call @__ocml_tanh_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } }