diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -30,10 +30,12 @@
 class ModuleOp;
 class Operation;
 class RewritePatternSet;
+class TypeConverter;
 class Pass;
 
 namespace gpu {
+enum class AddressSpace : uint32_t;
 class GPUModuleOp;
 } // namespace gpu
 
@@ -69,6 +71,13 @@
     StringRef gpuBinaryAnnotation = {},
     bool kernelBarePtrCallConv = false);
 
+/// A function that maps a MemorySpace enum to a target-specific integer value.
+using MemorySpaceMapping = std::function<unsigned(gpu::AddressSpace)>;
+
+/// Populates memory space attribute conversion rules for lowering
+/// gpu.address_space to integer values.
+void populateGpuMemorySpaceAttributeConversions(
+    TypeConverter &typeConverter, const MemorySpaceMapping &mapping);
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
diff --git a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
--- a/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/GPU/TransformOps/GPUTransformOps.td
@@ -14,6 +14,61 @@
 include "mlir/Interfaces/SideEffectInterfaces.td"
 include "mlir/IR/OpBase.td"
 
+//===----------------------------------------------------------------------===//
+// Apply...ConversionPatternsOp
+//===----------------------------------------------------------------------===//
+
+def ApplyGPUToNVVMConversionPatternsOp : Op<Transform_Dialect,
+    "apply_conversion_patterns.gpu.gpu_to_nvvm",
+    [DeclareOpInterfaceMethods<ConversionPatternDescriptorOpInterface,
+                               ["verifyTypeConverter"]>]> {
+  let description = [{
+    Collects patterns that convert GPU dialect ops to NVVM dialect ops. These
+    patterns require an "LLVMTypeConverter".
+  }];
+  let assemblyFormat = "attr-dict";
+}
+
+def ApplyGPUWwmaToNVVMConversionPatternsOp : Op<Transform_Dialect,
+    "apply_conversion_patterns.gpu.gpu_wmma_to_nvvm",
+    [DeclareOpInterfaceMethods<ConversionPatternDescriptorOpInterface,
+                               ["verifyTypeConverter"]>]> {
+  let description = [{
+    Collects patterns that convert GPU dialect ops related to wmma ops
+    to NVVM dialect ops.
+    These patterns require an "LLVMTypeConverter".
+  }];
+  let assemblyFormat = "attr-dict";
+}
+
+def ApplyGPUSubgroupReduceToNVVMConversionPatternsOp : Op<Transform_Dialect,
+    "apply_conversion_patterns.gpu.gpu_subgroup_reduce_to_nvvm",
+    [DeclareOpInterfaceMethods<ConversionPatternDescriptorOpInterface,
+                               ["verifyTypeConverter"]>]> {
+  let description = [{
+    Collects patterns that convert GPU dialect subgroup reduce ops
+    to NVVM dialect ops.
+    These patterns require an "LLVMTypeConverter".
+  }];
+  let assemblyFormat = "attr-dict";
+}
+
+//===----------------------------------------------------------------------===//
+// Apply...PatternsOp
+//===----------------------------------------------------------------------===//
+
+def ApplyGPURewritePatternsOp : Op<Transform_Dialect,
+    "apply_patterns.gpu.gpu_rewrite_patterns",
+    [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
+  let description = [{
+    Collects GPU rewrite patterns comprising:
+      1. GpuAllReduceRewrite patterns
+      2. GpuGlobalIdRewriter patterns
+      3. GpuShuffleRewriter patterns
+  }];
+  let assemblyFormat = "attr-dict";
+}
+
 def ApplyUnrollVectorsSubgroupMmaOp : Op<Transform_Dialect,
     "apply_patterns.gpu.unroll_vectors_subgroup_mma",
     [DeclareOpInterfaceMethods<PatternDescriptorOpInterface>]> {
diff --git a/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td b/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td
--- a/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td
+++ b/mlir/include/mlir/Dialect/NVGPU/TransformOps/NVGPUTransformOps.td
@@ -16,7 +16,7 @@
 include "mlir/Interfaces/SideEffectInterfaces.td"
 
 //===----------------------------------------------------------------------===//
-// ApplyNVGPUToNVVMConversionPatternsOp
+// Apply...ConversionPatternsOp
 //===----------------------------------------------------------------------===//
 
 def ApplyNVGPUToNVVMConversionPatternsOp : Op<Transform_Dialect,
diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/include/mlir/Conversion/GPUCommon/GPUOpsLowering.h
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUOpsLowering.h
@@ ... @@
                        getTypeConverter());
   }
 };
-
-/// A function that maps a MemorySpace enum to a target-specific integer value.
-using MemorySpaceMapping =
-    std::function<unsigned(gpu::AddressSpace)>;
-
-/// Populates memory space attribute conversion rules for lowering
-/// gpu.address_space to integer values.
-void populateGpuMemorySpaceAttributeConversions(
-    TypeConverter &typeConverter, const MemorySpaceMapping &mapping);
 } // namespace mlir
 
 #endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "GPUOpsLowering.h"
+
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/Builders.h"
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -16,6 +16,7 @@
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
 #include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h"
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
 #include "mlir/Conversion/LLVMCommon/TypeConverter.h"
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h"
 #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h"
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h"
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
 #include "mlir/Conversion/LLVMCommon/ConversionTarget.h"
 #include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
@@ -24,13 +25,13 @@
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
 #include "mlir/Dialect/ControlFlow/IR/ControlFlow.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/Math/IR/Math.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/Pass/Pass.h"
diff --git a/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt b/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt
--- a/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/TransformOps/CMakeLists.txt
@@ -20,4 +20,8 @@
   MLIRTransformDialect
   MLIRVectorDialect
   MLIRVectorTransforms
+
+  # ConversionPatterns
+  MLIRNVGPUToNVVM
+  MLIRGPUToNVVMTransforms
 )
diff --git a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
--- a/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
+++ b/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp
@@ -8,11 +8,16 @@
 
 #include "mlir/Dialect/GPU/TransformOps/GPUTransformOps.h"
 
+#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
"mlir/Conversion/GPUCommon/GPUCommonPass.h" +#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" +#include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Dialect/GPU/TransformOps/Utils.h" +#include "mlir/Dialect/GPU/Transforms/Passes.h" +#include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/SCF/IR/DeviceMappingInterface.h" #include "mlir/Dialect/SCF/IR/SCF.h" @@ -29,6 +34,7 @@ #include "mlir/IR/OpDefinition.h" #include "mlir/IR/Visitors.h" #include "mlir/Support/LLVM.h" +#include "mlir/Transforms/DialectConversion.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/TypeSwitch.h" @@ -47,6 +53,85 @@ #define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") #define DBGS_ALIAS() (llvm::dbgs() << '[' << DEBUG_TYPE_ALIAS << "] ") +//===----------------------------------------------------------------------===// +// Apply...ConversionPatternsOp +//===----------------------------------------------------------------------===// + +void transform::ApplyGPUToNVVMConversionPatternsOp::populatePatterns( + TypeConverter &typeConverter, RewritePatternSet &patterns) { + auto &llvmTypeConverter = static_cast(typeConverter); + // NVVM uses alloca in the default address space to represent private + // memory allocations, so drop private annotations. NVVM uses address + // space 3 for shared memory. NVVM uses the default address space to + // represent global memory. + // Used in populateGpuToNVVMConversionPatternsso attaching here for now. + // TODO: We should have a single to_nvvm_type_converter. + populateGpuMemorySpaceAttributeConversions( + llvmTypeConverter, [](AddressSpace space) -> unsigned { + switch (space) { + case AddressSpace::Global: + return static_cast( + NVVM::NVVMMemorySpace::kGlobalMemorySpace); + case AddressSpace::Workgroup: + return static_cast( + NVVM::NVVMMemorySpace::kSharedMemorySpace); + case AddressSpace::Private: + return 0; + } + llvm_unreachable("unknown address space enum value"); + return 0; + }); + // Used in GPUToNVVM/WmmaOpsToNvvm.cpp so attaching here for now. + // TODO: We should have a single to_nvvm_type_converter. 
+  llvmTypeConverter.addConversion(
+      [&](MMAMatrixType type) -> Type { return convertMMAToLLVMType(type); });
+  populateGpuToNVVMConversionPatterns(llvmTypeConverter, patterns);
+}
+
+LogicalResult
+transform::ApplyGPUToNVVMConversionPatternsOp::verifyTypeConverter(
+    transform::TypeConverterBuilderOpInterface builder) {
+  if (builder.getTypeConverterType() != "LLVMTypeConverter")
+    return emitOpError("expected LLVMTypeConverter");
+  return success();
+}
+
+void transform::ApplyGPUWwmaToNVVMConversionPatternsOp::populatePatterns(
+    TypeConverter &typeConverter, RewritePatternSet &patterns) {
+  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
+  populateGpuWMMAToNVVMConversionPatterns(llvmTypeConverter, patterns);
+}
+
+LogicalResult
+transform::ApplyGPUWwmaToNVVMConversionPatternsOp::verifyTypeConverter(
+    transform::TypeConverterBuilderOpInterface builder) {
+  if (builder.getTypeConverterType() != "LLVMTypeConverter")
+    return emitOpError("expected LLVMTypeConverter");
+  return success();
+}
+
+void transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
+    populatePatterns(TypeConverter &typeConverter,
+                     RewritePatternSet &patterns) {
+  auto &llvmTypeConverter = static_cast<LLVMTypeConverter &>(typeConverter);
+  populateGpuSubgroupReduceOpLoweringPattern(llvmTypeConverter, patterns);
+}
+
+LogicalResult transform::ApplyGPUSubgroupReduceToNVVMConversionPatternsOp::
+    verifyTypeConverter(transform::TypeConverterBuilderOpInterface builder) {
+  if (builder.getTypeConverterType() != "LLVMTypeConverter")
+    return emitOpError("expected LLVMTypeConverter");
+  return success();
+}
+
+//===----------------------------------------------------------------------===//
+// Apply...PatternsOp
+//===----------------------------------------------------------------------===//
+
+void transform::ApplyGPURewritePatternsOp::populatePatterns(
+    RewritePatternSet &patterns) {
+  populateGpuRewritePatterns(patterns);
+}
+
 //===----------------------------------------------------------------------===//
 // ApplyUnrollVectorsSubgroupMmaOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp b/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp
--- a/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp
+++ b/mlir/lib/Dialect/GPU/Transforms/AllReduceLowering.cpp
@@ -387,8 +387,8 @@
   static constexpr int kSubgroupSize = 32;
 };
 
-struct GpuAllReduceConversion : public RewritePattern {
-  explicit GpuAllReduceConversion(MLIRContext *context)
+struct GpuAllReduceRewrite : public RewritePattern {
+  explicit GpuAllReduceRewrite(MLIRContext *context)
       : RewritePattern(gpu::GPUFuncOp::getOperationName(), 1, context) {}
 
   LogicalResult matchAndRewrite(Operation *op,
@@ -417,5 +417,5 @@
 } // namespace
 
 void mlir::populateGpuAllReducePatterns(RewritePatternSet &patterns) {
-  patterns.add<GpuAllReduceConversion>(patterns.getContext());
+  patterns.add<GpuAllReduceRewrite>(patterns.getContext());
 }
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-32b.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-32b.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm-32b.mlir
@@ -0,0 +1,77 @@
+// RUN: mlir-opt %s -convert-gpu-to-nvvm='index-bitwidth=32 use-opaque-pointers=1' -split-input-file | FileCheck %s
+
+// RUN: mlir-opt %s -test-transform-dialect-interpreter | FileCheck %s
+
+gpu.module @test_module_0 {
+  // CHECK-LABEL: func @gpu_index_ops()
+  func.func @gpu_index_ops()
+      -> (index, index, index, index, index, index,
+          index, index, index, index, index, index,
+          index) {
+    %tIdX = gpu.thread_id x
+    %tIdY = gpu.thread_id y
+    %tIdZ = gpu.thread_id z
+
+    %bDimX = gpu.block_dim x
+    %bDimY = gpu.block_dim y
+    %bDimZ = gpu.block_dim z
+
+    %bIdX = gpu.block_id x
+    %bIdY = gpu.block_id y
+    %bIdZ = gpu.block_id z
+
+    %gDimX = gpu.grid_dim x
+    %gDimY = gpu.grid_dim y
+    %gDimZ = gpu.grid_dim z
+
+    // CHECK-NOT: = llvm.sext %{{.*}} : i32 to i64
+    %laneId = gpu.lane_id
+
+    func.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
+                %bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ,
+                %laneId
+        : index, index, index, index, index, index,
+          index, index, index, index, index, index,
+          index
+  }
+}
+
+
+
+gpu.module @test_module_1 {
+  // CHECK-LABEL: func @gpu_index_comp
+  func.func @gpu_index_comp(%idx : index) -> index {
+    // CHECK: = llvm.add %{{.*}}, %{{.*}} : i32
+    %0 = arith.addi %idx, %idx : index
+    // CHECK: llvm.return %{{.*}} : i32
+    func.return %0 : index
+  }
+}
+
+transform.sequence failures(propagate) {
+^bb1(%toplevel_module: !transform.any_op):
+  %gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module
+    : (!transform.any_op) -> !transform.any_op
+  transform.apply_conversion_patterns to %gpu_module {
+    transform.apply_conversion_patterns.dialect_to_llvm "arith"
+    transform.apply_conversion_patterns.dialect_to_llvm "cf"
+    transform.apply_conversion_patterns.vector.vector_to_llvm
+    transform.apply_conversion_patterns.func.func_to_llvm
+    transform.apply_conversion_patterns.dialect_to_llvm "memref"
+    transform.apply_conversion_patterns.gpu.gpu_to_nvvm
+    transform.apply_conversion_patterns.gpu.gpu_wmma_to_nvvm
+    transform.apply_conversion_patterns.gpu.gpu_subgroup_reduce_to_nvvm {has_redux = true}
+    transform.apply_conversion_patterns.nvgpu.nvgpu_to_nvvm
+  } with type_converter {
+    transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter
+      {index_bitwidth = 32, use_opaque_pointers = true}
+  } {
+    legal_dialects = ["llvm", "memref", "nvvm"],
+    legal_ops = ["func.func", "gpu.module", "gpu.module_end", "gpu.yield"],
+    illegal_dialects = ["gpu"],
+    illegal_ops = ["llvm.cos", "llvm.exp", "llvm.exp2", "llvm.fabs", "llvm.fceil",
+                   "llvm.ffloor", "llvm.log", "llvm.log10", "llvm.log2", "llvm.pow",
+                   "llvm.sin", "llvm.sqrt"],
+    partial_conversion
+  } : !transform.any_op
+}
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -1,14 +1,12 @@
 // RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 use-opaque-pointers=1' -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -convert-gpu-to-nvvm='has-redux=1 index-bitwidth=32 use-opaque-pointers=1' -split-input-file | FileCheck --check-prefix=CHECK32 %s
+// RUN: mlir-opt %s -test-transform-dialect-interpreter | FileCheck %s
 
-gpu.module @test_module {
+gpu.module @test_module_0 {
   // CHECK-LABEL: func @gpu_index_ops()
-  // CHECK32-LABEL: func @gpu_index_ops()
   func.func @gpu_index_ops()
       -> (index, index, index, index, index, index,
           index, index, index, index, index, index,
           index) {
-    // CHECK32-NOT: = llvm.sext %{{.*}} : i32 to i64
 
     // CHECK: = nvvm.read.ptx.sreg.tid.x : i32
     // CHECK: = llvm.sext %{{.*}} : i32 to i64
@@ -64,24 +62,21 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_1 {
   // CHECK-LABEL: func @gpu_index_comp
-  // CHECK32-LABEL: func @gpu_index_comp
   func.func @gpu_index_comp(%idx : index) -> index {
     // CHECK: = llvm.add %{{.*}}, %{{.*}} : i64
-    // CHECK32: = llvm.add %{{.*}}, %{{.*}} : i32
     %0 = arith.addi %idx, %idx : index
     // CHECK: llvm.return %{{.*}} : i64
-    // CHECK32: llvm.return %{{.*}} : i32
     func.return %0 : index
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_2 {
   // CHECK-LABEL: func @gpu_all_reduce_op()
   gpu.func @gpu_all_reduce_op() {
     %arg0 = arith.constant 1.0 : f32
@@ -95,9 +90,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_3 {
   // CHECK-LABEL: func @gpu_all_reduce_region()
   gpu.func @gpu_all_reduce_region() {
     %arg0 = arith.constant 1 : i32
@@ -113,9 +108,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_4 {
   // CHECK-LABEL: func @gpu_shuffle()
   func.func @gpu_shuffle() -> (f32, f32, f32, f32) {
     // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32
@@ -152,9 +147,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_5 {
   // CHECK-LABEL: func @gpu_sync()
   func.func @gpu_sync() {
     // CHECK: nvvm.barrier0
@@ -163,9 +158,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_6 {
   // CHECK: llvm.func @__nv_fabsf(f32) -> f32
   // CHECK: llvm.func @__nv_fabs(f64) -> f64
   // CHECK-LABEL: func @gpu_fabs
@@ -178,9 +173,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_7 {
   // CHECK: llvm.func @__nv_cbrtf(f32) -> f32
   // CHECK: llvm.func @__nv_cbrt(f64) -> f64
   // CHECK-LABEL: func @gpu_cbrt
@@ -193,9 +188,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_8 {
   // CHECK: llvm.func @__nv_ceilf(f32) -> f32
   // CHECK: llvm.func @__nv_ceil(f64) -> f64
   // CHECK-LABEL: func @gpu_ceil
@@ -208,9 +203,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_9 {
   // CHECK: llvm.func @__nv_floorf(f32) -> f32
   // CHECK: llvm.func @__nv_floor(f64) -> f64
   // CHECK-LABEL: func @gpu_floor
@@ -223,9 +218,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_10 {
   // CHECK: llvm.func @__nv_cosf(f32) -> f32
   // CHECK: llvm.func @__nv_cos(f64) -> f64
   // CHECK-LABEL: func @gpu_cos
@@ -238,8 +233,8 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_11 {
   // CHECK: llvm.func @__nv_expf(f32) -> f32
   // CHECK: llvm.func @__nv_exp(f64) -> f64
   // CHECK-LABEL: func @gpu_exp
@@ -252,8 +247,8 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_12 {
   // CHECK: llvm.func @__nv_exp2f(f32) -> f32
   // CHECK: llvm.func @__nv_exp2(f64) -> f64
   // CHECK-LABEL: func @gpu_exp2
@@ -266,9 +261,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_13 {
   // CHECK: llvm.func @__nv_logf(f32) -> f32
   // CHECK: llvm.func @__nv_log(f64) -> f64
   // CHECK-LABEL: func @gpu_log
@@ -281,9 +276,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_14 {
   // CHECK: llvm.func @__nv_log10f(f32) -> f32
   // CHECK: llvm.func @__nv_log10(f64) -> f64
   // CHECK-LABEL: func @gpu_log10
@@ -296,9 +291,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_15 {
   // CHECK: llvm.func @__nv_log1pf(f32) -> f32
   // CHECK: llvm.func @__nv_log1p(f64) -> f64
   // CHECK-LABEL: func @gpu_log1p
@@ -311,9 +306,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_16 {
   // CHECK: llvm.func @__nv_log2f(f32) -> f32
   // CHECK: llvm.func @__nv_log2(f64) -> f64
   // CHECK-LABEL: func @gpu_log2
@@ -326,9 +321,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_17 {
   // CHECK: llvm.func @__nv_sinf(f32) -> f32
   // CHECK: llvm.func @__nv_sin(f64) -> f64
   // CHECK-LABEL: func @gpu_sin
@@ -341,9 +336,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_18 {
   // CHECK: llvm.func @__nv_tanf(f32) -> f32
   // CHECK: llvm.func @__nv_tan(f64) -> f64
   // CHECK-LABEL: func @gpu_tan
@@ -360,9 +355,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_19 {
   // CHECK: llvm.func @__nv_tanhf(f32) -> f32
   // CHECK: llvm.func @__nv_tanh(f64) -> f64
   // CHECK-LABEL: func @gpu_tanh
@@ -379,9 +374,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_20 {
   // CHECK: llvm.func @__nv_rsqrtf(f32) -> f32
   // CHECK: llvm.func @__nv_rsqrt(f64) -> f64
   // CHECK-LABEL: func @gpu_rsqrt
@@ -399,9 +394,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_21 {
   // CHECK: llvm.func @__nv_sqrtf(f32) -> f32
   // CHECK: llvm.func @__nv_sqrt(f64) -> f64
   // CHECK-LABEL: func @gpu_sqrt
@@ -419,9 +414,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_22 {
   // CHECK: llvm.func @__nv_atanf(f32) -> f32
   // CHECK: llvm.func @__nv_atan(f64) -> f64
   // CHECK-LABEL: func @gpu_atan
@@ -439,9 +434,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_23 {
   // CHECK: llvm.func @__nv_atan2f(f32, f32) -> f32
   // CHECK: llvm.func @__nv_atan2(f64, f64) -> f64
   // CHECK-LABEL: func @gpu_atan2
@@ -460,10 +455,10 @@
   }
 }
 
-// -----
+
 
 // Test that we handled properly operation with SymbolTable other than module op
-gpu.module @test_module {
+gpu.module @test_module_24 {
   "test.symbol_scope"() ({
   // CHECK: test.symbol_scope
   // CHECK: llvm.func @__nv_expf(f32) -> f32
@@ -480,9 +475,9 @@
   }) : () -> ()
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_25 {
   // CHECK: llvm.func @__nv_expm1f(f32) -> f32
   // CHECK: llvm.func @__nv_expm1(f64) -> f64
   // CHECK-LABEL: func @gpu_expm1
@@ -495,9 +490,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_26 {
   // CHECK: llvm.func @__nv_powf(f32, f32) -> f32
   // CHECK: llvm.func @__nv_pow(f64, f64) -> f64
   // CHECK-LABEL: func @gpu_pow
@@ -510,9 +505,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_27 {
   // CHECK-LABEL: func @gpu_unroll
   func.func @gpu_unroll(%arg0 : vector<4xf32>) -> vector<4xf32> {
     %result = math.exp %arg0 : vector<4xf32>
@@ -530,9 +525,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_28 {
   // CHECK-LABEL: @kernel_func
   // CHECK: attributes
   // CHECK: gpu.kernel
@@ -542,9 +537,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_29 {
   // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL0:[A-Za-z0-9_]+]]("Hello, world\0A\00")
   // CHECK-DAG: llvm.mlir.global internal constant @[[$PRINT_GLOBAL1:[A-Za-z0-9_]+]]("Hello: %d\0A\00")
   // CHECK-DAG: llvm.func @vprintf(!llvm.ptr, !llvm.ptr) -> i32
@@ -580,9 +575,9 @@
   }
 }
 
-// -----
-gpu.module @test_module {
+
+gpu.module @test_module_30 {
   // CHECK-LABEL: func @subgroup_reduce_add
   gpu.func @subgroup_reduce_add(%arg0 : i32) {
     // CHECK: nvvm.redux.sync add {{.*}}
@@ -621,3 +616,38 @@
   }
 }
 
+transform.sequence failures(propagate) {
+^bb1(%toplevel_module: !transform.any_op):
+  %gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module
+    : (!transform.any_op) -> !transform.any_op
+
+  transform.apply_patterns to %gpu_module {
+    transform.apply_patterns.gpu.gpu_rewrite_patterns
+  } : !transform.any_op
+
+  transform.apply_conversion_patterns to %gpu_module {
+    transform.apply_conversion_patterns.dialect_to_llvm "arith"
+    transform.apply_conversion_patterns.dialect_to_llvm "cf"
+    transform.apply_conversion_patterns.vector.vector_to_llvm
+    transform.apply_conversion_patterns.func.func_to_llvm
+    transform.apply_conversion_patterns.dialect_to_llvm "memref"
+    transform.apply_conversion_patterns.gpu.gpu_to_nvvm
+    transform.apply_conversion_patterns.gpu.gpu_wmma_to_nvvm
+    transform.apply_conversion_patterns.gpu.gpu_subgroup_reduce_to_nvvm
+    transform.apply_conversion_patterns.nvgpu.nvgpu_to_nvvm
+  } with type_converter {
+    transform.apply_conversion_patterns.memref.memref_to_llvm_type_converter
+      {index_bitwidth = 64,
+       use_bare_ptr = true,
+       use_bare_ptr_memref_call_conv = true,
+       use_opaque_pointers = true}
+  } {
+    legal_dialects = ["llvm", "memref", "nvvm", "test"],
+    legal_ops = ["func.func", "gpu.module", "gpu.module_end", "gpu.yield"],
+    illegal_dialects = ["gpu"],
+    illegal_ops = ["llvm.cos", "llvm.exp", "llvm.exp2", "llvm.fabs", "llvm.fceil",
+                   "llvm.ffloor", "llvm.log", "llvm.log10", "llvm.log2", "llvm.pow",
+                   "llvm.sin", "llvm.sqrt"],
+    partial_conversion
+  } : !transform.any_op
+}