This is an archive of the discontinued LLVM Phabricator instance.

[mlir][cuda] Add a test-lower-to-nvvm catchall passpipeline.
ClosedPublic

Authored by nicolasvasilache on Jul 17 2023, 7:17 AM.

Details

Summary

This mirrors the test-lower-to-llvm pass pipeline, which provides some sanity when running e2e examples.

One peculiarity of the GPU pipeline is that we want to allow 32b indexing in kernels.
This is currently not straightforward as there are dependencies between passes.
This new test pass orders passes in a way that connects end-to-end.
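
As an aside for readers of this archive, here is a minimal sketch of the shape of such a pipeline, using upstream MLIR pass-creation functions from around the time of this revision; the exact spelling of the 32b index-bitwidth option is an assumption, and the committed TestLowerToNVVM.cpp is authoritative.

  #include "mlir/Conversion/Passes.h"
  #include "mlir/Dialect/GPU/IR/GPUDialect.h"
  #include "mlir/Dialect/GPU/Transforms/Passes.h"
  #include "mlir/Pass/PassManager.h"

  using namespace mlir;

  // Sketch only, not the committed TestLowerToNVVM.cpp.
  static void buildTestLowerToNVVMSketch(OpPassManager &pm) {
    // Host side: outline gpu.launch bodies into gpu.module kernels first,
    // so kernel and host lowering can be configured independently.
    pm.addPass(createGpuKernelOutliningPass());
    // Kernel side: lower GPU ops to NVVM with 32b in-kernel indexing
    // (option spelling assumed).
    pm.addNestedPass<gpu::GPUModuleOp>(
        createLowerGpuOpsToNVVMOpsPass(/*indexBitwidth=*/32));
    // Host side: lower launches and memory management to runtime calls,
    // then clean up leftover casts between lowered and unlowered types.
    pm.addPass(createGpuToLLVMConversionPass());
    pm.addPass(createReconcileUnrealizedCastsPass());
  }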

Event Timeline

Herald added a project: Restricted Project.
nicolasvasilache requested review of this revision. Jul 17 2023, 7:17 AM
ftynse accepted this revision. Jul 17 2023, 7:27 AM
ftynse added inline comments.
mlir/test/lib/Dialect/NVVM/TestLowerToNVVM.cpp
134

inconsistent?

208

Can we put this 64 in a named constant?

This revision is now accepted and ready to land. Jul 17 2023, 7:27 AM
guraypp accepted this revision. Jul 17 2023, 7:33 AM

Setting the pass pipeline manually was a headache. This is really useful to run something quickly on the GPU.

> One peculiarity of the GPU pipeline is that we want to allow 32b indexing in kernels.

That's right: GPUs only have 32-bit registers; however, they can use a pair of 32-bit registers for a 64-bit address space.

The constant and shared memory spaces are 32-bit addressable. In LLVM we have a flag for that; can we set this flag (see below)?
https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp#L60-L65
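
For reference, the flag behind that link is NVPTX's nvptx-short-ptr option, which feeds the module data layout; a simplified excerpt (verify against the linked file):

  // Simplified from llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp: the flag
  // opts the const/local/shared address spaces into 32-bit pointers via the
  // p3/p4/p5 entries of the data layout string.
  static cl::opt<bool> UseShortPointersOpt(
      "nvptx-short-ptr",
      cl::desc("Use 32-bit pointers for accessing const/local/shared "
               "address spaces."),
      cl::init(false), cl::Hidden);

  static std::string computeDataLayout(bool is64Bit, bool UseShortPointers) {
    std::string Ret = "e";
    if (!is64Bit)
      Ret += "-p:32:32";
    else if (UseShortPointers) // shared (3), const (4), local (5) as 32-bit
      Ret += "-p3:32:32-p4:32:32-p5:32:32";
    Ret += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
    return Ret;
  }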

guraypp added inline comments. Jul 17 2023, 7:35 AM
mlir/test/lib/Dialect/NVVM/TestLowerToNVVM.cpp
143

We need to call createConvertNVVMToLLVMPass as well. I added that for the PTX builder.
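
If it helps future readers, the placement would presumably extend the kernel-side portion of the pipeline sketched earlier, nested on the GPU module like the other kernel passes (an assumption, not the actual follow-up patch):

  // Assumed placement: run NVVM-to-LLVM after the GPU-to-NVVM lowering so
  // that NVVM ops with inline-PTX lowerings are converted as well.
  pm.addNestedPass<gpu::GPUModuleOp>(createConvertNVVMToLLVMPass());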

nicolasvasilache marked 3 inline comments as done. Jul 17 2023, 7:59 AM
nicolasvasilache added inline comments.
mlir/test/lib/Dialect/NVVM/TestLowerToNVVM.cpp
143

Discussed offline: @guraypp will pick this up in a follow-up, as he knows what needs to be done and I'd be in discovery mode.

nicolasvasilache marked an inline comment as done.

Update

This revision was landed with ongoing or failed builds. Jul 17 2023, 8:18 AM
This revision was automatically updated to reflect the committed changes.

Thanks for doing this! It's useful to have this kind of reference pipeline available :)

kerrmudgeon added a subscriber: kerrmudgeon. Edited Jul 25 2023, 10:44 AM

Enabling 32b indexing is a useful optimization, but limiting kernels to strictly 32b linear offsets and strides is not suitable for many classes of production kernels, which must be able to index the entire physical GPU memory.

The following use-cases come to mind with corresponding index types:

  • Global Memory offsets and strides: i64
  • Shared Memory offsets: i32
  • Tensor Coordinates: i32

Is it feasible to one day support a multiplicity of index types within the GPU kernel? When lowering the index type, the relevant pass would need to consider the memory space the index dereferences (see the sketch below). This is admittedly complicated, but range limits have a significant impact on the functionality of the resulting kernels.
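
To illustrate the decision such a pass would face, a hypothetical helper (not an existing MLIR API) picking a bitwidth per NVVM address space might look like:

  // Hypothetical: choose an index bitwidth from the NVVM address space of
  // the memref being indexed. Not an existing MLIR API.
  static unsigned pickIndexBitwidth(unsigned addressSpace) {
    switch (addressSpace) {
    case 3: // shared: 32-bit addressable
    case 4: // constant: 32-bit addressable
      return 32;
    default: // generic (0) and global (1) must span all device memory
      return 64;
    }
  }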

> Enabling 32b indexing is a useful optimization, but limiting kernels to strictly 32b linear offsets and strides is not suitable for many classes of production kernels [...] Is it feasible to one day support a multiplicity of index types within the GPU kernel?

Yes, absolutely this would be a good landing state.

In this first commit I went purely after the only thing I was able to make run without crashing; it was quite an unfortunate trial-and-error process to get out of the failure modes for now...
The mixed mode will unfortunately break in various ways that I have not yet had time to investigate (see some TODOs).

As layering improves I'd love to see much more usable compositions of the things you mention.

So please treat this only as the first e2e thing that is known to work, and let's iterate on improvements and generalizations.

> Enabling 32b indexing is a useful optimization, but limiting kernels to strictly 32b linear offsets and strides is not suitable for many classes of production kernels [...] Is it feasible to one day support a multiplicity of index types within the GPU kernel?

I talked offline with @nicolasvasilache; I will follow up on this part as well. I think we want to set an LLVM flag (see the link below) that sets the data layout for the shared and constant memory spaces. As these memories are 32-bit addressable, it is always safe to use 32b there. You are right that we cannot easily use 32b directly for the global address space.
https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp#L60-L65

Even if we set kernel-index-bitwidth=32 right now, I believe the PTX still uses 64-bit registers, because the flag does not change the data layout and LLVM promotes 32b to 64b. It works, but it is not performant.
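
Concretely, the follow-up would amount to changing the data layout carried by the kernel's LLVM module. A sketch, assuming a handle llvmModule to that module (the layout string mirrors computeDataLayout() in the linked NVPTXTargetMachine.cpp and should be verified there):

  // Give the kernel module the NVPTX "short pointer" layout so that
  // shared (3), const (4) and local (5) pointers are 32-bit wide.
  llvmModule->setTargetTriple("nvptx64-nvidia-cuda");
  llvmModule->setDataLayout(
      "e-p3:32:32-p4:32:32-p5:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");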

Yes, this all makes sense. Thanks for contributing this installment!
Iterating toward the optimal design is a process. :)