Diff 321195

mlir/lib/Conversion/GPUCommon/CMakeLists.txt

Show All 23 Lines	add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms

LINK_COMPONENTS		LINK_COMPONENTS
Core		Core
MC		MC
${AMDGPU_LIBS}		${AMDGPU_LIBS}
${NVPTX_LIBS}		${NVPTX_LIBS}

LINK_LIBS PUBLIC		LINK_LIBS PUBLIC
		MLIRAsyncToLLVM
MLIRGPU		MLIRGPU
MLIRIR		MLIRIR
MLIRLLVMIR		MLIRLLVMIR
MLIRPass		MLIRPass
MLIRSupport		MLIRSupport
MLIRStandardToLLVM		MLIRStandardToLLVM
)		)

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp

Show All 10 Lines
// ABI, this pass uses a slim runtime layer that builds on top of the public		// ABI, this pass uses a slim runtime layer that builds on top of the public
// API from GPU runtime headers.		// API from GPU runtime headers.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"		#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"

#include "../PassDetail.h"		#include "../PassDetail.h"
		#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"		#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Dialect/GPU/GPUDialect.h"		#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"		#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"		#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"		#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"		#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/BuiltinTypes.h"		#include "mlir/IR/BuiltinTypes.h"

▲ Show 20 Lines • Show All 261 Lines • ▼ Show 20 Lines	private:
matchAndRewrite(gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,		matchAndRewrite(gpu::MemcpyOp memcpyOp, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override;		ConversionPatternRewriter &rewriter) const override;
};		};
} // namespace		} // namespace

void GpuToLLVMConversionPass::runOnOperation() {		void GpuToLLVMConversionPass::runOnOperation() {
LLVMTypeConverter converter(&getContext());		LLVMTypeConverter converter(&getContext());
OwningRewritePatternList patterns;		OwningRewritePatternList patterns;
		LLVMConversionTarget target(getContext());

populateStdToLLVMConversionPatterns(converter, patterns);		populateStdToLLVMConversionPatterns(converter, patterns);
		populateAsyncStructuralTypeConversionsAndLegality(&getContext(), converter,
		herhut (inline comment, done): Long term we have to find a better way to do these things. For now this is fine (just writing this here for documentation).
		patterns, target);
populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);		populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);

LLVMConversionTarget target(getContext());
if (failed(		if (failed(
applyPartialConversion(getOperation(), target, std::move(patterns))))		applyPartialConversion(getOperation(), target, std::move(patterns))))
signalPassFailure();		signalPassFailure();
}		}

LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,		LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
ArrayRef<Value> arguments) const {		ArrayRef<Value> arguments) const {
auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();		auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
▲ Show 20 Lines • Show All 425 Lines • Show Last 20 Lines

mlir/test/mlir-cuda-runner/async.mlir

This file was added.

				// RUN: mlir-cuda-runner %s --entry-point-result=void -O0 \
				// RUN: --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext \
				// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_async_runtime%shlibext \
				// RUN: --shared-libs=%linalg_test_lib_dir/libmlir_runner_utils%shlibext \
				// RUN: | FileCheck %s

				func @main() {
				%c0 = constant 0 : index
				%c1 = constant 1 : index
				%count = constant 2 : index

				// initialize h0 on host
				%h0 = alloc(%count) : memref<?xi32>
				%h0_unranked = memref_cast %h0 : memref<?xi32> to memref<*xi32>
				gpu.host_register %h0_unranked : memref<*xi32>
				herhut (inline comment, not done): Is this needed?
				csigg (author reply, done): It is not required for correctness if we copy the result as you suggested below. But without page-locking, the memcpy becomes synchronous, and the second and third memcpy would be scheduled (and executed) after the first has completed, even without synchronization.

				%v0 = constant 42 : i32
				store %v0, %h0[%c0] : memref<?xi32>
				store %v0, %h0[%c1] : memref<?xi32>

				// copy h0 to b0 on device.
				%t0, %f0 = async.execute () -> !async.value<memref<?xi32>> {
				%b0 = gpu.alloc(%count) : memref<?xi32>
				gpu.memcpy %b0, %h0 : memref<?xi32>, memref<?xi32>
				async.yield %b0 : memref<?xi32>
				}

				// copy h0 to b1 and b2 (fork)
				%t1, %f1 = async.execute [%t0] (
				%f0 as %b0 : !async.value<memref<?xi32>>
				) -> !async.value<memref<?xi32>> {
				%b1 = gpu.alloc(%count) : memref<?xi32>
				gpu.memcpy %b1, %b0 : memref<?xi32>, memref<?xi32>
				async.yield %b1 : memref<?xi32>
				}
				%t2, %f2 = async.execute [%t0] (
				%f0 as %b0 : !async.value<memref<?xi32>>
				) -> !async.value<memref<?xi32>> {
				%b2 = gpu.alloc(%count) : memref<?xi32>
				gpu.memcpy %b2, %b0 : memref<?xi32>, memref<?xi32>
				async.yield %b2 : memref<?xi32>
				}

				// h0 = b1 + b2 (join).
				%t3 = async.execute [%t1, %t2] (
				%f1 as %b1 : !async.value<memref<?xi32>>,
				%f2 as %b2 : !async.value<memref<?xi32>>
				) {
				gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %c1, %grid_y = %c1, %grid_z = %c1)
				threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %c1, %block_z = %c1) {
				%v1 = load %b1[%tx] : memref<?xi32>
				%v2 = load %b2[%tx] : memref<?xi32>
				%sum = addi %v1, %v2 : i32
				store %sum, %h0[%tx] : memref<?xi32>
				gpu.terminator
				}
				async.yield
				}

				async.await %t3 : !async.token
				herhut (inline comment, not done): Why not explicitly copy back, too?
				csigg (author reply, done): I had that initially, but I wanted to keep the number of GPU operations minimal.
				// CHECK: [84, 84]
				call @print_memref_i32(%h0_unranked) : (memref<*xi32>) -> ()
				return
				}

				func private @print_memref_i32(memref<*xi32>)

mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp

//===- mlir-cuda-runner.cpp - MLIR CUDA Execution Driver-------------------===//		//===- mlir-cuda-runner.cpp - MLIR CUDA Execution Driver-------------------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This is a command line utility that executes an MLIR file on the GPU by		// This is a command line utility that executes an MLIR file on the GPU by
// translating MLIR to NVVM/LLVM IR before JIT-compiling and executing the		// translating MLIR to NVVM/LLVM IR before JIT-compiling and executing the
// latter.		// latter.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"

		#include "mlir/Conversion/AsyncToLLVM/AsyncToLLVM.h"
#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"		#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"		#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"		#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"		#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
		#include "mlir/Dialect/Async/Passes.h"
#include "mlir/Dialect/GPU/GPUDialect.h"		#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"		#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"		#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"		#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/ExecutionEngine/JitRunner.h"		#include "mlir/ExecutionEngine/JitRunner.h"
#include "mlir/ExecutionEngine/OptUtils.h"		#include "mlir/ExecutionEngine/OptUtils.h"
#include "mlir/IR/BuiltinOps.h"		#include "mlir/IR/BuiltinOps.h"
#include "mlir/InitAllDialects.h"		#include "mlir/InitAllDialects.h"
▲ Show 20 Lines • Show All 76 Lines • ▼ Show 20 Lines	OwnedBlob compilePtxToCubin(const std::string ptx, Location loc,

return result;		return result;
}		}

static LogicalResult runMLIRPasses(ModuleOp m) {		static LogicalResult runMLIRPasses(ModuleOp m) {
PassManager pm(m.getContext());		PassManager pm(m.getContext());
applyPassManagerCLOptions(pm);		applyPassManagerCLOptions(pm);

const char gpuBinaryAnnotation[] = "nvvm.cubin";		const char gpuBinaryAnnotation[] = "nvvm.cubin";
		herhut (inline comment, not done): Just wondering whether all of this could also become a textual pass specification. Then the cuda runner would essentially only consist of a special pass that wraps the creation of cubin blobs (the `createConvertGPUKernelToBlobPass`) and otherwise use a specified pipeline. Then we would not need to bake in the async lowering here.
		csigg (author reply, done): Good idea, I will try that.
pm.addPass(createGpuKernelOutliningPass());		pm.addPass(createGpuKernelOutliningPass());
auto &kernelPm = pm.nest<gpu::GPUModuleOp>();		auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
kernelPm.addPass(createStripDebugInfoPass());		kernelPm.addPass(createStripDebugInfoPass());
kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());		kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
kernelPm.addPass(createConvertGPUKernelToBlobPass(		kernelPm.addPass(createConvertGPUKernelToBlobPass(
translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda",		translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda",
"sm_35", "+ptx60", gpuBinaryAnnotation));		"sm_35", "+ptx60", gpuBinaryAnnotation));
		auto &funcPm = pm.nest<FuncOp>();
		funcPm.addPass(createGpuAsyncRegionPass());
		funcPm.addPass(createAsyncRefCountingPass());
pm.addPass(createGpuToLLVMConversionPass(gpuBinaryAnnotation));		pm.addPass(createGpuToLLVMConversionPass(gpuBinaryAnnotation));
		pm.addPass(createAsyncToAsyncRuntimePass());
		pm.addPass(createConvertAsyncToLLVMPass());
		mlir::LowerToLLVMOptions lower_to_llvm_opts;
		pm.addPass(mlir::createLowerToLLVMPass(lower_to_llvm_opts));

return pm.run(m);		return pm.run(m);
}		}

int main(int argc, char **argv) {		int main(int argc, char **argv) {
registerPassManagerCLOptions();		registerPassManagerCLOptions();
llvm::InitLLVM y(argc, argv);		llvm::InitLLVM y(argc, argv);
llvm::InitializeNativeTarget();		llvm::InitializeNativeTarget();
Show All 15 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] Add gpu async integration test.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 321195

mlir/lib/Conversion/GPUCommon/CMakeLists.txt

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp

mlir/test/mlir-cuda-runner/async.mlir

mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[mlir] Add gpu async integration test. (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 321195

mlir/lib/Conversion/GPUCommon/CMakeLists.txt

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp

mlir/test/mlir-cuda-runner/async.mlir

mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp

[mlir] Add gpu async integration test.
ClosedPublic