This work introduces a new operation, nvgpu.wargroup.mma.store, to the NVGPU dialect of MLIR. Its purpose is to store the fragmented results of WGMMA into the given memref.
An example of the fragmentation is given here:
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#wgmma-64n16-d
The nvgpu.wargroup.mma.store op does the following:
- Takes one or more fragmented result matrices.
- Calculates the per-thread indices within the warp group and stores the data into the given memref (see the index-calculation sketch after the example below).
Here's an example usage of the nvgpu.wargroup.mma.store operation:
%res1, %res2 = nvgpu.wargroup.mma ...
nvgpu.wargroup.mma.store [%res1, %res2], %matrixD :
    !nvgpu.warpgroup.result<tensor = !llvm.struct<...>>,
    !nvgpu.warpgroup.result<tensor = !llvm.struct<...>>
    to memref<128x128xf32,3>
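For reference, below is a rough, illustrative sketch of the per-thread index calculation for the wgmma m64nNk16 D-matrix fragment layout documented at the PTX link above. It is not the actual lowering code; the function and variable names are hypothetical, and the layout is assumed to follow the figure in the PTX documentation.

// Rough sketch (not the actual lowering code) of the per-thread index
// calculation for the wgmma m64nNk16 D-matrix fragment layout.
#include <cstdio>

struct Coord { int row, col; };

// Maps (thread id within the warp group, register index) to the (row, col)
// of the accumulator element that the thread holds in the 64xN matrix D.
Coord dFragmentCoord(int threadId, int regId) {
  int warpId = threadId / 32;  // 4 warps of 32 threads per warp group
  int laneId = threadId % 32;
  // Each warp owns a 16-row slice; registers come in groups of 4 per
  // 8 columns, split over the two 8-row halves of the slice.
  int row = warpId * 16 + laneId / 4 + 8 * ((regId % 4) / 2);
  int col = 8 * (regId / 4) + (laneId % 4) * 2 + (regId % 2);
  return {row, col};
}

int main() {
  // Example: the element of D held by thread 37 in its register 5.
  Coord c = dFragmentCoord(/*threadId=*/37, /*regId=*/5);
  std::printf("row=%d col=%d\n", c.row, c.col);  // prints: row=17 col=11
  return 0;
}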
Depends on D158434