Use the existing helper instead of hand-rolling the index arithmetic, which handled only a subset of cases. Also relax the restriction on the memref rank for the GPU MMA ops, as we can now support any rank.
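As a minimal sketch (not taken from this diff) of what the relaxed rank restriction is meant to accept at the gpu dialect level, the snippet below loads an "A" fragment from a rank-3 memref; the function name, shapes, memory space, and leadDimension value are assumptions for illustration, and the exact accepted forms are defined by the op's verifier and the lowering in the patch.

```
func.func @load_a(%buf: memref<2x32x32xf16, 3>) -> !gpu.mma_matrix<16x16xf16, "AOp"> {
  %c0 = arith.constant 0 : index
  // Hypothetical example: load an "A" fragment from a rank-3 memref in
  // workgroup (shared) memory; previously only rank-2 memrefs were accepted.
  %frag = gpu.subgroup_mma_load_matrix %buf[%c0, %c0, %c0] {leadDimension = 32 : index}
      : memref<2x32x32xf16, 3> -> !gpu.mma_matrix<16x16xf16, "AOp">
  return %frag : !gpu.mma_matrix<16x16xf16, "AOp">
}
```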
Diff Detail
- Repository: rG LLVM Github Monorepo
Event Timeline
mlir/lib/Conversion/GPUToNVVM/WmmaOpsToNvvm.cpp, line 206

Comment: Why is this cast no longer needed?

Reply: I don't think this comment was correct; the intrinsic is overloaded and can take any pointer type. The pointer type itself doesn't matter, since the intrinsic is converted one-to-one to the PTX op wmma.load.a.sync.row.m*n*k*.*, where pointers are not typed.