This is an archive of the discontinued LLVM Phabricator instance.

[mlir][NvGpu] Fix nvgpu.mma.sync lowering to NVVM for f32, tf32 types
ClosedPublic

Authored by christopherbate on Apr 27 2022, 9:15 AM.

Download Raw Diff

Details

Reviewers

herhut
csigg
nirvedhmeshram

Commits

rG9879807393d3: [mlir][NvGpu] Fix nvgpu.mma.sync lowering to NVVM for f32, tf32 types

Summary

Adds missing logic in the lowering from NvGPU to NVVM to support fp32
(in an accumulator operand) and tf32 (in multiplicand operand) types.
Fixes logic in one of the helper functions for converting the result
of a mma.sync operation with multiple 8x256bit output tiles, which is
the case for f32 outputs.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

christopherbate created this revision.Apr 27 2022, 9:15 AM

Herald added a project: Restricted Project. · View Herald TranscriptApr 27 2022, 9:15 AM

Herald added subscribers: mattd, gchakrabarti, sdasgup3 and 23 others. · View Herald Transcript

christopherbate requested review of this revision.Apr 27 2022, 9:15 AM

Herald added a reviewer: herhut. · View Herald TranscriptApr 27 2022, 9:15 AM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: stephenneuendorffer, nicolasvasilache. · View Herald Transcript

Harbormaster completed remote builds in B161616: Diff 425535.Apr 27 2022, 9:37 AM

Thanks. Adding @csigg as an FYI.

This revision is now accepted and ready to land.Apr 28 2022, 2:49 AM

I have a patch for NVVM mma.sync TF32 support. I should land that before this one, then update this one to include additional fixes for TF32.

Updated the diff to include changes that support TF32 operands.

christopherbate retitled this revision from [mlir][NvGpu] Fix nvgpu.mma.sync lowering to NVVM for f32 types to [mlir][NvGpu] Fix nvgpu.mma.sync lowering to NVVM for f32, tf32 types.May 5 2022, 10:42 AM

christopherbate edited the summary of this revision. (Show Details)

Harbormaster completed remote builds in B162958: Diff 427382.May 5 2022, 11:44 AM

LGTM

Closed by commit rG9879807393d3: [mlir][NvGpu] Fix nvgpu.mma.sync lowering to NVVM for f32, tf32 types (authored by christopherbate). · Explain WhyMay 8 2022, 8:58 PM

This revision was automatically updated to reflect the committed changes.

christopherbate added a commit: rG9879807393d3: [mlir][NvGpu] Fix nvgpu.mma.sync lowering to NVVM for f32, tf32 types.

Revision Contents

Path

Size

mlir/

lib/

Conversion/

NVGPUToNVVM/

NVGPUToNVVM.cpp

82 lines

test/

Conversion/

NVGPUToNVVM/

mma-sync-to-nvvm.mlir

58 lines

Diff 427382

mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp

Show All 19 Lines
static Type inferIntrinsicResultType(Type vectorResultType) {		static Type inferIntrinsicResultType(Type vectorResultType) {
MLIRContext *ctx = vectorResultType.getContext();		MLIRContext *ctx = vectorResultType.getContext();
auto a = vectorResultType.cast<LLVM::LLVMArrayType>();		auto a = vectorResultType.cast<LLVM::LLVMArrayType>();
auto f16x2Ty = LLVM::getFixedVectorType(Float16Type::get(ctx), 2);		auto f16x2Ty = LLVM::getFixedVectorType(Float16Type::get(ctx), 2);
auto i32Ty = IntegerType::get(ctx, 32);		auto i32Ty = IntegerType::get(ctx, 32);
auto i32x2Ty = LLVM::getFixedVectorType(i32Ty, 2);		auto i32x2Ty = LLVM::getFixedVectorType(i32Ty, 2);
Type f64Ty = Float64Type::get(ctx);		Type f64Ty = Float64Type::get(ctx);
Type f64x2Ty = LLVM::getFixedVectorType(f64Ty, 2);		Type f64x2Ty = LLVM::getFixedVectorType(f64Ty, 2);
		Type f32Ty = Float32Type::get(ctx);
		Type f32x2Ty = LLVM::getFixedVectorType(f32Ty, 2);
if (a.getElementType() == f16x2Ty) {		if (a.getElementType() == f16x2Ty) {
return LLVM::LLVMStructType::getLiteral(		return LLVM::LLVMStructType::getLiteral(
ctx, SmallVector<Type>(a.getNumElements(), f16x2Ty));		ctx, SmallVector<Type>(a.getNumElements(), f16x2Ty));
}		}
if (a.getElementType() == i32x2Ty) {		if (a.getElementType() == i32x2Ty) {
return LLVM::LLVMStructType::getLiteral(		return LLVM::LLVMStructType::getLiteral(
ctx,		ctx,
SmallVector<Type>(static_cast<size_t>(a.getNumElements()) * 2, i32Ty));		SmallVector<Type>(static_cast<size_t>(a.getNumElements()) * 2, i32Ty));
}		}
if (a.getElementType() == f64x2Ty) {		if (a.getElementType() == f64x2Ty) {
return LLVM::LLVMStructType::getLiteral(ctx, {f64Ty, f64Ty});		return LLVM::LLVMStructType::getLiteral(ctx, {f64Ty, f64Ty});
}		}
		if (a.getElementType() == f32x2Ty) {
		return LLVM::LLVMStructType::getLiteral(
		ctx,
		SmallVector<Type>(static_cast<size_t>(a.getNumElements()) * 2, f32Ty));
		}
		if (a.getElementType() == LLVM::getFixedVectorType(f32Ty, 1)) {
		return LLVM::LLVMStructType::getLiteral(
		ctx, SmallVector<Type>(static_cast<size_t>(a.getNumElements()), f32Ty));
		}
return vectorResultType;		return vectorResultType;
}		}

/// Convert the SSA result of the NVVM intrinsic `nvvm.mma.sync` (which is		/// Convert the SSA result of the NVVM intrinsic `nvvm.mma.sync` (which is
/// always an LLVM struct) into a fragment that is compatible with the vector		/// always an LLVM struct) into a fragment that is compatible with the vector
/// type of this operation. This involves extracting elements from the struct		/// type of this operation. This involves extracting elements from the struct
/// and inserting them into an LLVM array. These extra data-movement		/// and inserting them into an LLVM array. These extra data-movement
/// operations should be canonicalized away by the LLVM backend.		/// operations should be canonicalized away by the LLVM backend.
static Value convertIntrinsicResult(Location loc, Type intrinsicResultType,		static Value convertIntrinsicResult(Location loc, Type intrinsicResultType,
Type resultType, Value intrinsicResult,		Type resultType, Value intrinsicResult,
RewriterBase &rewriter) {		RewriterBase &rewriter) {
MLIRContext *ctx = rewriter.getContext();		MLIRContext *ctx = rewriter.getContext();
auto structType = intrinsicResultType.dyn_cast<LLVM::LLVMStructType>();		auto structType = intrinsicResultType.dyn_cast<LLVM::LLVMStructType>();
auto arrayType = resultType.dyn_cast<LLVM::LLVMArrayType>();		auto arrayType = resultType.dyn_cast<LLVM::LLVMArrayType>();
Type i32Ty = rewriter.getI32Type();		Type i32Ty = rewriter.getI32Type();
		Type f32Ty = rewriter.getF32Type();
Type f64Ty = rewriter.getF64Type();		Type f64Ty = rewriter.getF64Type();
Type f16x2Ty = LLVM::getFixedVectorType(rewriter.getF16Type(), 2);		Type f16x2Ty = LLVM::getFixedVectorType(rewriter.getF16Type(), 2);
Type i32x2Ty = LLVM::getFixedVectorType(i32Ty, 2);		Type i32x2Ty = LLVM::getFixedVectorType(i32Ty, 2);
Type f64x2Ty = LLVM::getFixedVectorType(f64Ty, 2);		Type f64x2Ty = LLVM::getFixedVectorType(f64Ty, 2);
		Type f32x2Ty = LLVM::getFixedVectorType(f32Ty, 2);
		Type f32x1Ty = LLVM::getFixedVectorType(f32Ty, 1);

auto makeConst = [&](int32_t index) -> Value {		auto makeConst = [&](int32_t index) -> Value {
return rewriter.create<LLVM::ConstantOp>(loc, IntegerType::get(ctx, 32),		return rewriter.create<LLVM::ConstantOp>(loc, IntegerType::get(ctx, 32),
rewriter.getI32IntegerAttr(index));		rewriter.getI32IntegerAttr(index));
};		};

if (arrayType) {		if (arrayType) {
SmallVector<Value, 4> elements;		SmallVector<Value, 4> elements;

if (arrayType.getElementType() == f16x2Ty) {		// The intrinsic returns 32-bit wide elements in a form which can be
		// directly bitcasted and inserted into the result vector.
		if (arrayType.getElementType() == f16x2Ty ||
		arrayType.getElementType() == f32x1Ty) {
for (unsigned i = 0; i < structType.getBody().size(); i++) {		for (unsigned i = 0; i < structType.getBody().size(); i++) {
elements.push_back(rewriter.create<LLVM::ExtractValueOp>(		Value el = rewriter.create<LLVM::ExtractValueOp>(
loc, structType.getBody()[i], intrinsicResult,		loc, structType.getBody()[i], intrinsicResult,
rewriter.getI64ArrayAttr(i)));		rewriter.getI64ArrayAttr(i));
		el = rewriter.createOrFold<LLVM::BitcastOp>(
		loc, arrayType.getElementType(), el);
		elements.push_back(el);
}		}
}		}

// The intrinsic returns i32 and f64 values as individual scalars. We need		// The intrinsic returns i32, f64, and f32 values as individual scalars,
// to extract them from the struct and pack them into vectors.		// even when the result is notionally a 64-bit wide element (e.g. f32x2). We
		// need to extract them from the struct and pack them into the 64-bit wide
		// rows of the vector result.
if (arrayType.getElementType() == i32x2Ty ||		if (arrayType.getElementType() == i32x2Ty ||
arrayType.getElementType() == f64x2Ty) {		arrayType.getElementType() == f64x2Ty ||
		arrayType.getElementType() == f32x2Ty) {

		for (unsigned i = 0, e = structType.getBody().size() / 2; i < e; i++) {
Value vec =		Value vec =
rewriter.create<LLVM::UndefOp>(loc, arrayType.getElementType());		rewriter.create<LLVM::UndefOp>(loc, arrayType.getElementType());
for (unsigned i = 0, e = structType.getBody().size() / 2; i < e; i++) {
Value x1 = rewriter.create<LLVM::ExtractValueOp>(		Value x1 = rewriter.create<LLVM::ExtractValueOp>(
loc, structType.getBody()[i * 2], intrinsicResult,		loc, structType.getBody()[i * 2], intrinsicResult,
rewriter.getI64ArrayAttr(i * 2));		rewriter.getI64ArrayAttr(i * 2));
Value x2 = rewriter.create<LLVM::ExtractValueOp>(		Value x2 = rewriter.create<LLVM::ExtractValueOp>(
loc, structType.getBody()[i * 2 + 1], intrinsicResult,		loc, structType.getBody()[i * 2 + 1], intrinsicResult,
rewriter.getI64ArrayAttr(i * 2 + 1));		rewriter.getI64ArrayAttr(i * 2 + 1));
vec = rewriter.create<LLVM::InsertElementOp>(loc, vec.getType(), vec,		vec = rewriter.create<LLVM::InsertElementOp>(loc, vec.getType(), vec,
x1, makeConst(0));		x1, makeConst(0));
vec = rewriter.create<LLVM::InsertElementOp>(loc, vec.getType(), vec,		vec = rewriter.create<LLVM::InsertElementOp>(loc, vec.getType(), vec,
x2, makeConst(1));		x2, makeConst(1));
}
elements.push_back(vec);		elements.push_back(vec);
}		}
		}

// Create the final vectorized result.		// Create the final vectorized result.
Value result = rewriter.create<LLVM::UndefOp>(loc, arrayType);		Value result = rewriter.create<LLVM::UndefOp>(loc, arrayType);
for (const auto &el : llvm::enumerate(elements)) {		for (const auto &el : llvm::enumerate(elements)) {
result = rewriter.create<LLVM::InsertValueOp>(		result = rewriter.create<LLVM::InsertValueOp>(
loc, arrayType, result, el.value(),		loc, arrayType, result, el.value(),
rewriter.getI64ArrayAttr(el.index()));		rewriter.getI64ArrayAttr(el.index()));
}		}
return result;		return result;
}		}

return intrinsicResult;		return intrinsicResult;
}		}

/// The `gpu.mma.sync` converter below expects matrix fragment operands to be		/// The `gpu.mma.sync` converter below expects matrix fragment operands to be
/// given as 2D `vectors` where the rows are 32b or 64b wide. The		/// given as 2D `vectors` where the rows are 32b or 64b wide. The
/// `nvvm.mma.sync` op expects these arguments to be given in a long list of		/// `nvvm.mma.sync` op expects these arguments to be given in a long list of
/// scalars of certain types. This function helps unpack the `vector` arguments		/// scalars of certain types. This function helps unpack the `vector` arguments
/// and cast them to the types expected by `nvvm.mma.sync`.		/// and cast them to the types expected by `nvvm.mma.sync`.
static SmallVector<Value> unpackOperandVector(RewriterBase &rewriter,		static SmallVector<Value> unpackOperandVector(RewriterBase &rewriter,
Location loc, Value operand) {		Location loc, Value operand,
		NVVM::MMATypes operandPtxType) {
SmallVector<Value> result;		SmallVector<Value> result;
Type i32Ty = rewriter.getI32Type();		Type i32Ty = rewriter.getI32Type();
Type f64Ty = rewriter.getF64Type();		Type f64Ty = rewriter.getF64Type();
		Type f32Ty = rewriter.getF32Type();
Type i8Ty = rewriter.getI8Type();		Type i8Ty = rewriter.getI8Type();
Type i8x4Ty = LLVM::getFixedVectorType(i8Ty, 4);		Type i8x4Ty = LLVM::getFixedVectorType(i8Ty, 4);
		Type f32x1Ty = LLVM::getFixedVectorType(f32Ty, 1);
auto arrayTy = operand.getType().cast<LLVM::LLVMArrayType>();		auto arrayTy = operand.getType().cast<LLVM::LLVMArrayType>();

for (unsigned i = 0, e = arrayTy.getNumElements(); i < e; ++i) {		for (unsigned i = 0, e = arrayTy.getNumElements(); i < e; ++i) {
Value toUse = rewriter.create<LLVM::ExtractValueOp>(		Value toUse = rewriter.create<LLVM::ExtractValueOp>(
loc, arrayTy.getElementType(), operand, rewriter.getI64ArrayAttr(i));		loc, arrayTy.getElementType(), operand, rewriter.getI64ArrayAttr(i));

// For 4xi8 vectors, the intrinsic expects these to be provided as i32		// For 4xi8 vectors, the intrinsic expects these to be provided as i32
// scalar types.		// scalar types.
if (arrayTy.getElementType() == i8x4Ty) {		if (arrayTy.getElementType() == i8x4Ty ||
		(arrayTy.getElementType() == f32x1Ty &&
		operandPtxType == NVVM::MMATypes::tf32)) {
result.push_back(		result.push_back(
rewriter.create<LLVM::BitcastOp>(loc, rewriter.getI32Type(), toUse));		rewriter.create<LLVM::BitcastOp>(loc, rewriter.getI32Type(), toUse));
continue;		continue;
}		}

// For some element types (i32, f64), we need to unpack the inner		// For some element types (i32, f32, f64), we need to unpack the inner
// vector/array type as well because the intrinsic expects individual		// vector/array type as well because the intrinsic expects individual
// scalars to be provided.		// scalars to be provided.
VectorType innerArrayTy = arrayTy.getElementType().dyn_cast<VectorType>();		VectorType innerArrayTy = arrayTy.getElementType().dyn_cast<VectorType>();
if (innerArrayTy && (innerArrayTy.getElementType() == i32Ty ||		if (innerArrayTy && (innerArrayTy.getElementType() == i32Ty ||
innerArrayTy.getElementType() == f64Ty)) {		innerArrayTy.getElementType() == f64Ty ||
		innerArrayTy.getElementType() == f32Ty)) {
for (unsigned idx = 0, innerSize = innerArrayTy.getNumElements();		for (unsigned idx = 0, innerSize = innerArrayTy.getNumElements();
idx < innerSize; idx++) {		idx < innerSize; idx++) {
result.push_back(rewriter.create<LLVM::ExtractElementOp>(		result.push_back(rewriter.create<LLVM::ExtractElementOp>(
loc, toUse,		loc, toUse,
rewriter.create<LLVM::ConstantOp>(		rewriter.create<LLVM::ConstantOp>(
loc, rewriter.getI64Type(), rewriter.getI64IntegerAttr(idx))));		loc, rewriter.getI64Type(), rewriter.getI64IntegerAttr(idx))));
}		}
continue;		continue;
▲ Show 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	struct MmaSyncOptoNVVM : public ConvertOpToLLVMPattern<nvgpu::MmaSyncOp> {

LogicalResult		LogicalResult
matchAndRewrite(nvgpu::MmaSyncOp op, OpAdaptor adaptor,		matchAndRewrite(nvgpu::MmaSyncOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {		ConversionPatternRewriter &rewriter) const override {
Location loc = op->getLoc();		Location loc = op->getLoc();
// Get the shapes of the MMAMatrix type being used. The shapes will		// Get the shapes of the MMAMatrix type being used. The shapes will
// choose which intrinsic this op will be lowered to.		// choose which intrinsic this op will be lowered to.
auto aType = op.matrixA().getType().cast<VectorType>();		auto aType = op.matrixA().getType().cast<VectorType>();
		auto cType = op.matrixC().getType().cast<VectorType>();

int64_t m = op.mmaShape()[0].cast<IntegerAttr>().getInt();		int64_t m = op.mmaShape()[0].cast<IntegerAttr>().getInt();
int64_t n = op.mmaShape()[1].cast<IntegerAttr>().getInt();		int64_t n = op.mmaShape()[1].cast<IntegerAttr>().getInt();
int64_t k = op.mmaShape()[2].cast<IntegerAttr>().getInt();		int64_t k = op.mmaShape()[2].cast<IntegerAttr>().getInt();
std::array<int64_t, 3> gemmShape{m, n, k};		std::array<int64_t, 3> gemmShape{m, n, k};

SmallVector<Value> matA =
unpackOperandVector(rewriter, loc, adaptor.matrixA());
SmallVector<Value> matB =
unpackOperandVector(rewriter, loc, adaptor.matrixB());
SmallVector<Value> matC =
unpackOperandVector(rewriter, loc, adaptor.matrixC());

NVVM::MMATypes ptxTypeA;		NVVM::MMATypes ptxTypeA;
NVVM::MMATypes ptxTypeB;		NVVM::MMATypes ptxTypeB;
		Optional<NVVM::MMATypes> ptxTypeC = NVVM::MmaOp::inferOperandMMAType(
		cType.getElementType(), /*isAccumulator=*/true);
		if (!ptxTypeC) {
		return op->emitError(
		"could not infer the PTX type for the accumulator/result");
		}

Optional<NVVM::MMAIntOverflow> overflow(llvm::None);		Optional<NVVM::MMAIntOverflow> overflow(llvm::None);
if (aType.getElementType().isInteger(8)) {		if (aType.getElementType().isInteger(8)) {
ptxTypeA = NVVM::MMATypes::s8;		ptxTypeA = NVVM::MMATypes::s8;
ptxTypeB = NVVM::MMATypes::s8;		ptxTypeB = NVVM::MMATypes::s8;
overflow = NVVM::MMAIntOverflow::satfinite;		overflow = NVVM::MMAIntOverflow::satfinite;

} else if (aType.getElementType().isF16()) {		} else if (aType.getElementType().isF16()) {
ptxTypeA = NVVM::MMATypes::f16;		ptxTypeA = NVVM::MMATypes::f16;
ptxTypeB = NVVM::MMATypes::f16;		ptxTypeB = NVVM::MMATypes::f16;
} else if (aType.getElementType().isF64()) {		} else if (aType.getElementType().isF64()) {
ptxTypeA = NVVM::MMATypes::f64;		ptxTypeA = NVVM::MMATypes::f64;
ptxTypeB = NVVM::MMATypes::f64;		ptxTypeB = NVVM::MMATypes::f64;
		} else if (aType.getElementType().isF32()) {
		ptxTypeA = NVVM::MMATypes::tf32;
		ptxTypeB = NVVM::MMATypes::tf32;
} else {		} else {
return op->emitError("could not deduce operand PTX types");		return op->emitError("could not deduce operand PTX types");
}		}

		SmallVector<Value> matA =
		unpackOperandVector(rewriter, loc, adaptor.matrixA(), ptxTypeA);
		SmallVector<Value> matB =
		unpackOperandVector(rewriter, loc, adaptor.matrixB(), ptxTypeB);
		SmallVector<Value> matC =
		unpackOperandVector(rewriter, loc, adaptor.matrixC(), *ptxTypeC);

Type desiredRetTy = typeConverter->convertType(op->getResultTypes()[0]);		Type desiredRetTy = typeConverter->convertType(op->getResultTypes()[0]);
Type intrinsicResTy = inferIntrinsicResultType(		Type intrinsicResTy = inferIntrinsicResultType(
typeConverter->convertType(op->getResultTypes()[0]));		typeConverter->convertType(op->getResultTypes()[0]));
Value intrinsicResult = rewriter.create<NVVM::MmaOp>(		Value intrinsicResult = rewriter.create<NVVM::MmaOp>(
op.getLoc(), intrinsicResTy, matA, matB, matC,		op.getLoc(), intrinsicResTy, matA, matB, matC,
/*shape=*/gemmShape,		/*shape=*/gemmShape,
/*b1Op=*/llvm::None,		/*b1Op=*/llvm::None,
/*intOverflow=*/overflow,		/*intOverflow=*/overflow,
Show All 38 Lines

mlir/test/Conversion/NVGPUToNVVM/mma-sync-to-nvvm.mlir

Show All 18 Lines	func.func @m16n8k16_fp16(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
// CHECK: llvm.mlir.undef : !llvm.array<2 x vector<2xf16>>		// CHECK: llvm.mlir.undef : !llvm.array<2 x vector<2xf16>>
// CHECK-DAG: llvm.insertvalue {{%.+}}, {{%.+}}[0] : !llvm.array<2 x vector<2xf16>>		// CHECK-DAG: llvm.insertvalue {{%.+}}, {{%.+}}[0] : !llvm.array<2 x vector<2xf16>>
// CHECK-DAG: llvm.insertvalue {{%.+}}, {{%.+}}[1] : !llvm.array<2 x vector<2xf16>>		// CHECK-DAG: llvm.insertvalue {{%.+}}, {{%.+}}[1] : !llvm.array<2 x vector<2xf16>>
return %d : vector<2x2xf16>		return %d : vector<2x2xf16>
}		}

// -----		// -----

		// Same as above but with fp32 accumulation type.

		// CHECK-LABEL: @m16n8k16_fp16_fp32
		func.func @m16n8k16_fp16_fp32(%arg0: vector<4x2xf16>, %arg1: vector<2x2xf16>, %arg2: vector<2x2xf32>) -> vector<2x2xf32> {
		// We just need to check the mma instruction and the manipulation of the result.
		// CHECK: [[d:%.+]] = nvvm.mma.sync
		// CHECK-SAME: shape = {k = 16 : i32, m = 16 : i32, n = 8 : i32}
		// CHECK-SAME: (vector<2xf16>, vector<2xf16>, f32) -> !llvm.struct<(f32, f32, f32, f32)>
		%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 16]} : (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>
		// CHECK: [[undef:%.+]] = llvm.mlir.undef : vector<2xf32>
		// CHECK-DAG: llvm.extractvalue [[d]][0] : !llvm.struct<(f32, f32, f32, f32)>
		// CHECK-DAG: llvm.extractvalue [[d]][1] : !llvm.struct<(f32, f32, f32, f32)>
		// CHECK: [[d00:%.+]] = llvm.insertelement {{%.+}}, [[undef]][{{.*}}] : vector<2xf32>
		// CHECK: [[d01:%.+]] = llvm.insertelement {{%.+}}, [[d00]][{{.*}}] : vector<2xf32>

		// CHECK: [[undef:%.+]] = llvm.mlir.undef : vector<2xf32>
		// CHECK-DAG: llvm.extractvalue [[d]][2] : !llvm.struct<(f32, f32, f32, f32)>
		// CHECK-DAG: llvm.extractvalue [[d]][3] : !llvm.struct<(f32, f32, f32, f32)>
		// CHECK: [[d10:%.+]] = llvm.insertelement {{%.+}}, [[undef]][{{.*}}] : vector<2xf32>
		// CHECK: [[d11:%.+]] = llvm.insertelement {{%.+}}, [[d10]][{{.*}}] : vector<2xf32>

		// CHECK-DAG: llvm.insertvalue [[d01]], {{%.+}}[0] : !llvm.array<2 x vector<2xf32>>
		// CHECK-DAG: llvm.insertvalue [[d11]], {{%.+}}[1] : !llvm.array<2 x vector<2xf32>>
		return %d : vector<2x2xf32>
		}

		// -----

// CHECK-LABEL: @m16n8k8_fp16		// CHECK-LABEL: @m16n8k8_fp16
func.func @m16n8k8_fp16(%arg0: vector<2x2xf16>, %arg1: vector<1x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {		func.func @m16n8k8_fp16(%arg0: vector<2x2xf16>, %arg1: vector<1x2xf16>, %arg2: vector<2x2xf16>) -> vector<2x2xf16> {
// CHECK: llvm.extractvalue %{{.*}}[0] : !llvm.array<2 x vector<2xf16>>		// CHECK: llvm.extractvalue %{{.*}}[0] : !llvm.array<2 x vector<2xf16>>
// CHECK: llvm.extractvalue %{{.*}}[1] : !llvm.array<2 x vector<2xf16>>		// CHECK: llvm.extractvalue %{{.*}}[1] : !llvm.array<2 x vector<2xf16>>
// CHECK: llvm.extractvalue %{{.*}}[0] : !llvm.array<1 x vector<2xf16>>		// CHECK: llvm.extractvalue %{{.*}}[0] : !llvm.array<1 x vector<2xf16>>
// CHECK: llvm.extractvalue %{{.*}}[0] : !llvm.array<2 x vector<2xf16>>		// CHECK: llvm.extractvalue %{{.*}}[0] : !llvm.array<2 x vector<2xf16>>
// CHECK: llvm.extractvalue %{{.*}}[1] : !llvm.array<2 x vector<2xf16>>		// CHECK: llvm.extractvalue %{{.*}}[1] : !llvm.array<2 x vector<2xf16>>
// CHECK-NOT llvm.extractvalue		// CHECK-NOT llvm.extractvalue
▲ Show 20 Lines • Show All 85 Lines • ▼ Show 20 Lines
func.func @ldmatrix_x1(%arg0: memref<128x128xf16, 3>) -> vector<1x2xf16> {		func.func @ldmatrix_x1(%arg0: memref<128x128xf16, 3>) -> vector<1x2xf16> {
%c0 = arith.constant 0 : index		%c0 = arith.constant 0 : index
// CHECK: nvvm.ldmatrix {{%.+}} {layout = #nvvm.mma_layout<row>, num = 1 : i32} {{.*}} -> i32		// CHECK: nvvm.ldmatrix {{%.+}} {layout = #nvvm.mma_layout<row>, num = 1 : i32} {{.*}} -> i32
%a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 1 : i32} : memref<128x128xf16, 3> -> vector<1x2xf16>		%a = nvgpu.ldmatrix %arg0[%c0, %c0] {transpose = false, numTiles = 1 : i32} : memref<128x128xf16, 3> -> vector<1x2xf16>
// CHECK: llvm.bitcast		// CHECK: llvm.bitcast
// CHECK: llvm.insertvalue		// CHECK: llvm.insertvalue
return %a : vector<1x2xf16>		return %a : vector<1x2xf16>
}		}

		// -----

		// CHECK-LABEL: @m16n8k4_tf32
		func.func @m16n8k4_tf32(%arg0: vector<2x1xf32>, %arg1: vector<1x1xf32>, %arg2: vector<4x1xf32>) -> vector<4x1xf32> {
		// The A, B operand should be bitcast to i32
		// CHECK: llvm.extractvalue
		// CHECK: llvm.bitcast {{.*}} : vector<1xf32> to i32
		// CHECK: llvm.extractvalue
		// CHECK: llvm.bitcast {{.*}} : vector<1xf32> to i32
		// CHECK: llvm.extractvalue
		// CHECK: llvm.bitcast {{.*}} : vector<1xf32> to i32

		// CHECK: [[d:%.+]] = nvvm.mma.sync A[{{%.+}}, {{%.+}}] B[{{%.+}}] C[{{%.+}}, {{%.+}}, {{%.+}}, {{%.+}}]
		// CHECK-SAME: multiplicandAPtxType = #nvvm.mma_type<tf32>
		// CHECK-SAME: multiplicandBPtxType = #nvvm.mma_type<tf32>
		// CHECK-SAME: shape = {k = 4 : i32, m = 16 : i32, n = 8 : i32}
		// CHECK-SAME: -> !llvm.struct<(f32, f32, f32, f32)>
		%d = nvgpu.mma.sync (%arg0, %arg1, %arg2) {mmaShape = [16, 8, 4]} : (vector<2x1xf32>, vector<1x1xf32>, vector<4x1xf32>) -> vector<4x1xf32>
		// CHECK: [[el:%.+]] = llvm.extractvalue [[d]][0]
		// CHECK: llvm.bitcast [[el]] : f32 to vector<1xf32>
		// CHECK: [[el:%.+]] = llvm.extractvalue [[d]][1]
		// CHECK: llvm.bitcast [[el]] : f32 to vector<1xf32>
		// CHECK: [[el:%.+]] = llvm.extractvalue [[d]][2]
		// CHECK: llvm.bitcast [[el]] : f32 to vector<1xf32>
		// CHECK: [[el:%.+]] = llvm.extractvalue [[d]][3]
		// CHECK: llvm.bitcast [[el]] : f32 to vector<1xf32>
		// CHECK-COUNT-4: llvm.insertvalue {{.*}} : !llvm.array<4 x vector<1xf32>>
		return %d : vector<4x1xf32>
		}
		No newline at end of file