Diff 537366

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

Show First 20 Lines • Show All 491 Lines • ▼ Show 20 Lines	def NVVM_SyncWarpOp :
NVVM_Op<"bar.warp.sync">,		NVVM_Op<"bar.warp.sync">,
Arguments<(ins LLVM_Type:$mask)> {		Arguments<(ins LLVM_Type:$mask)> {
string llvmBuilder = [{		string llvmBuilder = [{
createIntrinsicCall(builder, llvm::Intrinsic::nvvm_bar_warp_sync, {$mask});		createIntrinsicCall(builder, llvm::Intrinsic::nvvm_bar_warp_sync, {$mask});
}];		}];
let assemblyFormat = "$mask attr-dict `:` type($mask)";		let assemblyFormat = "$mask attr-dict `:` type($mask)";
}		}

		// https://docs.nvidia.com/cuda/parallel-thread-execution/#id62
		def LoadCacheModifierCA : I32EnumAttrCase<"CA", 0, "ca">;
		def LoadCacheModifierCG : I32EnumAttrCase<"CG", 1, "cg">;
		def LoadCacheModifierCS : I32EnumAttrCase<"CS", 2, "cs">;
		def LoadCacheModifierLU : I32EnumAttrCase<"LU", 3, "lu">;
		def LoadCacheModifierCV : I32EnumAttrCase<"CV", 4, "cv">;

def NVVM_CpAsyncOp : NVVM_Op<"cp.async.shared.global">,		/// Enum attribute of the different kinds.
		def LoadCacheModifierKind : I32EnumAttr<"LoadCacheModifierKind",
		"NVVM load cache modifier kind",
		[LoadCacheModifierCA, LoadCacheModifierCG, LoadCacheModifierCS,
		LoadCacheModifierLU, LoadCacheModifierCV]> {
		let genSpecializedAttr = 0;
		let cppNamespace = "::mlir::NVVM";
		}

		def LoadCacheModifierAttr : EnumAttr<NVVM_Dialect, LoadCacheModifierKind, "load_cache_modifier">;

		def NVVM_CpAsyncOp : NVVM_Op<"cp.async.shared.global", [DeclareOpInterfaceMethods<BasicPtxBuilderOpInterface>]>,
Arguments<(ins LLVM_i8Ptr_shared:$dst,		Arguments<(ins LLVM_i8Ptr_shared:$dst,
LLVM_i8Ptr_global:$src,		LLVM_i8Ptr_global:$src,
I32Attr:$size,		I32Attr:$size,
OptionalAttr<UnitAttr>:$bypass_l1)> {		LoadCacheModifierAttr:$modifier,
		Optional<LLVM_Type>:$cpSize)> {
string llvmBuilder = [{		string llvmBuilder = [{
llvm::Intrinsic::ID id;		llvm::Intrinsic::ID id;
switch ($size) {		switch ($size) {
case 4:		case 4:
id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_4;		id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_4;
break;		break;
case 8:		case 8:
id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_8;		id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_8;
break;		break;
case 16:		case 16:
if(static_cast<bool>($bypass_l1))		if($modifier == NVVM::LoadCacheModifierKind::CG)
id = llvm::Intrinsic::nvvm_cp_async_cg_shared_global_16;		id = llvm::Intrinsic::nvvm_cp_async_cg_shared_global_16;
else		else if($modifier == NVVM::LoadCacheModifierKind::CA)
id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_16;		id = llvm::Intrinsic::nvvm_cp_async_ca_shared_global_16;
		else
		llvm_unreachable("unsupported cache modifier");
break;		break;
default:		default:
llvm_unreachable("unsupported async copy size");		llvm_unreachable("unsupported async copy size");
}		}
createIntrinsicCall(builder, id, {$dst, $src});		createIntrinsicCall(builder, id, {$dst, $src});
}];		}];
let assemblyFormat = "$dst `,` $src `,` $size attr-dict `:` type(operands)";		let assemblyFormat = "$dst `,` $src `,` $size `,` `cache` `=` $modifier (`,` $cpSize^)? attr-dict `:` type(operands)";
let hasVerifier = 1;		let hasVerifier = 1;
		let extraClassDeclaration = [{
		bool canBuildPtx() { if(getCpSize()) return true; return false; }

		void getAsmValues(RewriterBase &rewriter,
		llvm::SmallVectorImpl<std::pair<mlir::Value, mlir::NVVM::PTXRegisterMod>> &asmValues) {
		asmValues.push_back({getDst(), PTXRegisterMod::Read});
		asmValues.push_back({getSrc(), PTXRegisterMod::Read});
		asmValues.push_back({makeConstant(rewriter, getSize()), PTXRegisterMod::Read});
		asmValues.push_back({getCpSize(), PTXRegisterMod::Read});
		}
		}];
		let extraClassDefinition = [{
		const char* $cppClass::getPtx() {
		if(getModifier() == NVVM::LoadCacheModifierKind::CG)
		return "cp.async.cg.shared.global [%0], [%1], %2, %3;\n";
		if(getModifier() == NVVM::LoadCacheModifierKind::CA)
		return "cp.async.ca.shared.global [%0], [%1], %2, %3;\n";
		llvm_unreachable("unsupported cache modifier");
		}
		}];
}		}

def NVVM_CpAsyncCommitGroupOp : NVVM_Op<"cp.async.commit.group"> {		def NVVM_CpAsyncCommitGroupOp : NVVM_Op<"cp.async.commit.group"> {
string llvmBuilder = [{		string llvmBuilder = [{
createIntrinsicCall(builder, llvm::Intrinsic::nvvm_cp_async_commit_group);		createIntrinsicCall(builder, llvm::Intrinsic::nvvm_cp_async_commit_group);
}];		}];
let assemblyFormat = "attr-dict";		let assemblyFormat = "attr-dict";
}		}
▲ Show 20 Lines • Show All 810 Lines • Show Last 20 Lines

mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp

Show First 20 Lines • Show All 355 Lines • ▼ Show 20 Lines	void runOnOperation() override {
target.addLegalDialect<::mlir::LLVM::LLVMDialect>();		target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
target.addLegalDialect<::mlir::NVVM::NVVMDialect>();		target.addLegalDialect<::mlir::NVVM::NVVMDialect>();
if (failed(applyPartialConversion(getOperation(), target,		if (failed(applyPartialConversion(getOperation(), target,
std::move(patterns))))		std::move(patterns))))
signalPassFailure();		signalPassFailure();
}		}
};		};

static void emitCpAsyncOpZfillAsm(Location loc, Value dstPtr, Value srcPtr,
Value dstBytes, Value srcElements,
mlir::MemRefType elementType,
ConversionPatternRewriter &rewriter) {
auto asmDialectAttr = LLVM::AsmDialectAttr::get(rewriter.getContext(),
LLVM::AsmDialect::AD_ATT);

const char *cpAsyncCgStr = "cp.async.cg.shared.global [$0], [$1], $2, $3;\n";
nicolasvasilacheUnsubmitted Not Done Reply Inline Actions it is great that we can rationalize this more cleanly, thanks for this! nicolasvasilache: it is great that we can rationalize this more cleanly, thanks for this!
const char *cpAsyncCaStr = "cp.async.ca.shared.global [$0], [$1], $2, $3;\n";
const char *asmConstraints = "r,l,n,r";

Value c3I32 = rewriter.create<LLVM::ConstantOp>(
loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(3));
Value bitwidth = rewriter.create<LLVM::ConstantOp>(
loc, rewriter.getI32Type(),
rewriter.getI32IntegerAttr(elementType.getElementTypeBitWidth()));
Value srcElementsI32 =
rewriter.create<LLVM::TruncOp>(loc, rewriter.getI32Type(), srcElements);
Value srcBytes = rewriter.create<LLVM::LShrOp>(
loc, rewriter.create<LLVM::MulOp>(loc, bitwidth, srcElementsI32), c3I32);

SmallVector<Value> asmVals{dstPtr, srcPtr, dstBytes, srcBytes};

// Pick the right asm string based on the dstBytes which is a compile-time
// constant.
auto dstByteConstOp =
dyn_cast<mlir::LLVM::ConstantOp>(dstBytes.getDefiningOp());
auto dstByteAttr = dyn_cast<mlir::IntegerAttr>(dstByteConstOp.getValue());
int64_t dstByteVal = dstByteAttr.getValue().getSExtValue();

assert((dstByteVal == 4 || dstByteVal == 8 || dstByteVal == 16) &&
"cp.async byte copy size must be 4, 8 or 16");
// Cache global (.cg) for 16 dst bytes, Cache all (.ca) for sizes other than
// 16 dst bytes.
const char *asmStr = (dstByteVal == 16) ? cpAsyncCgStr : cpAsyncCaStr;

rewriter.create<LLVM::InlineAsmOp>(
loc, LLVM::LLVMVoidType::get(rewriter.getContext()),
/*operands=*/asmVals,
/*asm_string=*/asmStr,
/*constraints=*/asmConstraints, /*has_side_effects=*/true,
/*is_align_stack=*/false, /*asm_dialect=*/asmDialectAttr,
/*operand_attrs=*/ArrayAttr());
}

/// Returns the constraints for the sparse MMA inline assembly instruction.		/// Returns the constraints for the sparse MMA inline assembly instruction.
static std::string buildMmaSparseAsmConstraintString(unsigned matASize,		static std::string buildMmaSparseAsmConstraintString(unsigned matASize,
unsigned matBSize,		unsigned matBSize,
unsigned matCSize) {		unsigned matCSize) {
std::string str;		std::string str;
llvm::raw_string_ostream ss(str);		llvm::raw_string_ostream ss(str);
for (unsigned i = 0; i < matCSize; i++)		for (unsigned i = 0; i < matCSize; i++)
ss << "=r,";		ss << "=r,";
▲ Show 20 Lines • Show All 198 Lines • ▼ Show 20 Lines	matchAndRewrite(nvgpu::DeviceAsyncCopyOp op, OpAdaptor adaptor,
// Intrinsics takes a global pointer so we need an address space cast.		// Intrinsics takes a global pointer so we need an address space cast.
auto srcPointerGlobalType = getTypeConverter()->getPointerType(		auto srcPointerGlobalType = getTypeConverter()->getPointerType(
i8Ty, NVVM::NVVMMemorySpace::kGlobalMemorySpace);		i8Ty, NVVM::NVVMMemorySpace::kGlobalMemorySpace);
scrPtr = rewriter.create<LLVM::AddrSpaceCastOp>(loc, srcPointerGlobalType,		scrPtr = rewriter.create<LLVM::AddrSpaceCastOp>(loc, srcPointerGlobalType,
scrPtr);		scrPtr);
int64_t dstElements = adaptor.getDstElements().getZExtValue();		int64_t dstElements = adaptor.getDstElements().getZExtValue();
int64_t sizeInBytes =		int64_t sizeInBytes =
(dstMemrefType.getElementTypeBitWidth() * dstElements) / 8;		(dstMemrefType.getElementTypeBitWidth() * dstElements) / 8;
// bypass L1 is only supported for byte sizes of 16, we drop the hint
// otherwise.
UnitAttr bypassL1 =
sizeInBytes == 16 ? adaptor.getBypassL1Attr() : UnitAttr();

// When the optional SrcElements argument is present, the source (global
// memory) of CpAsyncOp is read only for SrcElements number of elements. The
// rest of the DstElements in the destination (shared memory) are filled
// with zeros.
if (op.getSrcElements())
emitCpAsyncOpZfillAsm(loc, dstPtr, scrPtr,
rewriter.create<LLVM::ConstantOp>(
loc, rewriter.getI32Type(),
rewriter.getI32IntegerAttr(sizeInBytes)),
adaptor.getSrcElements(), srcMemrefType, rewriter);

// When the optional SrcElements argument is not present, the regular		// When the optional SrcElements argument is not present, the regular
// CpAsyncOp is generated. CopyAsyncOp reads bytes from source (global		// CpAsyncOp is generated. CopyAsyncOp reads bytes from source (global
// memory) to fill DstElements number of elements in the destination (shared		// memory) to fill DstElements number of elements in the destination
// memory).		// (shared memory).
else		Value srcBytes = adaptor.getSrcElements();
rewriter.create<NVVM::CpAsyncOp>(loc, dstPtr, scrPtr,		if (srcBytes) {
rewriter.getI32IntegerAttr(sizeInBytes),		// When the optional SrcElements argument is present, the source (global
bypassL1);		// memory) of CpAsyncOp is read only for SrcElements number of elements.
		// The rest of the DstElements in the destination (shared memory) are
		// filled with zeros.
		Value c3I32 = rewriter.create<LLVM::ConstantOp>(
		loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(3));
		Value bitwidth = rewriter.create<LLVM::ConstantOp>(
		loc, rewriter.getI32Type(),
		rewriter.getI32IntegerAttr(srcMemrefType.getElementTypeBitWidth()));
		Value srcElementsI32 =
		rewriter.create<LLVM::TruncOp>(loc, rewriter.getI32Type(), srcBytes);
		srcBytes = rewriter.create<LLVM::LShrOp>(
		loc, rewriter.create<LLVM::MulOp>(loc, bitwidth, srcElementsI32),
		c3I32);
		}
		// Cache global (.cg) for 16 dst bytes, Cache all (.ca) for sizes other than
		// 16 dst bytes.
		NVVM::LoadCacheModifierKind cacheModifier =
		(op.getBypassL1().value_or(false) && sizeInBytes == 16)
		? NVVM::LoadCacheModifierKind::CG
		: NVVM::LoadCacheModifierKind::CA;

		rewriter.create<NVVM::CpAsyncOp>(
		loc, dstPtr, scrPtr, rewriter.getI32IntegerAttr(sizeInBytes),
		NVVM::LoadCacheModifierKindAttr::get(op->getContext(), cacheModifier),
		srcBytes);

// Drop the result token.		// Drop the result token.
Value zero = rewriter.create<LLVM::ConstantOp>(		Value zero = rewriter.create<LLVM::ConstantOp>(
op->getLoc(), IntegerType::get(op.getContext(), 32),		op->getLoc(), IntegerType::get(op.getContext(), 32),
rewriter.getI32IntegerAttr(0));		rewriter.getI32IntegerAttr(0));
rewriter.replaceOp(op, zero);		rewriter.replaceOp(op, zero);
return success();		return success();
}		}
▲ Show 20 Lines • Show All 44 Lines • Show Last 20 Lines

mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp

Show First 20 Lines • Show All 62 Lines • ▼ Show 20 Lines	return failure(parser.parseOperandList(ops) ||
parser.addTypeToList(type, result.types) ||		parser.addTypeToList(type, result.types) ||
parser.resolveOperands(ops, {int32Ty, int1Ty},		parser.resolveOperands(ops, {int32Ty, int1Ty},
parser.getNameLoc(), result.operands));		parser.getNameLoc(), result.operands));
}		}

void VoteBallotOp::print(OpAsmPrinter &p) { printNVVMIntrinsicOp(p, *this); }		void VoteBallotOp::print(OpAsmPrinter &p) { printNVVMIntrinsicOp(p, *this); }

LogicalResult CpAsyncOp::verify() {		LogicalResult CpAsyncOp::verify() {
		if (getModifier() != LoadCacheModifierKind::CG &&
		getModifier() != LoadCacheModifierKind::CA)
		return emitError("Only CG and CA cache modifiers are supported.");
if (getSize() != 4 && getSize() != 8 && getSize() != 16)		if (getSize() != 4 && getSize() != 8 && getSize() != 16)
return emitError("expected byte size to be either 4, 8 or 16.");		return emitError("expected byte size to be either 4, 8 or 16.");
if (getBypassL1() && getSize() != 16)		if (getModifier() == LoadCacheModifierKind::CG && getSize() != 16)
return emitError("bypass l1 is only support for 16 bytes copy.");		return emitError("CG cache modifier is only support for 16 bytes copy.");
return success();		return success();
}		}

// Given the element type of an operand and whether or not it is an accumulator,		// Given the element type of an operand and whether or not it is an accumulator,
// this function returns the PTX type (`NVVM::MMATypes`) that corresponds to the		// this function returns the PTX type (`NVVM::MMATypes`) that corresponds to the
// operand's element type.		// operand's element type.
std::optional<mlir::NVVM::MMATypes>		std::optional<mlir::NVVM::MMATypes>
MmaOp::inferOperandMMAType(Type operandElType, bool isAccumulator) {		MmaOp::inferOperandMMAType(Type operandElType, bool isAccumulator) {
▲ Show 20 Lines • Show All 673 Lines • Show Last 20 Lines

mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp

Show First 20 Lines • Show All 77 Lines • ▼ Show 20 Lines	if (dstMemref.getElementType() != srcMemref.getElementType())
return emitError("source and destination must have the same element type");		return emitError("source and destination must have the same element type");
if (size_t(srcMemref.getRank()) != getSrcIndices().size())		if (size_t(srcMemref.getRank()) != getSrcIndices().size())
return emitOpError() << "expected " << srcMemref.getRank()		return emitOpError() << "expected " << srcMemref.getRank()
<< " source indices, got " << getSrcIndices().size();		<< " source indices, got " << getSrcIndices().size();
if (size_t(dstMemref.getRank()) != getDstIndices().size())		if (size_t(dstMemref.getRank()) != getDstIndices().size())
return emitOpError() << "expected " << dstMemref.getRank()		return emitOpError() << "expected " << dstMemref.getRank()
<< " destination indices, got "		<< " destination indices, got "
<< getDstIndices().size();		<< getDstIndices().size();
		if (getBypassL1().has_value()) {
		int64_t dstElements = getDstElements().getZExtValue();
		int64_t sizeInBytes =
		(dstMemref.getElementTypeBitWidth() * dstElements) / 8;
		int64_t req = 16 * 8 / dstMemref.getElementTypeBitWidth();
		if (getBypassL1().value() && sizeInBytes != 16) {
		return emitOpError() << "bypassL1 does not satify alignment for "
		<< dstMemref << " with destination element "
		<< dstElements
		<< ". Unset bypassL1, or set "
		"destination element to "
		<< req;
		}
		}
return success();		return success();
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// NVGPU_MmaSyncOp		// NVGPU_MmaSyncOp
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
void MmaSyncOp::build(::mlir::OpBuilder &odsBuilder,		void MmaSyncOp::build(::mlir::OpBuilder &odsBuilder,
::mlir::OperationState &odsState, Value matrixA,		::mlir::OperationState &odsState, Value matrixA,
▲ Show 20 Lines • Show All 219 Lines • Show Last 20 Lines

mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir

Show First 20 Lines • Show All 252 Lines • ▼ Show 20 Lines	func.func @async_cp(
// CHECK-DAG: %[[FI2:.*]] = llvm.add %[[FI1]], %[[IDX1]] : i64		// CHECK-DAG: %[[FI2:.*]] = llvm.add %[[FI1]], %[[IDX1]] : i64
// CHECK-DAG: %[[ADDRESSDST:.*]] = llvm.getelementptr %[[BASEDST]][%[[FI2]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>		// CHECK-DAG: %[[ADDRESSDST:.*]] = llvm.getelementptr %[[BASEDST]][%[[FI2]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>
// CHECK-DAG: %[[BASESRC:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>		// CHECK-DAG: %[[BASESRC:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
// CHECK-DAG: %[[S3:.*]] = llvm.mlir.constant(128 : index) : i64		// CHECK-DAG: %[[S3:.*]] = llvm.mlir.constant(128 : index) : i64
// CHECK-DAG: %[[FI3:.*]] = llvm.mul %[[IDX1]], %[[S3]] : i64		// CHECK-DAG: %[[FI3:.*]] = llvm.mul %[[IDX1]], %[[S3]] : i64
// CHECK-DAG: %[[FI4:.*]] = llvm.add %[[FI3]], %[[IDX1]] : i64		// CHECK-DAG: %[[FI4:.*]] = llvm.add %[[FI3]], %[[IDX1]] : i64
// CHECK-DAG: %[[ADDRESSSRC:.*]] = llvm.getelementptr %[[BASESRC]][%[[FI4]]] : (!llvm.ptr, i64) -> !llvm.ptr		// CHECK-DAG: %[[ADDRESSSRC:.*]] = llvm.getelementptr %[[BASESRC]][%[[FI4]]] : (!llvm.ptr, i64) -> !llvm.ptr
// CHECK-DAG: %[[CAST2:.*]] = llvm.addrspacecast %[[ADDRESSSRC]] : !llvm.ptr to !llvm.ptr<1>		// CHECK-DAG: %[[CAST2:.*]] = llvm.addrspacecast %[[ADDRESSSRC]] : !llvm.ptr to !llvm.ptr<1>
// CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 16		// CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 16, cache = ca
%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 : memref<128x128xf32> to memref<3x16x128xf32, 3>		%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 : memref<128x128xf32> to memref<3x16x128xf32, 3>
// CHECK: nvvm.cp.async.commit.group		// CHECK: nvvm.cp.async.commit.group
%1 = nvgpu.device_async_create_group %0		%1 = nvgpu.device_async_create_group %0
// CHECK: nvvm.cp.async.wait.group 1		// CHECK: nvvm.cp.async.wait.group 1
nvgpu.device_async_wait %1 { numGroups = 1 : i32 }		nvgpu.device_async_wait %1 { numGroups = 1 : i32 }

// CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16 {bypass_l1}		// CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache = cg
%2 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3>		%2 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3>
return		return
}		}

// -----		// -----

// CHECK-LABEL: @async_cp_i4(		// CHECK-LABEL: @async_cp_i4(
// CHECK-SAME: %[[IDX:[a-zA-Z0-9_]+]]: index)		// CHECK-SAME: %[[IDX:[a-zA-Z0-9_]+]]: index)
func.func @async_cp_i4(		func.func @async_cp_i4(
%src: memref<128x64xi4>, %dst: memref<128x128xi4, 3>, %i : index) -> !nvgpu.device.async.token {		%src: memref<128x64xi4>, %dst: memref<128x128xi4, 3>, %i : index) -> !nvgpu.device.async.token {
// CHECK: %[[IDX1:.*]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64		// CHECK: %[[IDX1:.*]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64
// CHECK-DAG: %[[BASEDST:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>		// CHECK-DAG: %[[BASEDST:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK-DAG: %[[S0:.*]] = llvm.mlir.constant(128 : index) : i64		// CHECK-DAG: %[[S0:.*]] = llvm.mlir.constant(128 : index) : i64
// CHECK-DAG: %[[LI:.*]] = llvm.mul %[[IDX1]], %[[S0]] : i64		// CHECK-DAG: %[[LI:.*]] = llvm.mul %[[IDX1]], %[[S0]] : i64
// CHECK-DAG: %[[FI1:.*]] = llvm.add %[[LI]], %[[IDX1]] : i64		// CHECK-DAG: %[[FI1:.*]] = llvm.add %[[LI]], %[[IDX1]] : i64
// CHECK-DAG: %[[ADDRESSDST:.*]] = llvm.getelementptr %[[BASEDST]][%[[FI1]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>		// CHECK-DAG: %[[ADDRESSDST:.*]] = llvm.getelementptr %[[BASEDST]][%[[FI1]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>
// CHECK-DAG: %[[BASESRC:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>		// CHECK-DAG: %[[BASESRC:.*]] = llvm.extractvalue %{{.*}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
// CHECK-DAG: %[[S2:.*]] = llvm.mlir.constant(64 : index) : i64		// CHECK-DAG: %[[S2:.*]] = llvm.mlir.constant(64 : index) : i64
// CHECK-DAG: %[[FI2:.*]] = llvm.mul %[[IDX1]], %[[S2]] : i64		// CHECK-DAG: %[[FI2:.*]] = llvm.mul %[[IDX1]], %[[S2]] : i64
// CHECK-DAG: %[[FI3:.*]] = llvm.add %[[FI2]], %[[IDX1]] : i64		// CHECK-DAG: %[[FI3:.*]] = llvm.add %[[FI2]], %[[IDX1]] : i64
// CHECK-DAG: %[[ADDRESSSRC:.*]] = llvm.getelementptr %[[BASESRC]][%[[FI3]]] : (!llvm.ptr, i64) -> !llvm.ptr		// CHECK-DAG: %[[ADDRESSSRC:.*]] = llvm.getelementptr %[[BASESRC]][%[[FI3]]] : (!llvm.ptr, i64) -> !llvm.ptr
// CHECK-DAG: %[[CAST2:.*]] = llvm.addrspacecast %[[ADDRESSSRC]] : !llvm.ptr to !llvm.ptr<1>		// CHECK-DAG: %[[CAST2:.*]] = llvm.addrspacecast %[[ADDRESSSRC]] : !llvm.ptr to !llvm.ptr<1>
// CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 16		// CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 16, cache = ca
%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i], 32 : memref<128x64xi4> to memref<128x128xi4, 3>		%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i], 32 : memref<128x64xi4> to memref<128x128xi4, 3>
return %0 : !nvgpu.device.async.token		return %0 : !nvgpu.device.async.token
}		}

// -----		// -----

// CHECK-LABEL: @async_cp_zfill_f32_align4(		// CHECK-LABEL: @async_cp_zfill_f32_align4(
// CHECK-SAME: %[[IDX:[a-zA-Z0-9_]+]]: index, %[[SRCELEMENTS:[a-zA-Z0-9_]+]]: index)		// CHECK-SAME: %[[IDX:[a-zA-Z0-9_]+]]: index, %[[SRCELEMENTS:[a-zA-Z0-9_]+]]: index
func.func @async_cp_zfill_f32_align4(		func.func @async_cp_zfill_f32_align4(
%src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index, %srcElements : index) {		%src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index, %srcElements : index) {
// CHECK-DAG: %[[DSTBYTES:.*]] = llvm.mlir.constant(16 : i32) : i32		// CHECK: %[[IDX1:.*]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64
// CHECK-DAG: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.cg.shared.global [$0], [$1], $2, $3;\0A", "r,l,n,r" %[[DSTPTR:.]], %[[SRCPTR:.]], %[[DSTBYTES]], %[[SRCBYTES:.*]] : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> !llvm.void		// CHECK: %[[SRC1:.*]] = builtin.unrealized_conversion_cast %[[SRCELEMENTS]] : index to i64
		// CHECK-DAG: %[[BASEDST:.]] = llvm.extractvalue %{{.}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<3 x i64>, array<3 x i64>)>
		// CHECK-DAG: %[[S2048:.*]] = llvm.mlir.constant(2048 : index) : i64
		// CHECK-DAG: %[[LI1:.*]] = llvm.mul %[[IDX1]], %[[S2048]] : i64
		// CHECK-DAG: %[[S0:.*]] = llvm.mlir.constant(128 : index) : i64
		// CHECK-DAG: %[[LI:.*]] = llvm.mul %[[IDX1]], %[[S0]] : i64
		// CHECK-DAG: %[[FI1:.*]] = llvm.add %[[LI1]], %[[LI]] : i64
		// CHECK-DAG: %[[FI2:.*]] = llvm.add %[[FI1]], %[[IDX1]] : i64
		// CHECK-DAG: %[[ADDRESSDST:.*]] = llvm.getelementptr %[[BASEDST]][%[[FI2]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
		// CHECK-DAG: %[[BASESRC:.]] = llvm.extractvalue %{{.}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
		// CHECK-DAG: %[[S2:.*]] = llvm.mlir.constant(128 : index) : i64
		// CHECK-DAG: %[[FI2:.*]] = llvm.mul %[[IDX1]], %[[S2]] : i64
		// CHECK-DAG: %[[FI3:.*]] = llvm.add %[[FI2]], %[[IDX1]] : i64
		// CHECK-DAG: %[[ADDRESSSRC:.*]] = llvm.getelementptr %[[BASESRC]][%[[FI3]]] : (!llvm.ptr, i64) -> !llvm.ptr
		// CHECK-DAG: %[[CAST2:.*]] = llvm.addrspacecast %[[ADDRESSSRC]] : !llvm.ptr to !llvm.ptr<1>
		// CHECK-DAG: %[[c1:.*]] = llvm.mlir.constant(3 : i32) : i32
		// CHECK-DAG: %[[c2:.*]] = llvm.mlir.constant(32 : i32) : i32
		// CHECK-DAG: %[[c3:.*]] = llvm.trunc %[[SRC1]] : i64 to i32
		// CHECK-DAG: %[[c4:.*]] = llvm.mul %[[c2]], %[[c3]] : i32
		// CHECK-DAG: %[[c5:.*]] = llvm.lshr %[[c4]], %[[c1]] : i32
		// CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 16, cache = cg, %[[c5]]
%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4, %srcElements {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3>		%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4, %srcElements {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3>
// CHECK: nvvm.cp.async.commit.group		// CHECK: nvvm.cp.async.commit.group
%1 = nvgpu.device_async_create_group %0		%1 = nvgpu.device_async_create_group %0
// CHECK: nvvm.cp.async.wait.group 1		// CHECK: nvvm.cp.async.wait.group 1
nvgpu.device_async_wait %1 { numGroups = 1 : i32 }		nvgpu.device_async_wait %1 { numGroups = 1 : i32 }

return		return
}		}

// -----		// -----

// CHECK-LABEL: @async_cp_zfill_f32_align1(		// CHECK-LABEL: @async_cp_zfill_f32_align1(
// CHECK-SAME: %[[IDX:[a-zA-Z0-9_]+]]: index, %[[SRCELEMENTS:[a-zA-Z0-9_]+]]: index)		// CHECK-SAME: %[[IDX:[a-zA-Z0-9_]+]]: index, %[[SRCELEMENTS:[a-zA-Z0-9_]+]]: index)
func.func @async_cp_zfill_f32_align1(		func.func @async_cp_zfill_f32_align1(
%src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index, %srcElements : index) {		%src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index, %srcElements : index) {
// CHECK-DAG: %[[DSTBYTES:.*]] = llvm.mlir.constant(4 : i32) : i32		// CHECK: %[[IDX1:.*]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64
// CHECK-DAG: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.ca.shared.global [$0], [$1], $2, $3;\0A", "r,l,n,r" %[[DSTPTR:.]], %[[SRCPTR:.]], %[[DSTBYTES]], %[[SRCBYTES:.*]] : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> !llvm.void		// CHECK: %[[SRC1:.*]] = builtin.unrealized_conversion_cast %[[SRCELEMENTS]] : index to i64
%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 1, %srcElements {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3>		// CHECK-DAG: %[[BASEDST:.]] = llvm.extractvalue %{{.}}[1] : !llvm.struct<(ptr<3>, ptr<3>, i64, array<3 x i64>, array<3 x i64>)>
		// CHECK-DAG: %[[S2048:.*]] = llvm.mlir.constant(2048 : index) : i64
		// CHECK-DAG: %[[LI1:.*]] = llvm.mul %[[IDX1]], %[[S2048]] : i64
		// CHECK-DAG: %[[S0:.*]] = llvm.mlir.constant(128 : index) : i64
		// CHECK-DAG: %[[LI:.*]] = llvm.mul %[[IDX1]], %[[S0]] : i64
		// CHECK-DAG: %[[FI1:.*]] = llvm.add %[[LI1]], %[[LI]] : i64
		// CHECK-DAG: %[[FI2:.*]] = llvm.add %[[FI1]], %[[IDX1]] : i64
		// CHECK-DAG: %[[ADDRESSDST:.*]] = llvm.getelementptr %[[BASEDST]][%[[FI2]]] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
		// CHECK-DAG: %[[BASESRC:.]] = llvm.extractvalue %{{.}}[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
		// CHECK-DAG: %[[S2:.*]] = llvm.mlir.constant(128 : index) : i64
		// CHECK-DAG: %[[FI2:.*]] = llvm.mul %[[IDX1]], %[[S2]] : i64
		// CHECK-DAG: %[[FI3:.*]] = llvm.add %[[FI2]], %[[IDX1]] : i64
		// CHECK-DAG: %[[ADDRESSSRC:.*]] = llvm.getelementptr %[[BASESRC]][%[[FI3]]] : (!llvm.ptr, i64) -> !llvm.ptr
		// CHECK-DAG: %[[CAST2:.*]] = llvm.addrspacecast %[[ADDRESSSRC]] : !llvm.ptr to !llvm.ptr<1>
		// CHECK-DAG: %[[c1:.*]] = llvm.mlir.constant(3 : i32) : i32
		// CHECK-DAG: %[[c2:.*]] = llvm.mlir.constant(32 : i32) : i32
		// CHECK-DAG: %[[c3:.*]] = llvm.trunc %[[SRC1]] : i64 to i32
		// CHECK-DAG: %[[c4:.*]] = llvm.mul %[[c2]], %[[c3]] : i32
		// CHECK-DAG: %[[c5:.*]] = llvm.lshr %[[c4]], %[[c1]] : i32
		// CHECK-DAG: nvvm.cp.async.shared.global %[[ADDRESSDST]], %[[CAST2]], 4, cache = ca, %[[c5]]
		%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 1, %srcElements : memref<128x128xf32> to memref<3x16x128xf32, 3>
// CHECK: nvvm.cp.async.commit.group		// CHECK: nvvm.cp.async.commit.group
%1 = nvgpu.device_async_create_group %0		%1 = nvgpu.device_async_create_group %0
// CHECK: nvvm.cp.async.wait.group 1		// CHECK: nvvm.cp.async.wait.group 1
nvgpu.device_async_wait %1 { numGroups = 1 : i32 }		nvgpu.device_async_wait %1 { numGroups = 1 : i32 }

return		return
}		}

▲ Show 20 Lines • Show All 136 Lines • Show Last 20 Lines

mlir/test/Conversion/NVGPUToNVVM/typed-pointers.mlir

Show All 15 Lines	func.func @async_cp(
// CHECK-DAG: %[[CAST0:.*]] = llvm.bitcast %[[ADDRESSDST]] : !llvm.ptr<f32, 3> to !llvm.ptr<i8, 3>		// CHECK-DAG: %[[CAST0:.*]] = llvm.bitcast %[[ADDRESSDST]] : !llvm.ptr<f32, 3> to !llvm.ptr<i8, 3>
// CHECK-DAG: %[[BASESRC:.]] = llvm.extractvalue %{{.}}[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>		// CHECK-DAG: %[[BASESRC:.]] = llvm.extractvalue %{{.}}[1] : !llvm.struct<(ptr<f32>, ptr<f32>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK-DAG: %[[S3:.*]] = llvm.mlir.constant(128 : index) : i64		// CHECK-DAG: %[[S3:.*]] = llvm.mlir.constant(128 : index) : i64
// CHECK-DAG: %[[FI3:.*]] = llvm.mul %[[IDX1]], %[[S3]] : i64		// CHECK-DAG: %[[FI3:.*]] = llvm.mul %[[IDX1]], %[[S3]] : i64
// CHECK-DAG: %[[FI4:.*]] = llvm.add %[[FI3]], %[[IDX1]] : i64		// CHECK-DAG: %[[FI4:.*]] = llvm.add %[[FI3]], %[[IDX1]] : i64
// CHECK-DAG: %[[ADDRESSSRC:.*]] = llvm.getelementptr %[[BASESRC]][%[[FI4]]] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>		// CHECK-DAG: %[[ADDRESSSRC:.*]] = llvm.getelementptr %[[BASESRC]][%[[FI4]]] : (!llvm.ptr<f32>, i64) -> !llvm.ptr<f32>
// CHECK-DAG: %[[CAST1:.*]] = llvm.bitcast %[[ADDRESSSRC]] : !llvm.ptr<f32> to !llvm.ptr<i8>		// CHECK-DAG: %[[CAST1:.*]] = llvm.bitcast %[[ADDRESSSRC]] : !llvm.ptr<f32> to !llvm.ptr<i8>
// CHECK-DAG: %[[CAST2:.*]] = llvm.addrspacecast %[[CAST1]] : !llvm.ptr<i8> to !llvm.ptr<i8, 1>		// CHECK-DAG: %[[CAST2:.*]] = llvm.addrspacecast %[[CAST1]] : !llvm.ptr<i8> to !llvm.ptr<i8, 1>
// CHECK-DAG: nvvm.cp.async.shared.global %[[CAST0]], %[[CAST2]], 16		// CHECK-DAG: nvvm.cp.async.shared.global %[[CAST0]], %[[CAST2]], 16, cache = ca
%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 : memref<128x128xf32> to memref<3x16x128xf32, 3>		%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 : memref<128x128xf32> to memref<3x16x128xf32, 3>
// CHECK: nvvm.cp.async.commit.group		// CHECK: nvvm.cp.async.commit.group
%1 = nvgpu.device_async_create_group %0		%1 = nvgpu.device_async_create_group %0
// CHECK: nvvm.cp.async.wait.group 1		// CHECK: nvvm.cp.async.wait.group 1
nvgpu.device_async_wait %1 { numGroups = 1 : i32 }		nvgpu.device_async_wait %1 { numGroups = 1 : i32 }

// CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16 {bypass_l1}		// CHECK: nvvm.cp.async.shared.global %{{.*}}, %{{.*}}, 16, cache = cg
		manishucsdUnsubmitted Not Done Reply Inline Actions Removing `bypass_l1` from the `nvvm.cp.async` and matching it with the PTX spec more closely looks great! Thanks for working on it. Should we also work towards removing `bypass_l1` from `nvgpu.device_async_copy` in future? manishucsd: Removing `bypass_l1` from the `nvvm.cp.async` and matching it with the PTX spec more closely looks…
		gurayppAuthorUnsubmitted Done Reply Inline Actions Good point, I think we should delete `bypass_l1` and use the PTXs cache modifiers there. guraypp: Good point, I think we should delete `bypass_l1` and use the PTXs cache modifiers there.
%2 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3>		%2 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 4 {bypassL1}: memref<128x128xf32> to memref<3x16x128xf32, 3>
return		return
}		}

// -----		// -----

// CHECK-LABEL: @async_cp_i4(		// CHECK-LABEL: @async_cp_i4(
// CHECK-SAME: %[[IDX:[a-zA-Z0-9_]+]]: index)		// CHECK-SAME: %[[IDX:[a-zA-Z0-9_]+]]: index)
func.func @async_cp_i4(		func.func @async_cp_i4(
%src: memref<128x64xi4>, %dst: memref<128x128xi4, 3>, %i : index) -> !nvgpu.device.async.token {		%src: memref<128x64xi4>, %dst: memref<128x128xi4, 3>, %i : index) -> !nvgpu.device.async.token {
// CHECK: %[[IDX1:.*]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64		// CHECK: %[[IDX1:.*]] = builtin.unrealized_conversion_cast %[[IDX]] : index to i64
// CHECK-DAG: %[[BASEDST:.]] = llvm.extractvalue %{{.}}[1] : !llvm.struct<(ptr<i4, 3>, ptr<i4, 3>, i64, array<2 x i64>, array<2 x i64>)>		// CHECK-DAG: %[[BASEDST:.]] = llvm.extractvalue %{{.}}[1] : !llvm.struct<(ptr<i4, 3>, ptr<i4, 3>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK-DAG: %[[S0:.*]] = llvm.mlir.constant(128 : index) : i64		// CHECK-DAG: %[[S0:.*]] = llvm.mlir.constant(128 : index) : i64
// CHECK-DAG: %[[LI:.*]] = llvm.mul %[[IDX1]], %[[S0]] : i64		// CHECK-DAG: %[[LI:.*]] = llvm.mul %[[IDX1]], %[[S0]] : i64
// CHECK-DAG: %[[FI1:.*]] = llvm.add %[[LI]], %[[IDX1]] : i64		// CHECK-DAG: %[[FI1:.*]] = llvm.add %[[LI]], %[[IDX1]] : i64
// CHECK-DAG: %[[ADDRESSDST:.*]] = llvm.getelementptr %[[BASEDST]][%[[FI1]]] : (!llvm.ptr<i4, 3>, i64) -> !llvm.ptr<i4, 3>		// CHECK-DAG: %[[ADDRESSDST:.*]] = llvm.getelementptr %[[BASEDST]][%[[FI1]]] : (!llvm.ptr<i4, 3>, i64) -> !llvm.ptr<i4, 3>
// CHECK-DAG: %[[CAST0:.*]] = llvm.bitcast %[[ADDRESSDST]] : !llvm.ptr<i4, 3> to !llvm.ptr<i8, 3>		// CHECK-DAG: %[[CAST0:.*]] = llvm.bitcast %[[ADDRESSDST]] : !llvm.ptr<i4, 3> to !llvm.ptr<i8, 3>
// CHECK-DAG: %[[BASESRC:.]] = llvm.extractvalue %{{.}}[1] : !llvm.struct<(ptr<i4>, ptr<i4>, i64, array<2 x i64>, array<2 x i64>)>		// CHECK-DAG: %[[BASESRC:.]] = llvm.extractvalue %{{.}}[1] : !llvm.struct<(ptr<i4>, ptr<i4>, i64, array<2 x i64>, array<2 x i64>)>
// CHECK-DAG: %[[S2:.*]] = llvm.mlir.constant(64 : index) : i64		// CHECK-DAG: %[[S2:.*]] = llvm.mlir.constant(64 : index) : i64
// CHECK-DAG: %[[FI2:.*]] = llvm.mul %[[IDX1]], %[[S2]] : i64		// CHECK-DAG: %[[FI2:.*]] = llvm.mul %[[IDX1]], %[[S2]] : i64
// CHECK-DAG: %[[FI3:.*]] = llvm.add %[[FI2]], %[[IDX1]] : i64		// CHECK-DAG: %[[FI3:.*]] = llvm.add %[[FI2]], %[[IDX1]] : i64
// CHECK-DAG: %[[ADDRESSSRC:.*]] = llvm.getelementptr %[[BASESRC]][%[[FI3]]] : (!llvm.ptr<i4>, i64) -> !llvm.ptr<i4>		// CHECK-DAG: %[[ADDRESSSRC:.*]] = llvm.getelementptr %[[BASESRC]][%[[FI3]]] : (!llvm.ptr<i4>, i64) -> !llvm.ptr<i4>
// CHECK-DAG: %[[CAST1:.*]] = llvm.bitcast %[[ADDRESSSRC]] : !llvm.ptr<i4> to !llvm.ptr<i8>		// CHECK-DAG: %[[CAST1:.*]] = llvm.bitcast %[[ADDRESSSRC]] : !llvm.ptr<i4> to !llvm.ptr<i8>
// CHECK-DAG: %[[CAST2:.*]] = llvm.addrspacecast %[[CAST1]] : !llvm.ptr<i8> to !llvm.ptr<i8, 1>		// CHECK-DAG: %[[CAST2:.*]] = llvm.addrspacecast %[[CAST1]] : !llvm.ptr<i8> to !llvm.ptr<i8, 1>
// CHECK-DAG: nvvm.cp.async.shared.global %[[CAST0]], %[[CAST2]], 16		// CHECK-DAG: nvvm.cp.async.shared.global %[[CAST0]], %[[CAST2]], 16, cache = ca
%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i], 32 : memref<128x64xi4> to memref<128x128xi4, 3>		%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i], 32 : memref<128x64xi4> to memref<128x128xi4, 3>
return %0 : !nvgpu.device.async.token		return %0 : !nvgpu.device.async.token
}		}

mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir

	// RUN: mlir-opt --convert-nvvm-to-llvm --split-input-file %s \| FileCheck %s			// RUN: mlir-opt --convert-nvvm-to-llvm --split-input-file %s \| FileCheck %s

	// CHECK-LABEL : @init_mbarrier_arrive_expect_tx			// CHECK-LABEL : @init_mbarrier_arrive_expect_tx
	llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i32) -> i32{			llvm.func @init_mbarrier_arrive_expect_tx(%barrier : !llvm.ptr<3>, %txcount : i32) -> i32{
	//CHECK : llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 %0, [%1], %2;", "=r,r,r" %arg0, %arg1 : (!llvm.ptr<3>, i32) -> i32			//CHECK : llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.shared.b64 %0, [%1], %2;", "=r,r,r" %{{.}}, %{{.}} : (!llvm.ptr<3>, i32) -> i32
	%res = nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount : !llvm.ptr<3>, i32 -> i32			%res = nvvm.mbarrier.arrive.expect_tx.shared %barrier, %txcount : !llvm.ptr<3>, i32 -> i32
	llvm.return %res : i32			llvm.return %res : i32
	}			}

	// CHECK-LABEL : @init_mbarrier_arrive_expect_tx_generic			// CHECK-LABEL : @init_mbarrier_arrive_expect_tx_generic
	llvm.func @init_mbarrier_arrive_expect_tx_generic(%barrier : !llvm.ptr, %txcount : i32)-> i32 {			llvm.func @init_mbarrier_arrive_expect_tx_generic(%barrier : !llvm.ptr, %txcount : i32)-> i32 {
	// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.b64 %0, [%1], %2;", "=r,l,r" %arg0, %arg1 : (!llvm.ptr, i32) -> i32			// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "mbarrier.arrive.expect_tx.b64 %0, [%1], %2;", "=r,l,r" %{{.}}, %{{.}} : (!llvm.ptr, i32) -> i32
	%res = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr, i32 -> i32			%res = nvvm.mbarrier.arrive.expect_tx %barrier, %txcount : !llvm.ptr, i32 -> i32
	llvm.return %res : i32			llvm.return %res : i32
	}			}

	// CHECK-LABEL : @init_mbarrier_try_wait.parity.shared			// CHECK-LABEL : @init_mbarrier_try_wait.parity.shared
	llvm.func @init_mbarrier_try_wait.parity.shared(%barrier : !llvm.ptr<3>, %token : i32) -> i32 {			llvm.func @init_mbarrier_try_wait.parity.shared(%barrier : !llvm.ptr<3>, %token : i32) -> i32 {
	// CHECK : llvm.inline_asm has_side_effects asm_dialect = att "{\0A\09.reg .pred P1; \0A\09mbarrier.try_wait.parity.shared.b64 P1, [%1], %2; \0A\09selp.b32 %0, 1, 0, P1; \0A\09}", "=r,r,r" %arg0, %arg1 : (!llvm.ptr<3>, i32) -> i32			// CHECK : llvm.inline_asm has_side_effects asm_dialect = att "{\0A\09.reg .pred P1; \0A\09mbarrier.try_wait.parity.shared.b64 P1, [%1], %2; \0A\09selp.b32 %0, 1, 0, P1; \0A\09}", "=r,r,r" %{{.}}, %{{.}} : (!llvm.ptr<3>, i32) -> i32
	%res = nvvm.mbarrier.try_wait.parity.shared %barrier, %token : !llvm.ptr<3>, i32 -> i32			%res = nvvm.mbarrier.try_wait.parity.shared %barrier, %token : !llvm.ptr<3>, i32 -> i32
	llvm.return %res : i32			llvm.return %res : i32
	}			}

	// CHECK-LABEL : @init_mbarrier_try_wait.parity			// CHECK-LABEL : @init_mbarrier_try_wait.parity
	llvm.func @init_mbarrier_try_wait.parity(%barrier : !llvm.ptr, %token : i32) -> i32{			llvm.func @init_mbarrier_try_wait.parity(%barrier : !llvm.ptr, %token : i32) -> i32{
	// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "{\0A\09.reg .pred P1; \0A\09mbarrier.try_wait.parity.b64 P1, [%1], %2; \0A\09selp.b32 %0, 1, 0, P1; \0A\09}", "=r,l,r" %arg0, %arg1 : (!llvm.ptr, i32) -> i32			// CHECK: llvm.inline_asm has_side_effects asm_dialect = att "{\0A\09.reg .pred P1; \0A\09mbarrier.try_wait.parity.b64 P1, [%1], %2; \0A\09selp.b32 %0, 1, 0, P1; \0A\09}", "=r,l,r" %{{.}}, %{{.}} : (!llvm.ptr, i32) -> i32
	%res = nvvm.mbarrier.try_wait.parity %barrier, %token : !llvm.ptr, i32 -> i32			%res = nvvm.mbarrier.try_wait.parity %barrier, %token : !llvm.ptr, i32 -> i32
	llvm.return %res : i32			llvm.return %res : i32
	}			}

				// CHECK-LABEL : @async_cp
				func.func @async_cp(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>) {
				// CHECK : nvvm.cp.async.shared.global %{{.}}, %{{.}}, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
				nvvm.cp.async.shared.global %dst, %src, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
				// CHECK : nvvm.cp.async.shared.global %{{.}}, %{{.}}, 16, cache = cg : !llvm.ptr<3>, !llvm.ptr<1>
				nvvm.cp.async.shared.global %dst, %src, 16, cache = cg : !llvm.ptr<3>, !llvm.ptr<1>
				return
				}

				// CHECK-LABEL : @async_cp_zfill
				func.func @async_cp_zfill(%dst: !llvm.ptr<3>, %src: !llvm.ptr<1>, %cpSize: i32) {
				// CHECK : llvm.inline_asm has_side_effects asm_dialect = att "cp.async.cg.shared.global [%0], [%1], %2, %3;\0A", "r,l,r" %{{.}}, %{{.}}, %{{.*}} : (!llvm.ptr<3>, !llvm.ptr<1>, i32) -> !llvm.void
				nvvm.cp.async.shared.global %dst, %src, 16, cache = cg, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32
				// CHECK : llvm.inline_asm has_side_effects asm_dialect = att "cp.async.ca.shared.global [%0], [%1], %2, %3;\0A", "r,l,r" %{{.}}, %{{.}}, %{{.*}} : (!llvm.ptr<3>, !llvm.ptr<1>, i32) -> !llvm.void
				nvvm.cp.async.shared.global %dst, %src, 4, cache = ca, %cpSize : !llvm.ptr<3>, !llvm.ptr<1>, i32
				return
				}

mlir/test/Dialect/LLVMIR/invalid-typed-pointers.mlir

Show First 20 Lines • Show All 272 Lines • ▼ Show 20 Lines	llvm.func @wmmald_matrix(%arg0: !llvm.ptr<i32, 3>) {
%l = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout<row>} : (!llvm.ptr<i32, 3>) -> !llvm.struct<(i32, i32)>		%l = nvvm.ldmatrix %arg0 {num = 4 : i32, layout = #nvvm.mma_layout<row>} : (!llvm.ptr<i32, 3>) -> !llvm.struct<(i32, i32)>
llvm.return		llvm.return
}		}

// -----		// -----

func.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {		func.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {
// expected-error @below {{expected byte size to be either 4, 8 or 16.}}		// expected-error @below {{expected byte size to be either 4, 8 or 16.}}
nvvm.cp.async.shared.global %arg0, %arg1, 32 : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>		nvvm.cp.async.shared.global %arg0, %arg1, 32, cache = ca : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>
return		return
}		}

// -----		// -----

func.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {		func.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {
// expected-error @below {{bypass l1 is only support for 16 bytes copy.}}		// expected-error @below {{CG cache modifier is only support for 16 bytes copy.}}
nvvm.cp.async.shared.global %arg0, %arg1, 8 {bypass_l1} : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>		nvvm.cp.async.shared.global %arg0, %arg1, 8, cache = cg : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>
return		return
}		}

// -----		// -----

func.func @gep_struct_variable(%arg0: !llvm.ptr<struct<(i32)>>, %arg1: i32, %arg2: i32) {		func.func @gep_struct_variable(%arg0: !llvm.ptr<struct<(i32)>>, %arg1: i32, %arg2: i32) {
// expected-error @below {{op expected index 1 indexing a struct to be constant}}		// expected-error @below {{op expected index 1 indexing a struct to be constant}}
llvm.getelementptr %arg0[%arg1, %arg1] : (!llvm.ptr<struct<(i32)>>, i32, i32) -> !llvm.ptr<i32>		llvm.getelementptr %arg0[%arg1, %arg1] : (!llvm.ptr<struct<(i32)>>, i32, i32) -> !llvm.ptr<i32>
Show All 10 Lines

mlir/test/Dialect/LLVMIR/invalid.mlir

Show First 20 Lines • Show All 1,275 Lines • ▼ Show 20 Lines	func.func @bitcast(%arg0: vector<2x3xf32>) {
llvm.bitcast %arg0 : vector<2x3xf32> to vector<2x3xi32>		llvm.bitcast %arg0 : vector<2x3xf32> to vector<2x3xi32>
return		return
}		}

// -----		// -----

func.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) {		func.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) {
// expected-error @below {{expected byte size to be either 4, 8 or 16.}}		// expected-error @below {{expected byte size to be either 4, 8 or 16.}}
nvvm.cp.async.shared.global %arg0, %arg1, 32 : !llvm.ptr<3>, !llvm.ptr<1>		nvvm.cp.async.shared.global %arg0, %arg1, 32, cache = cg : !llvm.ptr<3>, !llvm.ptr<1>
return		return
}		}

// -----		// -----

func.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) {		func.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) {
// expected-error @below {{bypass l1 is only support for 16 bytes copy.}}		// expected-error @below {{CG cache modifier is only support for 16 bytes copy.}}
nvvm.cp.async.shared.global %arg0, %arg1, 8 {bypass_l1} : !llvm.ptr<3>, !llvm.ptr<1>		nvvm.cp.async.shared.global %arg0, %arg1, 8, cache = cg : !llvm.ptr<3>, !llvm.ptr<1>
return		return
}		}

// -----		// -----

func.func @gep_struct_variable(%arg0: !llvm.ptr, %arg1: i32, %arg2: i32) {		func.func @gep_struct_variable(%arg0: !llvm.ptr, %arg1: i32, %arg2: i32) {
// expected-error @below {{op expected index 1 indexing a struct to be constant}}		// expected-error @below {{op expected index 1 indexing a struct to be constant}}
llvm.getelementptr %arg0[%arg1, %arg1] : (!llvm.ptr, i32, i32) -> !llvm.ptr, !llvm.struct<(i32)>		llvm.getelementptr %arg0[%arg1, %arg1] : (!llvm.ptr, i32, i32) -> !llvm.ptr, !llvm.struct<(i32)>
▲ Show 20 Lines • Show All 211 Lines • Show Last 20 Lines

mlir/test/Dialect/LLVMIR/nvvm-typed-pointers.mlir

	// RUN: mlir-opt %s -split-input-file -verify-diagnostics \| FileCheck %s			// RUN: mlir-opt %s -split-input-file -verify-diagnostics \| FileCheck %s

	// CHECK-LABEL: @nvvm_wmma_load_tf32			// CHECK-LABEL: @nvvm_wmma_load_tf32
	func.func @nvvm_wmma_load_tf32(%arg0: !llvm.ptr<i32>, %arg1 : i32) -> !llvm.struct<(i32, i32, i32, i32)> {			func.func @nvvm_wmma_load_tf32(%arg0: !llvm.ptr<i32>, %arg1 : i32) -> !llvm.struct<(i32, i32, i32, i32)> {
	// CHECK: nvvm.wmma.load {{.*}} {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<a>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}			// CHECK: nvvm.wmma.load {{.*}} {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<a>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}
	%0 = nvvm.wmma.load %arg0, %arg1			%0 = nvvm.wmma.load %arg0, %arg1
	{eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<a>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}			{eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<a>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}
	: (!llvm.ptr<i32>) -> !llvm.struct<(i32, i32, i32, i32)>			: (!llvm.ptr<i32>) -> !llvm.struct<(i32, i32, i32, i32)>
	llvm.return %0 : !llvm.struct<(i32, i32, i32, i32)>			llvm.return %0 : !llvm.struct<(i32, i32, i32, i32)>
	}			}

	// CHECK-LABEL: @cp_async			// CHECK-LABEL: @cp_async
	llvm.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {			llvm.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {
	// CHECK: nvvm.cp.async.shared.global %{{.}}, %{{.}}, 16			// CHECK: nvvm.cp.async.shared.global %{{.}}, %{{.}}, cache = ca
	nvvm.cp.async.shared.global %arg0, %arg1, 16 : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>			nvvm.cp.async.shared.global %arg0, %arg1, 16, cache=ca : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>
	// CHECK: nvvm.cp.async.shared.global %{{.}}, %{{.}}, 16 {bypass_l1}			// CHECK: nvvm.cp.async.shared.global %{{.}}, %{{.}}, cache = cg
	nvvm.cp.async.shared.global %arg0, %arg1, 16 {bypass_l1} : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>			nvvm.cp.async.shared.global %arg0, %arg1, 16, cache=cg : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>
	// CHECK: nvvm.cp.async.commit.group			// CHECK: nvvm.cp.async.commit.group
	nvvm.cp.async.commit.group			nvvm.cp.async.commit.group
	// CHECK: nvvm.cp.async.wait.group 0			// CHECK: nvvm.cp.async.wait.group 0
	nvvm.cp.async.wait.group 0			nvvm.cp.async.wait.group 0
	llvm.return			llvm.return
	}			}

	// CHECK-LABEL: llvm.func @ld_matrix			// CHECK-LABEL: llvm.func @ld_matrix
	Show All 30 Lines

mlir/test/Dialect/LLVMIR/nvvm.mlir

Show First 20 Lines • Show All 283 Lines • ▼ Show 20 Lines	%r = nvvm.wmma.mma %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15
{eltypeA = #nvvm.mma_type<tf32>, eltypeB = #nvvm.mma_type<f32>, k = 8 : i32, layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}		{eltypeA = #nvvm.mma_type<tf32>, eltypeB = #nvvm.mma_type<f32>, k = 8 : i32, layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32}
: (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32)		: (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32)
-> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>		-> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
llvm.return %r : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>		llvm.return %r : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
}		}

// CHECK-LABEL: @cp_async		// CHECK-LABEL: @cp_async
llvm.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) {		llvm.func @cp_async(%arg0: !llvm.ptr<3>, %arg1: !llvm.ptr<1>) {
// CHECK: nvvm.cp.async.shared.global %{{.}}, %{{.}}, 16		// CHECK: nvvm.cp.async.shared.global %{{.}}, %{{.}}, 16, cache = ca
nvvm.cp.async.shared.global %arg0, %arg1, 16 : !llvm.ptr<3>, !llvm.ptr<1>		nvvm.cp.async.shared.global %arg0, %arg1, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
// CHECK: nvvm.cp.async.shared.global %{{.}}, %{{.}}, 16 {bypass_l1}		// CHECK: nvvm.cp.async.shared.global %{{.}}, %{{.}}, 16, cache = cg
nvvm.cp.async.shared.global %arg0, %arg1, 16 {bypass_l1} : !llvm.ptr<3>, !llvm.ptr<1>		nvvm.cp.async.shared.global %arg0, %arg1, 16, cache = cg : !llvm.ptr<3>, !llvm.ptr<1>
// CHECK: nvvm.cp.async.commit.group		// CHECK: nvvm.cp.async.commit.group
nvvm.cp.async.commit.group		nvvm.cp.async.commit.group
// CHECK: nvvm.cp.async.wait.group 0		// CHECK: nvvm.cp.async.wait.group 0
nvvm.cp.async.wait.group 0		nvvm.cp.async.wait.group 0
llvm.return		llvm.return
}		}

// CHECK-LABEL: llvm.func @ld_matrix		// CHECK-LABEL: llvm.func @ld_matrix
▲ Show 20 Lines • Show All 106 Lines • Show Last 20 Lines

mlir/test/Dialect/NVGPU/invalid.mlir

Show First 20 Lines • Show All 179 Lines • ▼ Show 20 Lines	func.func @mma_sp_sync_f16_16816(%arg0: vector<2x2xf16>,
%arg1: vector<2x2xf16>,		%arg1: vector<2x2xf16>,
%arg2: vector<2x2xf16>,		%arg2: vector<2x2xf16>,
%arg3: vector<2xi16>) -> vector<2x2xf16> {		%arg3: vector<2xi16>) -> vector<2x2xf16> {
// expected-error @+1 {{'nvgpu.mma.sp.sync' op sparsity selector should be 0 or 1}}		// expected-error @+1 {{'nvgpu.mma.sp.sync' op sparsity selector should be 0 or 1}}
%d = nvgpu.mma.sp.sync(%arg0, %arg1, %arg2) metadata(%arg3) {mmaShape = [16, 8, 16], sparsitySelector = 42 : i32} :		%d = nvgpu.mma.sp.sync(%arg0, %arg1, %arg2) metadata(%arg3) {mmaShape = [16, 8, 16], sparsitySelector = 42 : i32} :
(vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>		(vector<2x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
return %d : vector<2x2xf16>		return %d : vector<2x2xf16>
}		}

		// -----

		func.func @async_cp_zfill_f32_align1(
		%src: memref<128x128xf32>, %dst: memref<3x16x128xf32, 3>, %i : index, %srcElements : index) {
		// expected-error @+1 {{'nvgpu.device_async_copy' op bypassL1 does not satify alignment for 'memref<3x16x128xf32, 3>' with destination element 1. Unset bypassL1, or set destination element to 4}}
		%0 = nvgpu.device_async_copy %src[%i, %i], %dst[%i, %i, %i], 1, %srcElements {bypassL1} : memref<128x128xf32> to memref<3x16x128xf32, 3>
		return
		}
		No newline at end of file
		nicolasvasilacheUnsubmitted Not Done Reply Inline Actions nit: add a newline at end of file. nicolasvasilache: nit: add a newline at end of file.

mlir/test/Target/LLVMIR/nvvmir.mlir

Show First 20 Lines • Show All 303 Lines • ▼ Show 20 Lines	%r = nvvm.wmma.mma %0, %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15
: (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32)		: (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32)
-> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>		-> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
llvm.return		llvm.return
}		}

// CHECK-LABEL: @cp_async		// CHECK-LABEL: @cp_async
llvm.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {		llvm.func @cp_async(%arg0: !llvm.ptr<i8, 3>, %arg1: !llvm.ptr<i8, 1>) {
// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %{{.}}, ptr addrspace(1) %{{.}})		// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.4(ptr addrspace(3) %{{.}}, ptr addrspace(1) %{{.}})
nvvm.cp.async.shared.global %arg0, %arg1, 4 : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>		nvvm.cp.async.shared.global %arg0, %arg1, 4, cache = ca : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>
// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %{{.}}, ptr addrspace(1) %{{.}})		// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.8(ptr addrspace(3) %{{.}}, ptr addrspace(1) %{{.}})
nvvm.cp.async.shared.global %arg0, %arg1, 8 : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>		nvvm.cp.async.shared.global %arg0, %arg1, 8, cache = ca : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>
// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %{{.}}, ptr addrspace(1) %{{.}})		// CHECK: call void @llvm.nvvm.cp.async.ca.shared.global.16(ptr addrspace(3) %{{.}}, ptr addrspace(1) %{{.}})
nvvm.cp.async.shared.global %arg0, %arg1, 16 : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>		nvvm.cp.async.shared.global %arg0, %arg1, 16, cache = ca : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>
// CHECK: call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %{{.}}, ptr addrspace(1) %{{.}})		// CHECK: call void @llvm.nvvm.cp.async.cg.shared.global.16(ptr addrspace(3) %{{.}}, ptr addrspace(1) %{{.}})
nvvm.cp.async.shared.global %arg0, %arg1, 16 {bypass_l1} : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>		nvvm.cp.async.shared.global %arg0, %arg1, 16, cache = cg : !llvm.ptr<i8, 3>, !llvm.ptr<i8, 1>
// CHECK: call void @llvm.nvvm.cp.async.commit.group()		// CHECK: call void @llvm.nvvm.cp.async.commit.group()
nvvm.cp.async.commit.group		nvvm.cp.async.commit.group
// CHECK: call void @llvm.nvvm.cp.async.wait.group(i32 0)		// CHECK: call void @llvm.nvvm.cp.async.wait.group(i32 0)
nvvm.cp.async.wait.group 0		nvvm.cp.async.wait.group 0
llvm.return		llvm.return
}		}

// CHECK-LABEL: @ld_matrix		// CHECK-LABEL: @ld_matrix
▲ Show 20 Lines • Show All 112 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][nvgpu] Implement `nvgpu.device_async_copy` by NVVMToLLVM Pass
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 537366

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp

mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp

mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp

mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir

mlir/test/Conversion/NVGPUToNVVM/typed-pointers.mlir

mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir

mlir/test/Dialect/LLVMIR/invalid-typed-pointers.mlir

mlir/test/Dialect/LLVMIR/invalid.mlir

mlir/test/Dialect/LLVMIR/nvvm-typed-pointers.mlir

mlir/test/Dialect/LLVMIR/nvvm.mlir

mlir/test/Dialect/NVGPU/invalid.mlir

mlir/test/Target/LLVMIR/nvvmir.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][nvgpu] Implement `nvgpu.device_async_copy` by NVVMToLLVM Pass
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 537366

mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td

mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp

mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp

mlir/lib/Dialect/NVGPU/IR/NVGPUDialect.cpp

mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir

mlir/test/Conversion/NVGPUToNVVM/typed-pointers.mlir

mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir

mlir/test/Dialect/LLVMIR/invalid-typed-pointers.mlir

mlir/test/Dialect/LLVMIR/invalid.mlir

mlir/test/Dialect/LLVMIR/nvvm-typed-pointers.mlir

mlir/test/Dialect/LLVMIR/nvvm.mlir

mlir/test/Dialect/NVGPU/invalid.mlir

mlir/test/Target/LLVMIR/nvvmir.mlir

[mlir][nvgpu] Implement `nvgpu.device_async_copy` by NVVMToLLVM Pass
ClosedPublic