Diff 470569

clang/lib/Basic/Targets/NVPTX.h

Show First 20 Lines • Show All 171 Lines • ▼ Show 20 Lines	CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
// TODO: We should warn if you apply a non-default CC to anything other than		// TODO: We should warn if you apply a non-default CC to anything other than
// a host function.		// a host function.
if (HostTarget)		if (HostTarget)
return HostTarget->checkCallingConvention(CC);		return HostTarget->checkCallingConvention(CC);
return CCCR_Warning;		return CCCR_Warning;
}		}

bool hasBitIntType() const override { return true; }		bool hasBitIntType() const override { return true; }
		bool hasBFloat16Type() const override { return true; }
		const char *getBFloat16Mangling() const override { return "u6__bf16"; };
};		};
} // namespace targets		} // namespace targets
} // namespace clang		} // namespace clang
#endif // LLVM_CLANG_LIB_BASIC_TARGETS_NVPTX_H		#endif // LLVM_CLANG_LIB_BASIC_TARGETS_NVPTX_H

clang/lib/Basic/Targets/NVPTX.cpp

Show First 20 Lines • Show All 46 Lines • ▼ Show 20 Lines	if (!Feature.startswith("+ptx") \|\|
continue;		continue;
PTXVersion = PTXV; // TODO: should it be max(PTXVersion, PTXV)?		PTXVersion = PTXV; // TODO: should it be max(PTXVersion, PTXV)?
}		}

TLSSupported = false;		TLSSupported = false;
VLASupported = false;		VLASupported = false;
AddrSpaceMap = &NVPTXAddrSpaceMap;		AddrSpaceMap = &NVPTXAddrSpaceMap;
UseAddrSpaceMapMangling = true;		UseAddrSpaceMapMangling = true;
		// __bf16 is always available as a load/store only type.
		BFloat16Width = BFloat16Align = 16;
		BFloat16Format = &llvm::APFloat::BFloat();

// Define available target features		// Define available target features
// These must be defined in sorted order!		// These must be defined in sorted order!
NoAsmVariants = true;		NoAsmVariants = true;
GPU = CudaArch::SM_20;		GPU = CudaArch::SM_20;

if (TargetPointerWidth == 32)		if (TargetPointerWidth == 32)
resetDataLayout("e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");		resetDataLayout("e-p:32:32-i64:64-i128:128-v16:16-v32:32-n16:32:64");
▲ Show 20 Lines • Show All 203 Lines • Show Last 20 Lines

clang/test/CodeGenCUDA/bf16.cu

This file was added.

				// REQUIRES: nvptx-registered-target
				// REQUIRES: x86-registered-target

				// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "nvptx64-nvidia-cuda" \
				// RUN: -fcuda-is-device "-aux-target-cpu" "x86-64" -S -o - %s \| FileCheck %s

				#include "Inputs/cuda.h"

				// __bf16 passed by value and stored through a pointer: the value argument is
				// expected as a .b16 param, read with ld.param.b16 and written with st.b16.
				// CHECK-LABEL: .visible .func _Z8test_argPu6__bf16u6__bf16(
				// CHECK: .param .b64 _Z8test_argPu6__bf16u6__bf16_param_0,
				// CHECK: .param .b16 _Z8test_argPu6__bf16u6__bf16_param_1
				//
				__device__ void test_arg(__bf16 *out, __bf16 in) {
				// CHECK: ld.param.b16 %{{h.*}}, [_Z8test_argPu6__bf16u6__bf16_param_1];
				__bf16 bf16 = in;
				*out = bf16;
				// CHECK: st.b16
				// CHECK: ret;
				}


				// __bf16 returned by value: the return slot is declared as a .b32 param in the
				// function header, while the value itself is written with st.param.b16.
				// CHECK-LABEL: .visible .func (.param .b32 func_retval0) _Z8test_retu6__bf16(
				// CHECK: .param .b16 _Z8test_retu6__bf16_param_0
				__device__ __bf16 test_ret( __bf16 in) {
				// CHECK: ld.param.b16 %h{{.*}}, [_Z8test_retu6__bf16_param_0];
				return in;
				// CHECK: st.param.b16 [func_retval0+0], %h
				// CHECK: ret;
				}

				// __bf16 forwarded through a call to test_ret: the incoming param is loaded,
				// stored to the outgoing param0, and the callee's result is read back from
				// retval0 before being returned. The call sequence is matched line by line,
				// including its closing parenthesis.
				// CHECK-LABEL: .visible .func (.param .b32 func_retval0) _Z9test_callu6__bf16(
				// CHECK: .param .b16 _Z9test_callu6__bf16_param_0
				__device__ __bf16 test_call( __bf16 in) {
				// CHECK: ld.param.b16 %h{{.*}}, [_Z9test_callu6__bf16_param_0];
				// CHECK: st.param.b16 [param0+0], %h2;
				// CHECK: .param .b32 retval0;
				// CHECK: call.uni (retval0),
				// CHECK-NEXT: _Z8test_retu6__bf16,
				// CHECK-NEXT: (
				// CHECK-NEXT: param0
				// CHECK-NEXT: );
				// CHECK: ld.param.b16 %h{{.*}}, [retval0+0];
				return test_ret(in);
				// CHECK: st.param.b16 [func_retval0+0], %h
				// CHECK: ret;
				}

clang/test/SemaCUDA/bf16.cu

This file was added.

				// REQUIRES: nvptx-registered-target
				// REQUIRES: x86-registered-target

				// RUN: %clang_cc1 "-triple" "x86_64-unknown-linux-gnu" "-aux-triple" "nvptx64-nvidia-cuda" \
				// RUN: "-target-cpu" "x86-64" -fsyntax-only -verify=scalar %s
				// RUN: %clang_cc1 "-aux-triple" "x86_64-unknown-linux-gnu" "-triple" "nvptx64-nvidia-cuda" \
				// RUN: -fcuda-is-device "-aux-target-cpu" "x86-64" -fsyntax-only -verify=scalar %s

				#include "Inputs/cuda.h"

				// __bf16 may be declared, copied and stored, but arithmetic on it and mixing it
				// with __fp16 (arithmetic, assignment, conditional operator) must be diagnosed.
				// Both RUN lines above use the same verify prefix, so every expectation below
				// applies to the host-side and the device-side compilation alike.
				__device__ void test(bool b, __bf16 *out, __bf16 in) {
				__bf16 bf16 = in; // No error on using the type itself.

				bf16 + bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
				bf16 - bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
				bf16 * bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}
				bf16 / bf16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__bf16')}}

				__fp16 fp16;

				bf16 + fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
				fp16 + bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
				bf16 - fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
				fp16 - bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
				bf16 * fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
				fp16 * bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
				bf16 / fp16; // scalar-error {{invalid operands to binary expression ('__bf16' and '__fp16')}}
				fp16 / bf16; // scalar-error {{invalid operands to binary expression ('__fp16' and '__bf16')}}
				bf16 = fp16; // scalar-error {{assigning to '__bf16' from incompatible type '__fp16'}}
				fp16 = bf16; // scalar-error {{assigning to '__fp16' from incompatible type '__bf16'}}
				bf16 + (b ? fp16 : bf16); // scalar-error {{incompatible operand types ('__fp16' and '__bf16')}}
				*out = bf16;
				}

llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp

Show First 20 Lines • Show All 1,825 Lines • ▼ Show 20 Lines	if (const auto *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
AggBuffer->addZeros(AllocSize);		AggBuffer->addZeros(AllocSize);
break;		break;
}		}
}		}
llvm_unreachable("unsupported integer const type");		llvm_unreachable("unsupported integer const type");
break;		break;

case Type::HalfTyID:		case Type::HalfTyID:
		case Type::BFloatTyID:
case Type::FloatTyID:		case Type::FloatTyID:
case Type::DoubleTyID:		case Type::DoubleTyID:
AddIntToBuffer(cast<ConstantFP>(CPV)->getValueAPF().bitcastToAPInt());		AddIntToBuffer(cast<ConstantFP>(CPV)->getValueAPF().bitcastToAPInt());
break;		break;

case Type::PointerTyID: {		case Type::PointerTyID: {
if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {		if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
AggBuffer->addSymbol(GVar, GVar);		AggBuffer->addSymbol(GVar, GVar);
▲ Show 20 Lines • Show All 377 Lines • Show Last 20 Lines

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Show First 20 Lines • Show All 817 Lines • ▼ Show 20 Lines	case MVT::i8:
return Opcode_i8;		return Opcode_i8;
case MVT::i16:		case MVT::i16:
return Opcode_i16;		return Opcode_i16;
case MVT::i32:		case MVT::i32:
return Opcode_i32;		return Opcode_i32;
case MVT::i64:		case MVT::i64:
return Opcode_i64;		return Opcode_i64;
case MVT::f16:		case MVT::f16:
		case MVT::bf16:
return Opcode_f16;		return Opcode_f16;
case MVT::v2f16:		case MVT::v2f16:
		case MVT::v2bf16:
return Opcode_f16x2;		return Opcode_f16x2;
case MVT::f32:		case MVT::f32:
return Opcode_f32;		return Opcode_f32;
case MVT::f64:		case MVT::f64:
return Opcode_f64;		return Opcode_f64;
default:		default:
return None;		return None;
}		}
}		}
		jchlandaUnsubmitted Not Done Reply Inline Actions New line here. jchlanda: New line here.

		// Returns the PTXLdStInstCode register-type code used when loading or storing
		// a value of type VT: Unsigned for anything that is not floating point,
		// Untyped for f16/bf16 and their two-element vectors, Float for every other
		// floating-point type.
		static int getLdStRegType(EVT VT) {
		// Non-FP values never reach the switch below.
		if (!VT.isFloatingPoint())
		return NVPTX::PTXLdStInstCode::Unsigned;

		switch (VT.getSimpleVT().SimpleTy) {
		default:
		return NVPTX::PTXLdStInstCode::Float;
		case MVT::f16:
		case MVT::bf16:
		jchlandaUnsubmitted Not Done Reply Inline Actions Nice, this fixes the reg type for `v2f16` from `Float` to `Untyped`, looks like no test picked up on that before. jchlanda: Nice, this fixes the reg type for `v2f16` from `Float` to `Untyped`, looks like no test picked…
		case MVT::v2f16:
		case MVT::v2bf16:
		return NVPTX::PTXLdStInstCode::Untyped;
		}
		}

bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {		bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
SDLoc dl(N);		SDLoc dl(N);
MemSDNode *LD = cast<MemSDNode>(N);		MemSDNode *LD = cast<MemSDNode>(N);
assert(LD->readMem() && "Expected load");		assert(LD->readMem() && "Expected load");
LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);		LoadSDNode *PlainLoad = dyn_cast<LoadSDNode>(N);
EVT LoadedVT = LD->getMemoryVT();		EVT LoadedVT = LD->getMemoryVT();
SDNode *NVPTXLD = nullptr;		SDNode *NVPTXLD = nullptr;

Show All 40 Lines	bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
MVT ScalarVT = SimpleVT.getScalarType();		MVT ScalarVT = SimpleVT.getScalarType();
// Read at least 8 bits (predicates are stored as 8-bit values)		// Read at least 8 bits (predicates are stored as 8-bit values)
unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());		unsigned fromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
unsigned int fromType;		unsigned int fromType;

// Vector Setting		// Vector Setting
unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;		unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
if (SimpleVT.isVector()) {		if (SimpleVT.isVector()) {
assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");		assert((LoadedVT == MVT::v2f16 \|\| LoadedVT == MVT::v2bf16) &&
// v2f16 is loaded using ld.b32		"Unexpected vector type");
		// v2f16/v2bf16 is loaded using ld.b32
fromTypeWidth = 32;		fromTypeWidth = 32;
}		}

if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))		if (PlainLoad && (PlainLoad->getExtensionType() == ISD::SEXTLOAD))
fromType = NVPTX::PTXLdStInstCode::Signed;		fromType = NVPTX::PTXLdStInstCode::Signed;
else if (ScalarVT.isFloatingPoint())
// f16 uses .b16 as its storage type.
fromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
: NVPTX::PTXLdStInstCode::Float;
else		else
fromType = NVPTX::PTXLdStInstCode::Unsigned;		fromType = getLdStRegType(ScalarVT);

// Create the machine instruction DAG		// Create the machine instruction DAG
SDValue Chain = N->getOperand(0);		SDValue Chain = N->getOperand(0);
SDValue N1 = N->getOperand(1);		SDValue N1 = N->getOperand(1);
SDValue Addr;		SDValue Addr;
SDValue Offset, Base;		SDValue Offset, Base;
Optional<unsigned> Opcode;		Optional<unsigned> Opcode;
MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;		MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
▲ Show 20 Lines • Show All 113 Lines • ▼ Show 20 Lines	bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
// Read at least 8 bits (predicates are stored as 8-bit values)		// Read at least 8 bits (predicates are stored as 8-bit values)
unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());		unsigned FromTypeWidth = std::max(8U, (unsigned)ScalarVT.getSizeInBits());
unsigned int FromType;		unsigned int FromType;
// The last operand holds the original LoadSDNode::getExtensionType() value		// The last operand holds the original LoadSDNode::getExtensionType() value
unsigned ExtensionType = cast<ConstantSDNode>(		unsigned ExtensionType = cast<ConstantSDNode>(
N->getOperand(N->getNumOperands() - 1))->getZExtValue();		N->getOperand(N->getNumOperands() - 1))->getZExtValue();
if (ExtensionType == ISD::SEXTLOAD)		if (ExtensionType == ISD::SEXTLOAD)
FromType = NVPTX::PTXLdStInstCode::Signed;		FromType = NVPTX::PTXLdStInstCode::Signed;
else if (ScalarVT.isFloatingPoint())
FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
: NVPTX::PTXLdStInstCode::Float;
else		else
FromType = NVPTX::PTXLdStInstCode::Unsigned;		FromType = getLdStRegType(ScalarVT);

unsigned VecType;		unsigned VecType;

switch (N->getOpcode()) {		switch (N->getOpcode()) {
case NVPTXISD::LoadV2:		case NVPTXISD::LoadV2:
VecType = NVPTX::PTXLdStInstCode::V2;		VecType = NVPTX::PTXLdStInstCode::V2;
break;		break;
case NVPTXISD::LoadV4:		case NVPTXISD::LoadV4:
VecType = NVPTX::PTXLdStInstCode::V4;		VecType = NVPTX::PTXLdStInstCode::V4;
break;		break;
default:		default:
return false;		return false;
}		}

EVT EltVT = N->getValueType(0);		EVT EltVT = N->getValueType(0);

// v8f16 is a special case. PTX doesn't have ld.v8.f16		// v8f16 is a special case. PTX doesn't have ld.v8.f16
// instruction. Instead, we split the vector into v2f16 chunks and		// instruction. Instead, we split the vector into v2f16 chunks and
// load them with ld.v4.b32.		// load them with ld.v4.b32.
if (EltVT == MVT::v2f16) {		if (EltVT == MVT::v2f16 \|\| EltVT == MVT::v2bf16) {
assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");		assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
EltVT = MVT::i32;		EltVT = MVT::i32;
FromType = NVPTX::PTXLdStInstCode::Untyped;		FromType = NVPTX::PTXLdStInstCode::Untyped;
FromTypeWidth = 32;		FromTypeWidth = 32;
}		}

if (SelectDirectAddr(Op1, Addr)) {		if (SelectDirectAddr(Op1, Addr)) {
switch (N->getOpcode()) {		switch (N->getOpcode()) {
▲ Show 20 Lines • Show All 671 Lines • ▼ Show 20 Lines	bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;		unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;

// Type Setting: toType + toTypeWidth		// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'		// - for integer type, always use 'u'
//		//
MVT ScalarVT = SimpleVT.getScalarType();		MVT ScalarVT = SimpleVT.getScalarType();
unsigned toTypeWidth = ScalarVT.getSizeInBits();		unsigned toTypeWidth = ScalarVT.getSizeInBits();
if (SimpleVT.isVector()) {		if (SimpleVT.isVector()) {
assert(StoreVT == MVT::v2f16 && "Unexpected vector type");		assert((StoreVT == MVT::v2f16 \|\| StoreVT == MVT::v2bf16) &&
		"Unexpected vector type");
// v2f16 is stored using st.b32		// v2f16/v2bf16 is stored using st.b32
toTypeWidth = 32;		toTypeWidth = 32;
}		}

unsigned int toType;		unsigned int toType = getLdStRegType(ScalarVT);
if (ScalarVT.isFloatingPoint())
// f16 uses .b16 as its storage type.
toType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
: NVPTX::PTXLdStInstCode::Float;
else
toType = NVPTX::PTXLdStInstCode::Unsigned;

// Create the machine instruction DAG		// Create the machine instruction DAG
SDValue Chain = ST->getChain();		SDValue Chain = ST->getChain();
SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();		SDValue Value = PlainStore ? PlainStore->getValue() : AtomicStore->getVal();
SDValue BasePtr = ST->getBasePtr();		SDValue BasePtr = ST->getBasePtr();
SDValue Addr;		SDValue Addr;
SDValue Offset, Base;		SDValue Offset, Base;
Optional<unsigned> Opcode;		Optional<unsigned> Opcode;
▲ Show 20 Lines • Show All 123 Lines • ▼ Show 20 Lines	if (CodeAddrSpace != NVPTX::PTXLdStInstCode::GLOBAL &&
CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)		CodeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
IsVolatile = false;		IsVolatile = false;

// Type Setting: toType + toTypeWidth		// Type Setting: toType + toTypeWidth
// - for integer type, always use 'u'		// - for integer type, always use 'u'
assert(StoreVT.isSimple() && "Store value is not simple");		assert(StoreVT.isSimple() && "Store value is not simple");
MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();		MVT ScalarVT = StoreVT.getSimpleVT().getScalarType();
unsigned ToTypeWidth = ScalarVT.getSizeInBits();		unsigned ToTypeWidth = ScalarVT.getSizeInBits();
unsigned ToType;		unsigned ToType = getLdStRegType(ScalarVT);
if (ScalarVT.isFloatingPoint())
ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
: NVPTX::PTXLdStInstCode::Float;
else
ToType = NVPTX::PTXLdStInstCode::Unsigned;

SmallVector<SDValue, 12> StOps;		SmallVector<SDValue, 12> StOps;
SDValue N2;		SDValue N2;
unsigned VecType;		unsigned VecType;

switch (N->getOpcode()) {		switch (N->getOpcode()) {
case NVPTXISD::StoreV2:		case NVPTXISD::StoreV2:
VecType = NVPTX::PTXLdStInstCode::V2;		VecType = NVPTX::PTXLdStInstCode::V2;
Show All 11 Lines	case NVPTXISD::StoreV4:
break;		break;
default:		default:
return false;		return false;
}		}

// v8f16 is a special case. PTX doesn't have st.v8.f16		// v8f16 is a special case. PTX doesn't have st.v8.f16
// instruction. Instead, we split the vector into v2f16 chunks and		// instruction. Instead, we split the vector into v2f16 chunks and
// store them with st.v4.b32.		// store them with st.v4.b32.
if (EltVT == MVT::v2f16) {		if (EltVT == MVT::v2f16 \|\| EltVT == MVT::v2bf16) {
assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");		assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
EltVT = MVT::i32;		EltVT = MVT::i32;
ToType = NVPTX::PTXLdStInstCode::Untyped;		ToType = NVPTX::PTXLdStInstCode::Untyped;
ToTypeWidth = 32;		ToTypeWidth = 32;
}		}

StOps.push_back(getI32Imm(IsVolatile, DL));		StOps.push_back(getI32Imm(IsVolatile, DL));
StOps.push_back(getI32Imm(CodeAddrSpace, DL));		StOps.push_back(getI32Imm(CodeAddrSpace, DL));
▲ Show 20 Lines • Show All 1,820 Lines • Show Last 20 Lines

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Show First 20 Lines • Show All 127 Lines • ▼ Show 20 Lines	static bool IsPTXVectorType(MVT VT) {
case MVT::v2i16:		case MVT::v2i16:
case MVT::v4i16:		case MVT::v4i16:
case MVT::v2i32:		case MVT::v2i32:
case MVT::v4i32:		case MVT::v4i32:
case MVT::v2i64:		case MVT::v2i64:
case MVT::v2f16:		case MVT::v2f16:
case MVT::v4f16:		case MVT::v4f16:
case MVT::v8f16: // <4 x f16x2>		case MVT::v8f16: // <4 x f16x2>
		case MVT::v2bf16:
		case MVT::v4bf16:
		case MVT::v8bf16: // <4 x bf16x2>
case MVT::v2f32:		case MVT::v2f32:
case MVT::v4f32:		case MVT::v4f32:
case MVT::v2f64:		case MVT::v2f64:
return true;		return true;
}		}
}		}

/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive		/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
▲ Show 20 Lines • Show All 41 Lines • ▼ Show 20 Lines	for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
// Split vectors into individual elements, except for v2f16, which		// Split vectors into individual elements, except for v2f16, which
// we will pass as a single scalar.		// we will pass as a single scalar.
if (VT.isVector()) {		if (VT.isVector()) {
unsigned NumElts = VT.getVectorNumElements();		unsigned NumElts = VT.getVectorNumElements();
EVT EltVT = VT.getVectorElementType();		EVT EltVT = VT.getVectorElementType();
// Vectors with an even number of f16 elements will be passed to		// Vectors with an even number of f16 elements will be passed to
// us as an array of v2f16 elements. We must match this so we		// us as an array of v2f16 elements. We must match this so we
// stay in sync with Ins/Outs.		// stay in sync with Ins/Outs.
if (EltVT == MVT::f16 && NumElts % 2 == 0) {		if ((EltVT == MVT::f16 \|\| EltVT == MVT::bf16) && NumElts % 2 == 0) {
		// bf16 elements are paired up exactly like f16 ones, but into v2bf16.
EltVT = MVT::v2f16;		EltVT = EltVT == MVT::f16 ? MVT::v2f16 : MVT::v2bf16;
NumElts /= 2;		NumElts /= 2;
}		}
for (unsigned j = 0; j != NumElts; ++j) {		for (unsigned j = 0; j != NumElts; ++j) {
ValueVTs.push_back(EltVT);		ValueVTs.push_back(EltVT);
if (Offsets)		if (Offsets)
Offsets->push_back(Off + j * EltVT.getStoreSize());		Offsets->push_back(Off + j * EltVT.getStoreSize());
}		}
} else {		} else {
▲ Show 20 Lines • Show All 192 Lines • ▼ Show 20 Lines	NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);		addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);		addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);		addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);		addRegisterClass(MVT::i64, &NVPTX::Int64RegsRegClass);
addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);		addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);		addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);		addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);		addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
		addRegisterClass(MVT::bf16, &NVPTX::Float16RegsRegClass);
		addRegisterClass(MVT::v2bf16, &NVPTX::Float16x2RegsRegClass);

// Conversion to/from FP16/FP16x2 is always legal.		// Conversion to/from FP16/FP16x2 is always legal.
setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);		setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);		setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);		setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);		setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);		setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);		setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f16, Expand);
▲ Show 20 Lines • Show All 79 Lines • ▼ Show 20 Lines	for (MVT VT : MVT::integer_valuetypes()) {
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);		setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
setTruncStoreAction(VT, MVT::i1, Expand);		setTruncStoreAction(VT, MVT::i1, Expand);
}		}

// This is legal in NVPTX		// This is legal in NVPTX
setOperationAction(ISD::ConstantFP, MVT::f64, Legal);		setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
setOperationAction(ISD::ConstantFP, MVT::f32, Legal);		setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);		setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
		setOperationAction(ISD::ConstantFP, MVT::bf16, Legal);

// TRAP can be lowered to PTX trap		// TRAP can be lowered to PTX trap
setOperationAction(ISD::TRAP, MVT::Other, Legal);		setOperationAction(ISD::TRAP, MVT::Other, Legal);

// Register custom handling for vector loads/stores		// Register custom handling for vector loads/stores
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {		for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
if (IsPTXVectorType(VT)) {		if (IsPTXVectorType(VT)) {
setOperationAction(ISD::LOAD, VT, Custom);		setOperationAction(ISD::LOAD, VT, Custom);
▲ Show 20 Lines • Show All 1,823 Lines • ▼ Show 20 Lines	if (ValVT.isVector()) {
switch (ValVT.getSimpleVT().SimpleTy) {		switch (ValVT.getSimpleVT().SimpleTy) {
default:		default:
return SDValue();		return SDValue();
case MVT::v2i8:		case MVT::v2i8:
case MVT::v2i16:		case MVT::v2i16:
case MVT::v2i32:		case MVT::v2i32:
case MVT::v2i64:		case MVT::v2i64:
case MVT::v2f16:		case MVT::v2f16:
		case MVT::v2bf16:
case MVT::v2f32:		case MVT::v2f32:
case MVT::v2f64:		case MVT::v2f64:
case MVT::v4i8:		case MVT::v4i8:
case MVT::v4i16:		case MVT::v4i16:
case MVT::v4i32:		case MVT::v4i32:
case MVT::v4f16:		case MVT::v4f16:
		case MVT::v4bf16:
case MVT::v4f32:		case MVT::v4f32:
case MVT::v8f16: // <4 x f16x2>		case MVT::v8f16: // <4 x f16x2>
		case MVT::v8bf16: // <4 x bf16x2>
// This is a "native" vector type		// This is a "native" vector type
break;		break;
}		}

MemSDNode *MemSD = cast<MemSDNode>(N);		MemSDNode *MemSD = cast<MemSDNode>(N);
const DataLayout &TD = DAG.getDataLayout();		const DataLayout &TD = DAG.getDataLayout();

Align Alignment = MemSD->getAlign();		Align Alignment = MemSD->getAlign();
Show All 28 Lines	case 2:
break;		break;
case 4:		case 4:
Opcode = NVPTXISD::StoreV4;		Opcode = NVPTXISD::StoreV4;
break;		break;
case 8:		case 8:
// v8f16 is a special case. PTX doesn't have st.v8.f16		// v8f16 is a special case. PTX doesn't have st.v8.f16
// instruction. Instead, we split the vector into v2f16 chunks and		// instruction. Instead, we split the vector into v2f16 chunks and
// store them with st.v4.b32.		// store them with st.v4.b32.
assert(EltVT == MVT::f16 && "Wrong type for the vector.");		assert((EltVT == MVT::f16 \|\| EltVT == MVT::bf16) &&
		"Wrong type for the vector.");
Opcode = NVPTXISD::StoreV4;		Opcode = NVPTXISD::StoreV4;
StoreF16x2 = true;		StoreF16x2 = true;
break;		break;
}		}

SmallVector<SDValue, 8> Ops;		SmallVector<SDValue, 8> Ops;

// First is the chain		// First is the chain
▲ Show 20 Lines • Show All 2,584 Lines • ▼ Show 20 Lines	case 4: {
EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };		EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
LdResVTs = DAG.getVTList(ListVTs);		LdResVTs = DAG.getVTList(ListVTs);
break;		break;
}		}
case 8: {		case 8: {
// v8f16 is a special case. PTX doesn't have ld.v8.f16		// v8f16 is a special case. PTX doesn't have ld.v8.f16
// instruction. Instead, we split the vector into v2f16 chunks and		// instruction. Instead, we split the vector into v2f16 chunks and
// load them with ld.v4.b32.		// load them with ld.v4.b32.
assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");		assert((EltVT == MVT::f16 \|\| EltVT == MVT::bf16) &&
		"Unsupported v8 vector type.");
LoadF16x2 = true;		LoadF16x2 = true;
Opcode = NVPTXISD::LoadV4;		Opcode = NVPTXISD::LoadV4;
EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,		EVT VVT = (EltVT == MVT::f16) ? MVT::v2f16 : MVT::v2bf16;
MVT::Other};		EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
LdResVTs = DAG.getVTList(ListVTs);		LdResVTs = DAG.getVTList(ListVTs);
break;		break;
}		}
}		}

// Copy regular operands		// Copy regular operands
SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());		SmallVector<SDValue, 8> OtherOps(N->op_begin(), N->op_end());

▲ Show 20 Lines • Show All 258 Lines • Show Last 20 Lines

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Show First 20 Lines • Show All 166 Lines • ▼ Show 20 Lines

// non-sync shfl instructions are not available on sm_70+ in PTX6.4+		// non-sync shfl instructions are not available on sm_70+ in PTX6.4+
def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"		def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70"
"&& Subtarget->getPTXVersion() >= 64)">;		"&& Subtarget->getPTXVersion() >= 64)">;

def useShortPtr : Predicate<"useShortPointers()">;		def useShortPtr : Predicate<"useShortPointers()">;
def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;		def useFP16Math: Predicate<"Subtarget->allowFP16Math()">;

		// Helper class to aid conversion between ValueType and a matching RegisterClass.

		// Looks up the NVPTX register class for ValueType T. The result is exposed as
		// the `ret` field and is chosen by comparing T's name against the keys below.
		class ValueToRegClass<ValueType T> {
		// Name of T as a string; this is the lookup key for the table in `ret`.
		string name = !cast<string>(T);
		// Register class associated with `name`.
		NVPTXRegClass ret = !cond(
		!eq(name, "i1"): Int1Regs,
		!eq(name, "i16"): Int16Regs,
		!eq(name, "i32"): Int32Regs,
		!eq(name, "i64"): Int64Regs,
		!eq(name, "f16"): Float16Regs,
		!eq(name, "v2f16"): Float16x2Regs,
		// bf16 and v2bf16 reuse the same register classes as f16 and v2f16.
		!eq(name, "bf16"): Float16Regs,
		AllenUnsubmitted Not Done Reply Inline Actions sorry for a basic question: what's the different between bf16 and f16 ? Allen: sorry for a basic question: what's the different between bf16 and f16 ?
		traAuthorUnsubmitted Done Reply Inline Actions https://en.wikipedia.org/wiki/Bfloat16_floating-point_format tra: https://en.wikipedia.org/wiki/Bfloat16_floating-point_format
		traAuthorUnsubmitted Done Reply Inline Actions If your question is why both bf16 and f16 use Float16Regs, then the answer is that both use 'untyped' 16-bit integer registers. The difference from Int16Regs is that those are signed. PTX has some awkward restrictions on matching instructions and register kinds, even though under the hood it all boils down to everything using 32-bit registers. tra: If your question is why both bf16 and f16 use Float16Regs, then the answer is that both use…
		AllenUnsubmitted Done Reply Inline Actions Thanks for your explanation. Allen: Thanks for your explanation.
		!eq(name, "v2bf16"): Float16x2Regs,
		!eq(name, "f32"): Float32Regs,
		!eq(name, "f64"): Float64Regs,
		!eq(name, "ai32"): Int32ArgRegs,
		!eq(name, "ai64"): Int64ArgRegs,
		!eq(name, "af32"): Float32ArgRegs,
		// NOTE(review): the three keys above follow an "a<type>" pattern; "if64"
		// does not - possibly meant "af64". Confirm before relying on this entry.
		!eq(name, "if64"): Float64ArgRegs,
		);
		}



//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Some Common Instruction Class Templates		// Some Common Instruction Class Templates
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

// Template for instructions which take three int64, int32, or int16 args.		// Template for instructions which take three int64, int32, or int16 args.
// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").		// The instructions are named "<OpcStr><Width>" (e.g. "add.s64").
multiclass I3<string OpcStr, SDNode OpNode> {		multiclass I3<string OpcStr, SDNode OpNode> {
def i64rr :		def i64rr :
▲ Show 20 Lines • Show All 89 Lines • ▼ Show 20 Lines	NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),		(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;		[(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>;

def f16rr_ftz :		def f16rr_ftz :
NVPTXInst<(outs Float16Regs:$dst),		NVPTXInst<(outs Float16Regs:$dst),
(ins Float16Regs:$a, Float16Regs:$b),		(ins Float16Regs:$a, Float16Regs:$b),
!strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
[(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,		[(set Float16Regs:$dst, (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
Requires<[useFP16Math, doF32FTZ]>;		Requires<[useFP16Math, doF32FTZ]>;
def f16rr :		def f16rr :
NVPTXInst<(outs Float16Regs:$dst),		NVPTXInst<(outs Float16Regs:$dst),
(ins Float16Regs:$a, Float16Regs:$b),		(ins Float16Regs:$a, Float16Regs:$b),
!strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
[(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,		[(set Float16Regs:$dst, (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
Requires<[useFP16Math]>;		Requires<[useFP16Math]>;

def f16x2rr_ftz :		def f16x2rr_ftz :
NVPTXInst<(outs Float16x2Regs:$dst),		NVPTXInst<(outs Float16x2Regs:$dst),
(ins Float16x2Regs:$a, Float16x2Regs:$b),		(ins Float16x2Regs:$a, Float16x2Regs:$b),
!strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
[(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,		[(set Float16x2Regs:$dst, (OpNode (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>,
Requires<[useFP16Math, doF32FTZ]>;		Requires<[useFP16Math, doF32FTZ]>;
def f16x2rr :		def f16x2rr :
NVPTXInst<(outs Float16x2Regs:$dst),		NVPTXInst<(outs Float16x2Regs:$dst),
(ins Float16x2Regs:$a, Float16x2Regs:$b),		(ins Float16x2Regs:$a, Float16x2Regs:$b),
!strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
[(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,		[(set Float16x2Regs:$dst, (OpNode (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>,
Requires<[useFP16Math]>;		Requires<[useFP16Math]>;
}		}

// Template for instructions which take three FP args. The		// Template for instructions which take three FP args. The
// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").		// instructions are named "<OpcStr>.f<Width>" (e.g. "add.f64").
//		//
// Also defines ftz (flush subnormal inputs and results to sign-preserving		// Also defines ftz (flush subnormal inputs and results to sign-preserving
// zero) variants for fp32/fp16 functions.		// zero) variants for fp32/fp16 functions.
Show All 38 Lines	NVPTXInst<(outs Float32Regs:$dst),
!strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,		[(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
Requires<[allowFMA]>;		Requires<[allowFMA]>;

def f16rr_ftz :		def f16rr_ftz :
NVPTXInst<(outs Float16Regs:$dst),		NVPTXInst<(outs Float16Regs:$dst),
(ins Float16Regs:$a, Float16Regs:$b),		(ins Float16Regs:$a, Float16Regs:$b),
!strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".ftz.f16 \t$dst, $a, $b;"),
[(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,		[(set Float16Regs:$dst, (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
Requires<[useFP16Math, allowFMA, doF32FTZ]>;		Requires<[useFP16Math, allowFMA, doF32FTZ]>;
def f16rr :		def f16rr :
NVPTXInst<(outs Float16Regs:$dst),		NVPTXInst<(outs Float16Regs:$dst),
(ins Float16Regs:$a, Float16Regs:$b),		(ins Float16Regs:$a, Float16Regs:$b),
!strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".f16 \t$dst, $a, $b;"),
[(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,		[(set Float16Regs:$dst, (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
Requires<[useFP16Math, allowFMA]>;		Requires<[useFP16Math, allowFMA]>;

def f16x2rr_ftz :		def f16x2rr_ftz :
NVPTXInst<(outs Float16x2Regs:$dst),		NVPTXInst<(outs Float16x2Regs:$dst),
(ins Float16x2Regs:$a, Float16x2Regs:$b),		(ins Float16x2Regs:$a, Float16x2Regs:$b),
!strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
[(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,		[(set (v2f16 Float16x2Regs:$dst), (OpNode (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>,
Requires<[useFP16Math, allowFMA, doF32FTZ]>;		Requires<[useFP16Math, allowFMA, doF32FTZ]>;
def f16x2rr :		def f16x2rr :
NVPTXInst<(outs Float16x2Regs:$dst),		NVPTXInst<(outs Float16x2Regs:$dst),
(ins Float16x2Regs:$a, Float16x2Regs:$b),		(ins Float16x2Regs:$a, Float16x2Regs:$b),
!strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
[(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,		[(set Float16x2Regs:$dst, (OpNode (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>,
Requires<[useFP16Math, allowFMA]>;		Requires<[useFP16Math, allowFMA]>;

// These have strange names so we don't perturb existing mir tests.		// These have strange names so we don't perturb existing mir tests.
def _rnf64rr :		def _rnf64rr :
NVPTXInst<(outs Float64Regs:$dst),		NVPTXInst<(outs Float64Regs:$dst),
(ins Float64Regs:$a, Float64Regs:$b),		(ins Float64Regs:$a, Float64Regs:$b),
!strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".rn.f64 \t$dst, $a, $b;"),
[(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,		[(set Float64Regs:$dst, (OpNode Float64Regs:$a, Float64Regs:$b))]>,
Show All 27 Lines	NVPTXInst<(outs Float32Regs:$dst),
(ins Float32Regs:$a, f32imm:$b),		(ins Float32Regs:$a, f32imm:$b),
!strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".rn.f32 \t$dst, $a, $b;"),
[(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,		[(set Float32Regs:$dst, (OpNode Float32Regs:$a, fpimm:$b))]>,
Requires<[noFMA]>;		Requires<[noFMA]>;
def _rnf16rr_ftz :		def _rnf16rr_ftz :
NVPTXInst<(outs Float16Regs:$dst),		NVPTXInst<(outs Float16Regs:$dst),
(ins Float16Regs:$a, Float16Regs:$b),		(ins Float16Regs:$a, Float16Regs:$b),
!strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".rn.ftz.f16 \t$dst, $a, $b;"),
[(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,		[(set Float16Regs:$dst, (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
Requires<[useFP16Math, noFMA, doF32FTZ]>;		Requires<[useFP16Math, noFMA, doF32FTZ]>;
def _rnf16rr :		def _rnf16rr :
NVPTXInst<(outs Float16Regs:$dst),		NVPTXInst<(outs Float16Regs:$dst),
(ins Float16Regs:$a, Float16Regs:$b),		(ins Float16Regs:$a, Float16Regs:$b),
!strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
[(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,		[(set Float16Regs:$dst, (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>,
Requires<[useFP16Math, noFMA]>;		Requires<[useFP16Math, noFMA]>;
def _rnf16x2rr_ftz :		def _rnf16x2rr_ftz :
NVPTXInst<(outs Float16x2Regs:$dst),		NVPTXInst<(outs Float16x2Regs:$dst),
(ins Float16x2Regs:$a, Float16x2Regs:$b),		(ins Float16x2Regs:$a, Float16x2Regs:$b),
!strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
[(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,		[(set Float16x2Regs:$dst, (OpNode (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>,
Requires<[useFP16Math, noFMA, doF32FTZ]>;		Requires<[useFP16Math, noFMA, doF32FTZ]>;
def _rnf16x2rr :		def _rnf16x2rr :
NVPTXInst<(outs Float16x2Regs:$dst),		NVPTXInst<(outs Float16x2Regs:$dst),
(ins Float16x2Regs:$a, Float16x2Regs:$b),		(ins Float16x2Regs:$a, Float16x2Regs:$b),
!strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),		!strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
[(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,		[(set Float16x2Regs:$dst, (OpNode (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>,
Requires<[useFP16Math, noFMA]>;		Requires<[useFP16Math, noFMA]>;
}		}

// Template for operations which take two f32 or f64 operands. Provides three		// Template for operations which take two f32 or f64 operands. Provides three
// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush		// instructions: <OpcStr>.f64, <OpcStr>.f32, and <OpcStr>.ftz.f32 (flush
// subnormal inputs and results to zero).		// subnormal inputs and results to zero).
multiclass F2<string OpcStr, SDNode OpNode> {		multiclass F2<string OpcStr, SDNode OpNode> {
def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),		def f64 : NVPTXInst<(outs Float64Regs:$dst), (ins Float64Regs:$a),
▲ Show 20 Lines • Show All 475 Lines • ▼ Show 20 Lines

defm FABS : F2<"abs", fabs>;		defm FABS : F2<"abs", fabs>;
defm FNEG : F2<"neg", fneg>;		defm FNEG : F2<"neg", fneg>;
defm FSQRT : F2<"sqrt.rn", fsqrt>;		defm FSQRT : F2<"sqrt.rn", fsqrt>;

//		//
// F16 NEG		// F16 NEG
//		//
class FNEG_F16_F16X2<string OpcStr, RegisterClass RC, Predicate Pred> :		class FNEG_F16_F16X2<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> :
NVPTXInst<(outs RC:$dst), (ins RC:$src),		NVPTXInst<(outs RC:$dst), (ins RC:$src),
!strconcat(OpcStr, " \t$dst, $src;"),		!strconcat(OpcStr, " \t$dst, $src;"),
[(set RC:$dst, (fneg RC:$src))]>,		[(set RC:$dst, (fneg (T RC:$src)))]>,
Requires<[useFP16Math, hasPTX60, hasSM53, Pred]>;		Requires<[useFP16Math, hasPTX60, hasSM53, Pred]>;
def FNEG16_ftz : FNEG_F16_F16X2<"neg.ftz.f16", Float16Regs, doF32FTZ>;		def FNEG16_ftz : FNEG_F16_F16X2<"neg.ftz.f16", f16, Float16Regs, doF32FTZ>;
def FNEG16 : FNEG_F16_F16X2<"neg.f16", Float16Regs, True>;		def FNEG16 : FNEG_F16_F16X2<"neg.f16", f16, Float16Regs, True>;
def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", Float16x2Regs, doF32FTZ>;		def FNEG16x2_ftz : FNEG_F16_F16X2<"neg.ftz.f16x2", v2f16, Float16x2Regs, doF32FTZ>;
def FNEG16x2 : FNEG_F16_F16X2<"neg.f16x2", Float16x2Regs, True>;		def FNEG16x2 : FNEG_F16_F16X2<"neg.f16x2", v2f16, Float16x2Regs, True>;

//		//
// F64 division		// F64 division
//		//
def FDIV641r :		def FDIV641r :
NVPTXInst<(outs Float64Regs:$dst),		NVPTXInst<(outs Float64Regs:$dst),
(ins f64imm:$a, Float64Regs:$b),		(ins f64imm:$a, Float64Regs:$b),
"rcp.rn.f64 \t$dst, $b;",		"rcp.rn.f64 \t$dst, $b;",
▲ Show 20 Lines • Show All 156 Lines • ▼ Show 20 Lines	def rir : NVPTXInst<(outs RC:$dst),
Requires<[Pred]>;		Requires<[Pred]>;
def rii : NVPTXInst<(outs RC:$dst),		def rii : NVPTXInst<(outs RC:$dst),
(ins RC:$a, ImmCls:$b, ImmCls:$c),		(ins RC:$a, ImmCls:$b, ImmCls:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),		!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
[(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,		[(set RC:$dst, (fma RC:$a, fpimm:$b, fpimm:$c))]>,
Requires<[Pred]>;		Requires<[Pred]>;
}		}

multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {		multiclass FMA_F16<string OpcStr, ValueType T, RegisterClass RC, Predicate Pred> {
def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),		def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),		!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
[(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,		[(set RC:$dst, (fma (T RC:$a), (T RC:$b), (T RC:$c)))]>,
Requires<[useFP16Math, Pred]>;		Requires<[useFP16Math, Pred]>;
}		}

defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;		defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", f16, Float16Regs, doF32FTZ>;
defm FMA16 : FMA_F16<"fma.rn.f16", Float16Regs, True>;		defm FMA16 : FMA_F16<"fma.rn.f16", f16, Float16Regs, True>;
defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;		defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", v2f16, Float16x2Regs, doF32FTZ>;
defm FMA16x2 : FMA_F16<"fma.rn.f16x2", Float16x2Regs, True>;		defm FMA16x2 : FMA_F16<"fma.rn.f16x2", v2f16, Float16x2Regs, True>;
defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;		defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;		defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, True>;
defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;		defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, True>;

// sin/cos		// sin/cos
def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),		def SINF: NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$src),
"sin.approx.f32 \t$dst, $src;",		"sin.approx.f32 \t$dst, $src;",
[(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,		[(set Float32Regs:$dst, (fsin Float32Regs:$src))]>,
▲ Show 20 Lines • Show All 437 Lines • ▼ Show 20 Lines	multiclass SELP<string TypeStr, RegisterClass RC, Operand ImmCls> {
def ir : NVPTXInst<(outs RC:$dst),		def ir : NVPTXInst<(outs RC:$dst),
(ins ImmCls:$a, RC:$b, Int1Regs:$p),		(ins ImmCls:$a, RC:$b, Int1Regs:$p),
!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;		!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
def ii : NVPTXInst<(outs RC:$dst),		def ii : NVPTXInst<(outs RC:$dst),
(ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),		(ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;		!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"), []>;
}		}

multiclass SELP_PATTERN<string TypeStr, RegisterClass RC, Operand ImmCls,		multiclass SELP_PATTERN<string TypeStr, ValueType T, RegisterClass RC,
SDNode ImmNode> {		Operand ImmCls, SDNode ImmNode> {
def rr :		def rr :
NVPTXInst<(outs RC:$dst),		NVPTXInst<(outs RC:$dst),
(ins RC:$a, RC:$b, Int1Regs:$p),		(ins RC:$a, RC:$b, Int1Regs:$p),
!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),		!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
[(set RC:$dst, (select Int1Regs:$p, RC:$a, RC:$b))]>;		[(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T RC:$b)))]>;
def ri :		def ri :
NVPTXInst<(outs RC:$dst),		NVPTXInst<(outs RC:$dst),
(ins RC:$a, ImmCls:$b, Int1Regs:$p),		(ins RC:$a, ImmCls:$b, Int1Regs:$p),
!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),		!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
[(set RC:$dst, (select Int1Regs:$p, RC:$a, ImmNode:$b))]>;		[(set (T RC:$dst), (select Int1Regs:$p, (T RC:$a), (T ImmNode:$b)))]>;
def ir :		def ir :
NVPTXInst<(outs RC:$dst),		NVPTXInst<(outs RC:$dst),
(ins ImmCls:$a, RC:$b, Int1Regs:$p),		(ins ImmCls:$a, RC:$b, Int1Regs:$p),
!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),		!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
[(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, RC:$b))]>;		[(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, (T RC:$b)))]>;
def ii :		def ii :
NVPTXInst<(outs RC:$dst),		NVPTXInst<(outs RC:$dst),
(ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),		(ins ImmCls:$a, ImmCls:$b, Int1Regs:$p),
!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),		!strconcat("selp.", TypeStr, " \t$dst, $a, $b, $p;"),
[(set RC:$dst, (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;		[(set (T RC:$dst), (select Int1Regs:$p, ImmNode:$a, ImmNode:$b))]>;
}		}
}		}

// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as		// Don't pattern match on selp.{s,u}{16,32,64} -- selp.b{16,32,64} is just as
// good.		// good.
defm SELP_b16 : SELP_PATTERN<"b16", Int16Regs, i16imm, imm>;		defm SELP_b16 : SELP_PATTERN<"b16", i16, Int16Regs, i16imm, imm>;
defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;		defm SELP_s16 : SELP<"s16", Int16Regs, i16imm>;
defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;		defm SELP_u16 : SELP<"u16", Int16Regs, i16imm>;
defm SELP_b32 : SELP_PATTERN<"b32", Int32Regs, i32imm, imm>;		defm SELP_b32 : SELP_PATTERN<"b32", i32, Int32Regs, i32imm, imm>;
defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;		defm SELP_s32 : SELP<"s32", Int32Regs, i32imm>;
defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;		defm SELP_u32 : SELP<"u32", Int32Regs, i32imm>;
defm SELP_b64 : SELP_PATTERN<"b64", Int64Regs, i64imm, imm>;		defm SELP_b64 : SELP_PATTERN<"b64", i64, Int64Regs, i64imm, imm>;
defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;		defm SELP_s64 : SELP<"s64", Int64Regs, i64imm>;
defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;		defm SELP_u64 : SELP<"u64", Int64Regs, i64imm>;
defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>;		defm SELP_f16 : SELP_PATTERN<"b16", f16, Float16Regs, f16imm, fpimm>;
defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;		defm SELP_f32 : SELP_PATTERN<"f32", f32, Float32Regs, f32imm, fpimm>;
		defm SELP_f64 : SELP_PATTERN<"f64", f64, Float64Regs, f64imm, fpimm>;

		// This does not work as tablegen fails to infer the type of 'imm'.
		//def v2f16imm : Operand<v2f16>;
		//defm SELP_f16x2 : SELP_PATTERN<"b32", v2f16, Float16x2Regs, v2f16imm, imm>;

def SELP_f16x2rr :		def SELP_f16x2rr :
NVPTXInst<(outs Float16x2Regs:$dst),		NVPTXInst<(outs Float16x2Regs:$dst),
(ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),		(ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
"selp.b32 \t$dst, $a, $b, $p;",		"selp.b32 \t$dst, $a, $b, $p;",
[(set Float16x2Regs:$dst,		[(set Float16x2Regs:$dst,
(select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>;		(select Int1Regs:$p, (v2f16 Float16x2Regs:$a), (v2f16 Float16x2Regs:$b)))]>;

//-----------------------------------		//-----------------------------------
// Data Movement (Load / Store, Move)		// Data Movement (Load / Store, Move)
//-----------------------------------		//-----------------------------------

def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],		def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex],
[SDNPWantRoot]>;		[SDNPWantRoot]>;
def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],		def ADDRri64 : ComplexPattern<i64, 2, "SelectADDRri64", [frameindex],
▲ Show 20 Lines • Show All 216 Lines • ▼ Show 20 Lines	def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
(SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;		(SELP_u32ii -1, 0, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;
def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),		def : Pat<(i32 (setne Int1Regs:$a, Int1Regs:$b)),
(SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;		(SELP_u32ii 0, -1, (XORb1rr Int1Regs:$a, Int1Regs:$b))>;



multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {		multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
// f16 -> pred		// f16 -> pred
def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),		def : Pat<(i1 (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b))),
(SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,		(SETP_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
Requires<[useFP16Math,doF32FTZ]>;		Requires<[useFP16Math,doF32FTZ]>;
def : Pat<(i1 (OpNode Float16Regs:$a, Float16Regs:$b)),		def : Pat<(i1 (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b))),
(SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,		(SETP_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
Requires<[useFP16Math]>;		Requires<[useFP16Math]>;
def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),		def : Pat<(i1 (OpNode (f16 Float16Regs:$a), fpimm:$b)),
(SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,		(SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
Requires<[useFP16Math,doF32FTZ]>;		Requires<[useFP16Math,doF32FTZ]>;
def : Pat<(i1 (OpNode Float16Regs:$a, fpimm:$b)),		def : Pat<(i1 (OpNode (f16 Float16Regs:$a), fpimm:$b)),
(SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,		(SETP_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
Requires<[useFP16Math]>;		Requires<[useFP16Math]>;
def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),		def : Pat<(i1 (OpNode fpimm:$a, (f16 Float16Regs:$b))),
(SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,		(SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
Requires<[useFP16Math,doF32FTZ]>;		Requires<[useFP16Math,doF32FTZ]>;
def : Pat<(i1 (OpNode fpimm:$a, Float16Regs:$b)),		def : Pat<(i1 (OpNode fpimm:$a, (f16 Float16Regs:$b))),
(SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,		(SETP_f16rr (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
Requires<[useFP16Math]>;		Requires<[useFP16Math]>;

// f32 -> pred		// f32 -> pred
def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),		def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
(SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,		(SETP_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
Requires<[doF32FTZ]>;		Requires<[doF32FTZ]>;
def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),		def : Pat<(i1 (OpNode Float32Regs:$a, Float32Regs:$b)),
Show All 13 Lines	multiclass FSET_FORMAT<PatFrag OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),		def : Pat<(i1 (OpNode Float64Regs:$a, Float64Regs:$b)),
(SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;		(SETP_f64rr Float64Regs:$a, Float64Regs:$b, Mode)>;
def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),		def : Pat<(i1 (OpNode Float64Regs:$a, fpimm:$b)),
(SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;		(SETP_f64ri Float64Regs:$a, fpimm:$b, Mode)>;
def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),		def : Pat<(i1 (OpNode fpimm:$a, Float64Regs:$b)),
(SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;		(SETP_f64ir fpimm:$a, Float64Regs:$b, Mode)>;

// f16 -> i32		// f16 -> i32
def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),		def : Pat<(i32 (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b))),
(SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,		(SET_f16rr Float16Regs:$a, Float16Regs:$b, ModeFTZ)>,
Requires<[useFP16Math, doF32FTZ]>;		Requires<[useFP16Math, doF32FTZ]>;
def : Pat<(i32 (OpNode Float16Regs:$a, Float16Regs:$b)),		def : Pat<(i32 (OpNode (f16 Float16Regs:$a), (f16 Float16Regs:$b))),
(SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,		(SET_f16rr Float16Regs:$a, Float16Regs:$b, Mode)>,
Requires<[useFP16Math]>;		Requires<[useFP16Math]>;
def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),		def : Pat<(i32 (OpNode (f16 Float16Regs:$a), fpimm:$b)),
(SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,		(SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), ModeFTZ)>,
Requires<[useFP16Math, doF32FTZ]>;		Requires<[useFP16Math, doF32FTZ]>;
def : Pat<(i32 (OpNode Float16Regs:$a, fpimm:$b)),		def : Pat<(i32 (OpNode (f16 Float16Regs:$a), fpimm:$b)),
(SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,		(SET_f16rr Float16Regs:$a, (LOAD_CONST_F16 fpimm:$b), Mode)>,
Requires<[useFP16Math]>;		Requires<[useFP16Math]>;
def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),		def : Pat<(i32 (OpNode fpimm:$a, (f16 Float16Regs:$b))),
(SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,		(SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, ModeFTZ)>,
Requires<[useFP16Math, doF32FTZ]>;		Requires<[useFP16Math, doF32FTZ]>;
def : Pat<(i32 (OpNode fpimm:$a, Float16Regs:$b)),		def : Pat<(i32 (OpNode fpimm:$a, (f16 Float16Regs:$b))),
(SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,		(SET_f16ir (LOAD_CONST_F16 fpimm:$a), Float16Regs:$b, Mode)>,
Requires<[useFP16Math]>;		Requires<[useFP16Math]>;

// f32 -> i32		// f32 -> i32
def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),		def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
(SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,		(SET_f32rr Float32Regs:$a, Float32Regs:$b, ModeFTZ)>,
Requires<[doF32FTZ]>;		Requires<[doF32FTZ]>;
def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),		def : Pat<(i32 (OpNode Float32Regs:$a, Float32Regs:$b)),
▲ Show 20 Lines • Show All 405 Lines • ▼ Show 20 Lines	def DeclareScalarParamInst :
NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),		NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
".param .b$size param$a;",		".param .b$size param$a;",
[(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;		[(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 0))]>;
def DeclareScalarRegInst :		def DeclareScalarRegInst :
NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),		NVPTXInst<(outs), (ins i32imm:$a, i32imm:$size),
".reg .b$size param$a;",		".reg .b$size param$a;",
[(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;		[(DeclareScalarParam (i32 imm:$a), (i32 imm:$size), (i32 1))]>;

class MoveParamInst<NVPTXRegClass regclass, string asmstr> :		class MoveParamInst<ValueType T, NVPTXRegClass regclass, string asmstr> :
NVPTXInst<(outs regclass:$dst), (ins regclass:$src),		NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
!strconcat("mov", asmstr, " \t$dst, $src;"),		!strconcat("mov", asmstr, " \t$dst, $src;"),
[(set regclass:$dst, (MoveParam regclass:$src))]>;		[(set (T regclass:$dst), (MoveParam (T regclass:$src)))]>;

class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty,		class MoveParamSymbolInst<NVPTXRegClass regclass, Operand srcty,
string asmstr> :		string asmstr> :
NVPTXInst<(outs regclass:$dst), (ins srcty:$src),		NVPTXInst<(outs regclass:$dst), (ins srcty:$src),
!strconcat("mov", asmstr, " \t$dst, $src;"),		!strconcat("mov", asmstr, " \t$dst, $src;"),
[(set regclass:$dst, (MoveParam texternalsym:$src))]>;		[(set regclass:$dst, (MoveParam texternalsym:$src))]>;

def MoveParamI64 : MoveParamInst<Int64Regs, ".b64">;		def MoveParamI64 : MoveParamInst<i64, Int64Regs, ".b64">;
def MoveParamI32 : MoveParamInst<Int32Regs, ".b32">;		def MoveParamI32 : MoveParamInst<i32, Int32Regs, ".b32">;

def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, ".b64">;		def MoveParamSymbolI64 : MoveParamSymbolInst<Int64Regs, i64imm, ".b64">;
def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, ".b32">;		def MoveParamSymbolI32 : MoveParamSymbolInst<Int32Regs, i32imm, ".b32">;

def MoveParamI16 :		def MoveParamI16 :
NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),		NVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src),
"cvt.u16.u32 \t$dst, $src;",		"cvt.u16.u32 \t$dst, $src;",
[(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;		[(set Int16Regs:$dst, (MoveParam Int16Regs:$src))]>;
def MoveParamF64 : MoveParamInst<Float64Regs, ".f64">;		def MoveParamF64 : MoveParamInst<f64, Float64Regs, ".f64">;
def MoveParamF32 : MoveParamInst<Float32Regs, ".f32">;		def MoveParamF32 : MoveParamInst<f32, Float32Regs, ".f32">;
def MoveParamF16 : MoveParamInst<Float16Regs, ".f16">;		def MoveParamF16 : MoveParamInst<f16, Float16Regs, ".f16">;

class PseudoUseParamInst<NVPTXRegClass regclass> :		class PseudoUseParamInst<NVPTXRegClass regclass> :
NVPTXInst<(outs), (ins regclass:$src),		NVPTXInst<(outs), (ins regclass:$src),
"// Pseudo use of $src",		"// Pseudo use of $src",
[(PseudoUseParam regclass:$src)]>;		[(PseudoUseParam regclass:$src)]>;

def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;		def PseudoUseParamI64 : PseudoUseParamInst<Int64Regs>;
def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;		def PseudoUseParamI32 : PseudoUseParamInst<Int32Regs>;
def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;		def PseudoUseParamI16 : PseudoUseParamInst<Int16Regs>;
def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;		def PseudoUseParamF64 : PseudoUseParamInst<Float64Regs>;
def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;		def PseudoUseParamF32 : PseudoUseParamInst<Float32Regs>;

class ProxyRegInst<string SzStr, NVPTXRegClass regclass> :		class ProxyRegInst<string SzStr, ValueType T, NVPTXRegClass regclass> :
NVPTXInst<(outs regclass:$dst), (ins regclass:$src),		NVPTXInst<(outs regclass:$dst), (ins regclass:$src),
!strconcat("mov.", SzStr, " \t$dst, $src;"),		!strconcat("mov.", SzStr, " \t$dst, $src;"),
[(set regclass:$dst, (ProxyReg regclass:$src))]>;		[(set (T regclass:$dst), (ProxyReg (T regclass:$src)))]>;

let isCodeGenOnly=1, isPseudo=1 in {		let isCodeGenOnly=1, isPseudo=1 in {
def ProxyRegI1 : ProxyRegInst<"pred", Int1Regs>;		def ProxyRegI1 : ProxyRegInst<"pred", i1, Int1Regs>;
def ProxyRegI16 : ProxyRegInst<"b16", Int16Regs>;		def ProxyRegI16 : ProxyRegInst<"b16", i16, Int16Regs>;
def ProxyRegI32 : ProxyRegInst<"b32", Int32Regs>;		def ProxyRegI32 : ProxyRegInst<"b32", i32, Int32Regs>;
def ProxyRegI64 : ProxyRegInst<"b64", Int64Regs>;		def ProxyRegI64 : ProxyRegInst<"b64", i64, Int64Regs>;
def ProxyRegF16 : ProxyRegInst<"b16", Float16Regs>;		def ProxyRegF16 : ProxyRegInst<"b16", f16, Float16Regs>;
def ProxyRegF32 : ProxyRegInst<"f32", Float32Regs>;		def ProxyRegBF16 : ProxyRegInst<"b16", bf16, Float16Regs>;
def ProxyRegF64 : ProxyRegInst<"f64", Float64Regs>;		def ProxyRegF32 : ProxyRegInst<"f32", f32, Float32Regs>;
def ProxyRegF16x2 : ProxyRegInst<"b32", Float16x2Regs>;		def ProxyRegF64 : ProxyRegInst<"f64", f64, Float64Regs>;
		def ProxyRegF16x2 : ProxyRegInst<"b32", v2f16, Float16x2Regs>;
		def ProxyRegBF16x2 : ProxyRegInst<"b32", v2bf16, Float16x2Regs>;
}		}

//		//
// Load / Store Handling		// Load / Store Handling
//		//
multiclass LD<NVPTXRegClass regclass> {		multiclass LD<NVPTXRegClass regclass> {
def _avar : NVPTXInst<		def _avar : NVPTXInst<
(outs regclass:$dst),		(outs regclass:$dst),
▲ Show 20 Lines • Show All 274 Lines • ▼ Show 20 Lines	let mayStore=1, hasSideEffects=0 in {
defm STV_f16 : ST_VEC<Float16Regs>;		defm STV_f16 : ST_VEC<Float16Regs>;
defm STV_f16x2 : ST_VEC<Float16x2Regs>;		defm STV_f16x2 : ST_VEC<Float16x2Regs>;
defm STV_f32 : ST_VEC<Float32Regs>;		defm STV_f32 : ST_VEC<Float32Regs>;
defm STV_f64 : ST_VEC<Float64Regs>;		defm STV_f64 : ST_VEC<Float64Regs>;
}		}

//---- Conversion ----		//---- Conversion ----

class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,		class F_BITCONVERT<string SzStr, ValueType TIn, ValueType TOut,
NVPTXRegClass regclassOut> :		NVPTXRegClass regclassIn = ValueToRegClass<TIn>.ret,
		NVPTXRegClass regclassOut = ValueToRegClass<TOut>.ret> :
NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),		NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
!strconcat("mov.b", SzStr, " \t$d, $a;"),		!strconcat("mov.b", SzStr, " \t$d, $a;"),
[(set regclassOut:$d, (bitconvert regclassIn:$a))]>;		[(set (TOut regclassOut:$d), (bitconvert (TIn regclassIn:$a)))]>;

def BITCONVERT_16_I2F : F_BITCONVERT<"16", Int16Regs, Float16Regs>;		def BITCONVERT_16_I2F : F_BITCONVERT<"16", i16, f16>;
def BITCONVERT_16_F2I : F_BITCONVERT<"16", Float16Regs, Int16Regs>;		def BITCONVERT_16_F2I : F_BITCONVERT<"16", f16, i16>;
def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;		def BITCONVERT_16_I2BF : F_BITCONVERT<"16", i16, bf16>;
def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;		def BITCONVERT_16_BF2I : F_BITCONVERT<"16", bf16, i16>;
def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;		def BITCONVERT_32_I2F : F_BITCONVERT<"32", i32, f32>;
def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;		def BITCONVERT_32_F2I : F_BITCONVERT<"32", f32, i32>;
def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>;		def BITCONVERT_64_I2F : F_BITCONVERT<"64", i64, f64>;
def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>;		def BITCONVERT_64_F2I : F_BITCONVERT<"64", f64, i64>;
def BITCONVERT_32_F2F16x2 : F_BITCONVERT<"32", Float32Regs, Float16x2Regs>;		def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", i32, v2f16>;
def BITCONVERT_32_F16x22F : F_BITCONVERT<"32", Float16x2Regs, Float32Regs>;		def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", v2f16, i32>;
		def BITCONVERT_32_F2F16x2 : F_BITCONVERT<"32", f32, v2f16>;
		def BITCONVERT_32_F16x22F : F_BITCONVERT<"32", v2f16, f32>;
		def BITCONVERT_32_I2BF16x2 : F_BITCONVERT<"32", i32, v2bf16>;
		def BITCONVERT_32_BF16x22I : F_BITCONVERT<"32", v2bf16, i32>;
		def BITCONVERT_32_F2BF16x2 : F_BITCONVERT<"32", f32, v2bf16>;
		def BITCONVERT_32_BF16x22F : F_BITCONVERT<"32", v2bf16, f32>;

// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where		// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
// we cannot specify floating-point literals in isel patterns. Therefore, we		// we cannot specify floating-point literals in isel patterns. Therefore, we
// use an integer selp to select either 1 or 0 and then cvt to floating-point.		// use an integer selp to select either 1 or 0 and then cvt to floating-point.

// sint -> f16		// sint -> f16
def : Pat<(f16 (sint_to_fp Int1Regs:$a)),		def : Pat<(f16 (sint_to_fp Int1Regs:$a)),
(CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;		(CVT_f16_s32 (SELP_u32ii 1, 0, Int1Regs:$a), CvtRN)>;
▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines	def : Pat<(f64 (uint_to_fp Int16Regs:$a)),
(CVT_f64_u16 Int16Regs:$a, CvtRN)>;		(CVT_f64_u16 Int16Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int32Regs:$a)),		def : Pat<(f64 (uint_to_fp Int32Regs:$a)),
(CVT_f64_u32 Int32Regs:$a, CvtRN)>;		(CVT_f64_u32 Int32Regs:$a, CvtRN)>;
def : Pat<(f64 (uint_to_fp Int64Regs:$a)),		def : Pat<(f64 (uint_to_fp Int64Regs:$a)),
(CVT_f64_u64 Int64Regs:$a, CvtRN)>;		(CVT_f64_u64 Int64Regs:$a, CvtRN)>;


// f16 -> sint		// f16 -> sint
def : Pat<(i1 (fp_to_sint Float16Regs:$a)),		def : Pat<(i1 (fp_to_sint (f16 Float16Regs:$a))),
(SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;		(SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float16Regs:$a)),		def : Pat<(i16 (fp_to_sint (f16 Float16Regs:$a))),
(CVT_s16_f16 Float16Regs:$a, CvtRZI)>;		(CVT_s16_f16 (f16 Float16Regs:$a), CvtRZI)>;
def : Pat<(i32 (fp_to_sint Float16Regs:$a)),		def : Pat<(i32 (fp_to_sint (f16 Float16Regs:$a))),
(CVT_s32_f16 Float16Regs:$a, CvtRZI)>;		(CVT_s32_f16 (f16 Float16Regs:$a), CvtRZI)>;
def : Pat<(i64 (fp_to_sint Float16Regs:$a)),		def : Pat<(i64 (fp_to_sint (f16 Float16Regs:$a))),
(CVT_s64_f16 Float16Regs:$a, CvtRZI)>;		(CVT_s64_f16 Float16Regs:$a, CvtRZI)>;

// f16 -> uint		// f16 -> uint
def : Pat<(i1 (fp_to_uint Float16Regs:$a)),		def : Pat<(i1 (fp_to_uint (f16 Float16Regs:$a))),
(SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;		(SETP_b16ri (BITCONVERT_16_F2I Float16Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_uint Float16Regs:$a)),		def : Pat<(i16 (fp_to_uint (f16 Float16Regs:$a))),
(CVT_u16_f16 Float16Regs:$a, CvtRZI)>;		(CVT_u16_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(i32 (fp_to_uint Float16Regs:$a)),		def : Pat<(i32 (fp_to_uint (f16 Float16Regs:$a))),
(CVT_u32_f16 Float16Regs:$a, CvtRZI)>;		(CVT_u32_f16 Float16Regs:$a, CvtRZI)>;
def : Pat<(i64 (fp_to_uint Float16Regs:$a)),		def : Pat<(i64 (fp_to_uint (f16 Float16Regs:$a))),
(CVT_u64_f16 Float16Regs:$a, CvtRZI)>;		(CVT_u64_f16 Float16Regs:$a, CvtRZI)>;

// f32 -> sint		// f32 -> sint
def : Pat<(i1 (fp_to_sint Float32Regs:$a)),		def : Pat<(i1 (fp_to_sint Float32Regs:$a)),
(SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;		(SETP_b32ri (BITCONVERT_32_F2I Float32Regs:$a), 0, CmpEQ)>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),		def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
(CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;		(CVT_s16_f32 Float32Regs:$a, CvtRZI_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(i16 (fp_to_sint Float32Regs:$a)),		def : Pat<(i16 (fp_to_sint Float32Regs:$a)),
▲ Show 20 Lines • Show All 130 Lines • ▼ Show 20 Lines	def : Pat<(select Int32Regs:$pred, Int16Regs:$a, Int16Regs:$b),
(SELP_b16rr Int16Regs:$a, Int16Regs:$b,		(SELP_b16rr Int16Regs:$a, Int16Regs:$b,
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;		(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),		def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
(SELP_b32rr Int32Regs:$a, Int32Regs:$b,		(SELP_b32rr Int32Regs:$a, Int32Regs:$b,
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;		(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),		def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
(SELP_b64rr Int64Regs:$a, Int64Regs:$b,		(SELP_b64rr Int64Regs:$a, Int64Regs:$b,
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;		(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b),		def : Pat<(select Int32Regs:$pred, (f16 Float16Regs:$a), (f16 Float16Regs:$b)),
(SELP_f16rr Float16Regs:$a, Float16Regs:$b,		(SELP_f16rr Float16Regs:$a, Float16Regs:$b,
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;		(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),		def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
(SELP_f32rr Float32Regs:$a, Float32Regs:$b,		(SELP_f32rr Float32Regs:$a, Float32Regs:$b,
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;		(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),		def : Pat<(select Int32Regs:$pred, Float64Regs:$a, Float64Regs:$b),
(SELP_f64rr Float64Regs:$a, Float64Regs:$b,		(SELP_f64rr Float64Regs:$a, Float64Regs:$b,
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;		(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
▲ Show 20 Lines • Show All 48 Lines • ▼ Show 20 Lines	def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst),
" mov.b32 \t{%tmp_lo, $dst}, $src; }}",		" mov.b32 \t{%tmp_lo, $dst}, $src; }}",
[(set Float16Regs:$dst,		[(set Float16Regs:$dst,
(extractelt (v2f16 Float16x2Regs:$src), 1))]>;		(extractelt (v2f16 Float16x2Regs:$src), 1))]>;

// Coalesce two f16 registers into f16x2		// Coalesce two f16 registers into f16x2
def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst),		def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst),
(ins Float16Regs:$a, Float16Regs:$b),		(ins Float16Regs:$a, Float16Regs:$b),
"mov.b32 \t$dst, {{$a, $b}};",		"mov.b32 \t$dst, {{$a, $b}};",
[(set Float16x2Regs:$dst,		[(set (v2f16 Float16x2Regs:$dst),
(build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;		(build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;

// Directly initializing the underlying b32 register is one less SASS		// Directly initializing the underlying b32 register is one less SASS
// instruction than a vector-packing move.		// instruction than a vector-packing move.
def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),		def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),
"mov.b32 \t$dst, $src;",		"mov.b32 \t$dst, $src;",
[]>;		[]>;

▲ Show 20 Lines • Show All 82 Lines • ▼ Show 20 Lines

// fpround f64 -> f32		// fpround f64 -> f32
def : Pat<(f32 (fpround Float64Regs:$a)),		def : Pat<(f32 (fpround Float64Regs:$a)),
(CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;		(CVT_f32_f64 Float64Regs:$a, CvtRN_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpround Float64Regs:$a)),		def : Pat<(f32 (fpround Float64Regs:$a)),
(CVT_f32_f64 Float64Regs:$a, CvtRN)>;		(CVT_f32_f64 Float64Regs:$a, CvtRN)>;

// fpextend f16 -> f32		// fpextend f16 -> f32
def : Pat<(f32 (fpextend Float16Regs:$a)),		def : Pat<(f32 (fpextend (f16 Float16Regs:$a))),
(CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;		(CVT_f32_f16 Float16Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f32 (fpextend Float16Regs:$a)),		def : Pat<(f32 (fpextend (f16 Float16Regs:$a))),
(CVT_f32_f16 Float16Regs:$a, CvtNONE)>;		(CVT_f32_f16 Float16Regs:$a, CvtNONE)>;

// fpextend f16 -> f64		// fpextend f16 -> f64
def : Pat<(f64 (fpextend Float16Regs:$a)),		def : Pat<(f64 (fpextend (f16 Float16Regs:$a))),
(CVT_f64_f16 Float16Regs:$a, CvtNONE)>;		(CVT_f64_f16 Float16Regs:$a, CvtNONE)>;

// fpextend f32 -> f64		// fpextend f32 -> f64
def : Pat<(f64 (fpextend Float32Regs:$a)),		def : Pat<(f64 (fpextend Float32Regs:$a)),
(CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;		(CVT_f64_f32 Float32Regs:$a, CvtNONE_FTZ)>, Requires<[doF32FTZ]>;
def : Pat<(f64 (fpextend Float32Regs:$a)),		def : Pat<(f64 (fpextend Float32Regs:$a)),
(CVT_f64_f32 Float32Regs:$a, CvtNONE)>;		(CVT_f64_f32 Float32Regs:$a, CvtNONE)>;

def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,		def retflag : SDNode<"NVPTXISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;		[SDNPHasChain, SDNPOptInGlue]>;

// fceil, ffloor, froundeven, ftrunc.		// fceil, ffloor, froundeven, ftrunc.

multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {		multiclass CVT_ROUND<SDNode OpNode, PatLeaf Mode, PatLeaf ModeFTZ> {
def : Pat<(OpNode Float16Regs:$a),		def : Pat<(OpNode (f16 Float16Regs:$a)),
(CVT_f16_f16 Float16Regs:$a, Mode)>;		(CVT_f16_f16 Float16Regs:$a, Mode)>;
def : Pat<(OpNode Float32Regs:$a),		def : Pat<(OpNode Float32Regs:$a),
(CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>;		(CVT_f32_f32 Float32Regs:$a, ModeFTZ)>, Requires<[doF32FTZ]>;
def : Pat<(OpNode Float32Regs:$a),		def : Pat<(OpNode Float32Regs:$a),
(CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>;		(CVT_f32_f32 Float32Regs:$a, Mode)>, Requires<[doNoF32FTZ]>;
def : Pat<(OpNode Float64Regs:$a),		def : Pat<(OpNode Float64Regs:$a),
(CVT_f64_f64 Float64Regs:$a, Mode)>;		(CVT_f64_f64 Float64Regs:$a, Mode)>;
}		}
▲ Show 20 Lines • Show All 108 Lines • Show Last 20 Lines

llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp

Show First 20 Lines • Show All 69 Lines • ▼ Show 20 Lines	for (auto &BB : MF) {
for (auto &MI : BB) {		for (auto &MI : BB) {
switch (MI.getOpcode()) {		switch (MI.getOpcode()) {
case NVPTX::ProxyRegI1:		case NVPTX::ProxyRegI1:
case NVPTX::ProxyRegI16:		case NVPTX::ProxyRegI16:
case NVPTX::ProxyRegI32:		case NVPTX::ProxyRegI32:
case NVPTX::ProxyRegI64:		case NVPTX::ProxyRegI64:
case NVPTX::ProxyRegF16:		case NVPTX::ProxyRegF16:
case NVPTX::ProxyRegF16x2:		case NVPTX::ProxyRegF16x2:
		case NVPTX::ProxyRegBF16:
		case NVPTX::ProxyRegBF16x2:
case NVPTX::ProxyRegF32:		case NVPTX::ProxyRegF32:
case NVPTX::ProxyRegF64:		case NVPTX::ProxyRegF64:
replaceMachineInstructionUsage(MF, MI);		replaceMachineInstructionUsage(MF, MI);
RemoveList.push_back(&MI);		RemoveList.push_back(&MI);
break;		break;
}		}
}		}
}		}
Show All 36 Lines

llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td

	Show First 20 Lines • Show All 54 Lines • ▼ Show 20 Lines

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// Register classes			// Register classes
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;			def Int1Regs : NVPTXRegClass<[i1], 8, (add (sequence "P%u", 0, 4))>;
	def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>;			def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>;
	def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4), VRFrame32, VRFrameLocal32)>;			def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4), VRFrame32, VRFrameLocal32)>;
	def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>;			def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4), VRFrame64, VRFrameLocal64)>;
	def Float16Regs : NVPTXRegClass<[f16], 16, (add (sequence "H%u", 0, 4))>;			def Float16Regs : NVPTXRegClass<[f16,bf16], 16, (add (sequence "H%u", 0, 4))>;
	def Float16x2Regs : NVPTXRegClass<[v2f16], 32, (add (sequence "HH%u", 0, 4))>;			def Float16x2Regs : NVPTXRegClass<[v2f16,v2bf16], 32, (add (sequence "HH%u", 0, 4))>;
	def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;			def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
	def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;			def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;
	def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;			def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;
	def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%u", 0, 4))>;			def Int64ArgRegs : NVPTXRegClass<[i64], 64, (add (sequence "la%u", 0, 4))>;
	def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>;			def Float32ArgRegs : NVPTXRegClass<[f32], 32, (add (sequence "fa%u", 0, 4))>;
	def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;			def Float64ArgRegs : NVPTXRegClass<[f64], 64, (add (sequence "da%u", 0, 4))>;

	// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.			// Read NVPTXRegisterInfo.cpp to see how VRFrame and VRDepot are used.
	def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame32, VRFrameLocal32, VRDepot,			def SpecialRegs : NVPTXRegClass<[i32], 32, (add VRFrame32, VRFrameLocal32, VRDepot,
	(sequence "ENVREG%u", 0, 31))>;			(sequence "ENVREG%u", 0, 31))>;

llvm/test/CodeGen/NVPTX/bf16.ll

This file was added.

				; RUN: llc < %s -march=nvptx | FileCheck %s
				; RUN: %if ptxas %{ llc < %s -march=nvptx | %ptxas-verify %}

				; LDST: .b8 bfloat_array[8] = {1, 2, 3, 4, 5, 6, 7, 8};
				@"bfloat_array" = addrspace(1) constant [4 x bfloat]
				[bfloat 0xR0201, bfloat 0xR0403, bfloat 0xR0605, bfloat 0xR0807]

				define void @test_load_store(bfloat addrspace(1)* %in, bfloat addrspace(1)* %out) {
				; CHECK-LABEL: @test_load_store
				; CHECK: ld.global.b16 [[TMP:%h[0-9]+]], [{{%r[0-9]+}}]
				; CHECK: st.global.b16 [{{%r[0-9]+}}], [[TMP]]
				%val = load bfloat, bfloat addrspace(1)* %in
				store bfloat %val, bfloat addrspace(1) * %out
				ret void
				}

				define void @test_bitcast_from_bfloat(bfloat addrspace(1)* %in, i16 addrspace(1)* %out) {
				; CHECK-LABEL: @test_bitcast_from_bfloat
				; CHECK: ld.global.b16 [[TMP:%h[0-9]+]], [{{%r[0-9]+}}]
				; CHECK: st.global.b16 [{{%r[0-9]+}}], [[TMP]]
				%val = load bfloat, bfloat addrspace(1) * %in
				%val_int = bitcast bfloat %val to i16
				store i16 %val_int, i16 addrspace(1)* %out
				ret void
				}

				define void @test_bitcast_to_bfloat(bfloat addrspace(1)* %out, i16 addrspace(1)* %in) {
				; CHECK-LABEL: @test_bitcast_to_bfloat
				; CHECK: ld.global.u16 [[TMP:%rs[0-9]+]], [{{%r[0-9]+}}]
				; CHECK: st.global.u16 [{{%r[0-9]+}}], [[TMP]]
				%val = load i16, i16 addrspace(1)* %in
				%val_fp = bitcast i16 %val to bfloat
				store bfloat %val_fp, bfloat addrspace(1)* %out
				ret void
				}

This is an archive of the discontinued LLVM Phabricator instance.

[CUDA,NVPTX] Implement __bf16 support for NVPTX.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 470569

clang/lib/Basic/Targets/NVPTX.h

clang/lib/Basic/Targets/NVPTX.cpp

clang/test/CodeGenCUDA/bf16.cu

clang/test/SemaCUDA/bf16.cu

llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp

llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td

llvm/test/CodeGen/NVPTX/bf16.ll

This is an archive of the discontinued LLVM Phabricator instance.

[CUDA,NVPTX] Implement __bf16 support for NVPTX.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 470569

clang/lib/Basic/Targets/NVPTX.h

clang/lib/Basic/Targets/NVPTX.cpp

clang/test/CodeGenCUDA/bf16.cu

clang/test/SemaCUDA/bf16.cu

llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

llvm/lib/Target/NVPTX/NVPTXProxyRegErasure.cpp

llvm/lib/Target/NVPTX/NVPTXRegisterInfo.td

llvm/test/CodeGen/NVPTX/bf16.ll

[CUDA,NVPTX] Implement __bf16 support for NVPTX.
ClosedPublic