Diff 268499

llvm/lib/Target/AMDGPU/AMDGPU.td

Show First 20 Lines • Show All 208 Lines • ▼ Show 20 Lines
>;		>;

def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",		def FeatureOffset3fBug : SubtargetFeature<"offset-3f-bug",
"HasOffset3fBug",		"HasOffset3fBug",
"true",		"true",
"Branch offset of 3f hardware bug"		"Branch offset of 3f hardware bug"
>;		>;

		def FeatureImageStoreD16Bug : SubtargetFeature<"image-store-d16-bug",
		"HasImageStoreD16Bug",
		"true",
		"Image Store D16 hardware bug"
		>;

		def FeatureImageGather4D16Bug : SubtargetFeature<"image-gather4-d16-bug",
		"HasImageGather4D16Bug",
		"true",
		"Image Gather4 D16 hardware bug"
		>;

class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <		class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
"ldsbankcount"#Value,		"ldsbankcount"#Value,
"LDSBankCount",		"LDSBankCount",
!cast<string>(Value),		!cast<string>(Value),
"The number of LDS banks per compute unit."		"The number of LDS banks per compute unit."
>;		>;

def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;		def FeatureLDSBankCount16 : SubtargetFeatureLDSBankCount<16>;
▲ Show 20 Lines • Show All 541 Lines • ▼ Show 20 Lines	def FeatureISAVersion8_0_3 : FeatureSet<
FeatureUnpackedD16VMem,		FeatureUnpackedD16VMem,
FeatureDoesNotSupportXNACK,		FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3]>;

def FeatureISAVersion8_1_0 : FeatureSet<		def FeatureISAVersion8_1_0 : FeatureSet<
[FeatureVolcanicIslands,		[FeatureVolcanicIslands,
FeatureLDSBankCount16,		FeatureLDSBankCount16,
FeatureXNACK,		FeatureXNACK,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3,
		FeatureImageStoreD16Bug,
		FeatureImageGather4D16Bug]>;

def FeatureISAVersion9_0_0 : FeatureSet<		def FeatureISAVersion9_0_0 : FeatureSet<
[FeatureGFX9,		[FeatureGFX9,
FeatureMadMixInsts,		FeatureMadMixInsts,
FeatureLDSBankCount32,		FeatureLDSBankCount32,
FeatureCodeObjectV3,		FeatureCodeObjectV3,
FeatureDoesNotSupportXNACK,		FeatureDoesNotSupportXNACK,
FeatureDoesNotSupportSRAMECC]>;		FeatureDoesNotSupportSRAMECC,
		FeatureImageGather4D16Bug]>;

def FeatureISAVersion9_0_2 : FeatureSet<		def FeatureISAVersion9_0_2 : FeatureSet<
[FeatureGFX9,		[FeatureGFX9,
FeatureMadMixInsts,		FeatureMadMixInsts,
FeatureLDSBankCount32,		FeatureLDSBankCount32,
FeatureXNACK,		FeatureXNACK,
FeatureDoesNotSupportSRAMECC,		FeatureDoesNotSupportSRAMECC,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3,
		FeatureImageGather4D16Bug]>;

def FeatureISAVersion9_0_4 : FeatureSet<		def FeatureISAVersion9_0_4 : FeatureSet<
[FeatureGFX9,		[FeatureGFX9,
FeatureLDSBankCount32,		FeatureLDSBankCount32,
FeatureFmaMixInsts,		FeatureFmaMixInsts,
FeatureDoesNotSupportXNACK,		FeatureDoesNotSupportXNACK,
FeatureDoesNotSupportSRAMECC,		FeatureDoesNotSupportSRAMECC,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3,
		FeatureImageGather4D16Bug]>;

def FeatureISAVersion9_0_6 : FeatureSet<		def FeatureISAVersion9_0_6 : FeatureSet<
[FeatureGFX9,		[FeatureGFX9,
HalfRate64Ops,		HalfRate64Ops,
FeatureFmaMixInsts,		FeatureFmaMixInsts,
FeatureLDSBankCount32,		FeatureLDSBankCount32,
FeatureDLInsts,		FeatureDLInsts,
FeatureDot1Insts,		FeatureDot1Insts,
FeatureDot2Insts,		FeatureDot2Insts,
FeatureDoesNotSupportXNACK,		FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3,
		FeatureImageGather4D16Bug]>;

def FeatureISAVersion9_0_8 : FeatureSet<		def FeatureISAVersion9_0_8 : FeatureSet<
[FeatureGFX9,		[FeatureGFX9,
HalfRate64Ops,		HalfRate64Ops,
FeatureFmaMixInsts,		FeatureFmaMixInsts,
FeatureLDSBankCount32,		FeatureLDSBankCount32,
FeatureDLInsts,		FeatureDLInsts,
FeatureDot1Insts,		FeatureDot1Insts,
FeatureDot2Insts,		FeatureDot2Insts,
FeatureDot3Insts,		FeatureDot3Insts,
FeatureDot4Insts,		FeatureDot4Insts,
FeatureDot5Insts,		FeatureDot5Insts,
FeatureDot6Insts,		FeatureDot6Insts,
FeatureMAIInsts,		FeatureMAIInsts,
FeaturePkFmacF16Inst,		FeaturePkFmacF16Inst,
FeatureAtomicFaddInsts,		FeatureAtomicFaddInsts,
FeatureSRAMECC,		FeatureSRAMECC,
FeatureMFMAInlineLiteralBug,		FeatureMFMAInlineLiteralBug,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3,
		FeatureImageGather4D16Bug]>;

def FeatureISAVersion9_0_9 : FeatureSet<		def FeatureISAVersion9_0_9 : FeatureSet<
[FeatureGFX9,		[FeatureGFX9,
FeatureMadMixInsts,		FeatureMadMixInsts,
FeatureLDSBankCount32,		FeatureLDSBankCount32,
FeatureXNACK,		FeatureXNACK,
FeatureCodeObjectV3]>;		FeatureCodeObjectV3,
		FeatureImageGather4D16Bug]>;

// TODO: Organize more features into groups.		// TODO: Organize more features into groups.
def FeatureGroup {		def FeatureGroup {
// Bugs present on gfx10.1.		// Bugs present on gfx10.1.
list<SubtargetFeature> GFX10_1_Bugs = [		list<SubtargetFeature> GFX10_1_Bugs = [
FeatureVcmpxPermlaneHazard,		FeatureVcmpxPermlaneHazard,
FeatureVMEMtoScalarWriteHazard,		FeatureVMEMtoScalarWriteHazard,
FeatureSMEMtoVectorWriteHazard,		FeatureSMEMtoVectorWriteHazard,
▲ Show 20 Lines • Show All 340 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Show First 20 Lines • Show All 380 Lines • ▼ Show 20 Lines	protected:
bool HasVMEMtoScalarWriteHazard;		bool HasVMEMtoScalarWriteHazard;
bool HasSMEMtoVectorWriteHazard;		bool HasSMEMtoVectorWriteHazard;
bool HasInstFwdPrefetchBug;		bool HasInstFwdPrefetchBug;
bool HasVcmpxExecWARHazard;		bool HasVcmpxExecWARHazard;
bool HasLdsBranchVmemWARHazard;		bool HasLdsBranchVmemWARHazard;
bool HasNSAtoVMEMBug;		bool HasNSAtoVMEMBug;
bool HasOffset3fBug;		bool HasOffset3fBug;
bool HasFlatSegmentOffsetBug;		bool HasFlatSegmentOffsetBug;
		bool HasImageStoreD16Bug;
		bool HasImageGather4D16Bug;

// Dummy feature to use for assembler in tablegen.		// Dummy feature to use for assembler in tablegen.
bool FeatureDisable;		bool FeatureDisable;

SelectionDAGTargetInfo TSInfo;		SelectionDAGTargetInfo TSInfo;
private:		private:
SIInstrInfo InstrInfo;		SIInstrInfo InstrInfo;
SITargetLowering TLInfo;		SITargetLowering TLInfo;
▲ Show 20 Lines • Show All 599 Lines • ▼ Show 20 Lines	public:
bool hasGFX10A16() const {		bool hasGFX10A16() const {
return HasGFX10A16;		return HasGFX10A16;
}		}

bool hasOffset3fBug() const {		bool hasOffset3fBug() const {
return HasOffset3fBug;		return HasOffset3fBug;
}		}

		bool hasImageStoreD16Bug() const {
		return HasImageStoreD16Bug;
		}

		bool hasImageGather4D16Bug() const {
		return HasImageGather4D16Bug;
		}

bool hasNSAEncoding() const {		bool hasNSAEncoding() const {
return HasNSAEncoding;		return HasNSAEncoding;
}		}

bool hasMadF16() const;		bool hasMadF16() const;

bool enableSIScheduler() const {		bool enableSIScheduler() const {
return EnableSIScheduler;		return EnableSIScheduler;
▲ Show 20 Lines • Show All 363 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

Show First 20 Lines • Show All 277 Lines • ▼ Show 20 Lines	GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasVMEMtoScalarWriteHazard(false),		HasVMEMtoScalarWriteHazard(false),
HasSMEMtoVectorWriteHazard(false),		HasSMEMtoVectorWriteHazard(false),
HasInstFwdPrefetchBug(false),		HasInstFwdPrefetchBug(false),
HasVcmpxExecWARHazard(false),		HasVcmpxExecWARHazard(false),
HasLdsBranchVmemWARHazard(false),		HasLdsBranchVmemWARHazard(false),
HasNSAtoVMEMBug(false),		HasNSAtoVMEMBug(false),
HasOffset3fBug(false),		HasOffset3fBug(false),
HasFlatSegmentOffsetBug(false),		HasFlatSegmentOffsetBug(false),
		HasImageStoreD16Bug(false),
		HasImageGather4D16Bug(false),

FeatureDisable(false),		FeatureDisable(false),
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),		InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
TLInfo(TM, *this),		TLInfo(TM, *this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {		FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);		MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));		CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));		Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
Show All 13 Lines	unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
case AMDGPU::V_LSHRREV_B64:		case AMDGPU::V_LSHRREV_B64:
case AMDGPU::V_LSHRREV_B64_gfx10:		case AMDGPU::V_LSHRREV_B64_gfx10:
case AMDGPU::V_LSHR_B64:		case AMDGPU::V_LSHR_B64:
case AMDGPU::V_ASHRREV_I64:		case AMDGPU::V_ASHRREV_I64:
case AMDGPU::V_ASHRREV_I64_gfx10:		case AMDGPU::V_ASHRREV_I64_gfx10:
case AMDGPU::V_ASHR_I64:		case AMDGPU::V_ASHR_I64:
return 1;		return 1;
}		}

		arsenm (unsubmitted inline comment, marked done): I know clang-format really wants to pack these onto a single line, but it's a terrible idea and you shouldn't listen to it.
return 2;		return 2;
}		}

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,		unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
const Function &F) const {		const Function &F) const {
if (NWaves == 1)		if (NWaves == 1)
return getLocalMemorySize();		return getLocalMemorySize();
unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;		unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
▲ Show 20 Lines • Show All 601 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIISelLowering.h

Show First 20 Lines • Show All 97 Lines • ▼ Show 20 Lines	SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG,
ArrayRef<SDValue> Ops) const;		ArrayRef<SDValue> Ops) const;

// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to		// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI.		// dwordx4 if on SI.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,		SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
ArrayRef<SDValue> Ops, EVT MemVT,		ArrayRef<SDValue> Ops, EVT MemVT,
MachineMemOperand *MMO, SelectionDAG &DAG) const;		MachineMemOperand *MMO, SelectionDAG &DAG) const;

SDValue handleD16VData(SDValue VData, SelectionDAG &DAG) const;		SDValue handleD16VData(SDValue VData, SelectionDAG &DAG,
		bool Image = false) const;

/// Converts \p Op, which must be of floating point type, to the		/// Converts \p Op, which must be of floating point type, to the
/// floating point type \p VT, by either extending or truncating it.		/// floating point type \p VT, by either extending or truncating it.
SDValue getFPExtOrFPTrunc(SelectionDAG &DAG,		SDValue getFPExtOrFPTrunc(SelectionDAG &DAG,
SDValue Op,		SDValue Op,
const SDLoc &DL,		const SDLoc &DL,
EVT VT) const;		EVT VT) const;

▲ Show 20 Lines • Show All 332 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,220 Lines • ▼ Show 20 Lines	MVT DataDwordVT = NumDataDwords == 1 ?
MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);		MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);

MVT MaskPopVT = MaskPopDwords == 1 ?		MVT MaskPopVT = MaskPopDwords == 1 ?
MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);		MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);

SDValue Data(Result, 0);		SDValue Data(Result, 0);
SDValue TexFail;		SDValue TexFail;

if (IsTexFail) {		if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);		SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
if (MaskPopVT.isVector()) {		if (MaskPopVT.isVector()) {
Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,		Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
SDValue(Result, 0), ZeroIdx);		SDValue(Result, 0), ZeroIdx);
} else {		} else {
Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,		Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
SDValue(Result, 0), ZeroIdx);		SDValue(Result, 0), ZeroIdx);
}		}

TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
SDValue(Result, 0),
DAG.getConstant(MaskPopDwords, DL, MVT::i32));
}		}

if (DataDwordVT.isVector())		if (DataDwordVT.isVector())
Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,		Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
NumDataDwords - MaskPopDwords);		NumDataDwords - MaskPopDwords);

if (IsD16)		if (IsD16)
Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);		Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);

if (!ReqRetVT.isVector())		if (!ReqRetVT.isVector())
Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);		Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);

Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data);		Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data);

if (TexFail)		if (IsTexFail) {
		TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
		SDValue(Result, 0),
		DAG.getConstant(MaskPopDwords, DL, MVT::i32));

return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);		return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
		}

if (Result->getNumValues() == 1)		if (Result->getNumValues() == 1)
return Data;		return Data;

return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);		return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
}		}

static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,		static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
▲ Show 20 Lines • Show All 72 Lines • ▼ Show 20 Lines	if (BaseOpcode->Store) {
VData = Op.getOperand(2);		VData = Op.getOperand(2);

MVT StoreVT = VData.getSimpleValueType();		MVT StoreVT = VData.getSimpleValueType();
if (StoreVT.getScalarType() == MVT::f16) {		if (StoreVT.getScalarType() == MVT::f16) {
if (!Subtarget->hasD16Images() \|\| !BaseOpcode->HasD16)		if (!Subtarget->hasD16Images() \|\| !BaseOpcode->HasD16)
return Op; // D16 is unsupported for this instruction		return Op; // D16 is unsupported for this instruction

IsD16 = true;		IsD16 = true;
VData = handleD16VData(VData, DAG);		VData = handleD16VData(VData, DAG, true);
}		}

NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;		NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
} else {		} else {
// Work out the num dwords based on the dmask popcount and underlying type		// Work out the num dwords based on the dmask popcount and underlying type
// and whether packing is supported.		// and whether packing is supported.
MVT LoadVT = ResultTypes[0].getSimpleVT();		MVT LoadVT = ResultTypes[0].getSimpleVT();
if (LoadVT.getScalarType() == MVT::f16) {		if (LoadVT.getScalarType() == MVT::f16) {
if (!Subtarget->hasD16Images() \|\| !BaseOpcode->HasD16)		if (!Subtarget->hasD16Images() \|\| !BaseOpcode->HasD16)
return Op; // D16 is unsupported for this instruction		return Op; // D16 is unsupported for this instruction

IsD16 = true;		IsD16 = true;
}		}

// Confirm that the return type is large enough for the dmask specified		// Confirm that the return type is large enough for the dmask specified
if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) \|\|		if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) \|\|
(!LoadVT.isVector() && DMaskLanes > 1))		(!LoadVT.isVector() && DMaskLanes > 1))
return Op;		return Op;

if (IsD16 && !Subtarget->hasUnpackedD16VMem())		// The sq block of gfx8 and gfx9 do not estimate register use correctly
		// for d16 image_gather4, image_gather4_l, and image_gather4_lz
		// instructions.
		if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
		!(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
		arsenm (unsubmitted inline comment, not done): IsD16 && (!hasUnpackedD16() || (Gather4 && hasGatherbug))?
		rdomingu (author, unsubmitted inline reply, marked done): We want NumVDataDwords = DMaskLanes for Gather4 && IsD16 && hasGatherBug. That wouldn't be the case with your suggestion.
NumVDataDwords = (DMaskLanes + 1) / 2;		NumVDataDwords = (DMaskLanes + 1) / 2;
else		else
NumVDataDwords = DMaskLanes;		NumVDataDwords = DMaskLanes;

AdjustRetType = true;		AdjustRetType = true;
}		}

AddrIdx = DMaskIdx + 1;		AddrIdx = DMaskIdx + 1;
▲ Show 20 Lines • Show All 1,277 Lines • ▼ Show 20 Lines	if (WidenedVT != VT) {
auto Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,		auto Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
DAG.getVectorIdxConstant(0, DL));		DAG.getVectorIdxConstant(0, DL));
NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);		NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
}		}
return NewOp;		return NewOp;
}		}

SDValue SITargetLowering::handleD16VData(SDValue VData,		SDValue SITargetLowering::handleD16VData(SDValue VData,
SelectionDAG &DAG) const {		SelectionDAG &DAG,
		bool Image) const {
EVT StoreVT = VData.getValueType();		EVT StoreVT = VData.getValueType();

// No change for f16 and legal vector D16 types.		// No change for f16 and legal vector D16 types.
if (!StoreVT.isVector())		if (!StoreVT.isVector())
return VData;		return VData;

SDLoc DL(VData);		SDLoc DL(VData);
assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");		assert((StoreVT.getVectorNumElements() != 3) && "Handle v3f16");

if (Subtarget->hasUnpackedD16VMem()) {		if (Subtarget->hasUnpackedD16VMem()) {
// We need to unpack the packed data to store.		// We need to unpack the packed data to store.
EVT IntStoreVT = StoreVT.changeTypeToInteger();		EVT IntStoreVT = StoreVT.changeTypeToInteger();
SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);		SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,		EVT EquivStoreVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
StoreVT.getVectorNumElements());		StoreVT.getVectorNumElements());
SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);		SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
return DAG.UnrollVectorOp(ZExt.getNode());		return DAG.UnrollVectorOp(ZExt.getNode());
}		}

		// The sq block of gfx8.1 does not estimate register use correctly for d16
		// image store instructions. The data operand is computed as if it were not a
		// d16 image instruction.
		if (Image && Subtarget->hasImageStoreD16Bug()) {
		// Bitcast to i16
		EVT IntStoreVT = StoreVT.changeTypeToInteger();
		SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);

		// Decompose into scalars
		SmallVector<SDValue, 4> Elts;
		DAG.ExtractVectorElements(IntVData, Elts);

		// Group pairs of i16 into v2i16 and bitcast to i32
		SmallVector<SDValue, 4> PackedElts;
		arsenm (unsubmitted inline comment, not done): You can initialize this to the target size and then avoid push_back.
		EVT Vec2VT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, 2);
		arsenm (unsubmitted inline comment, marked done): This is the same as just MVT::v2i16.
		arsenm (unsubmitted inline comment, not done): ++I (presumably referring to the loop increment on the following line).
		for (unsigned i = 0 ; i < Elts.size() / 2 ; i += 1)
		{
		arsenm (unsubmitted inline comment, marked done): Formatting.
		SDValue Pair = DAG.getBuildVector(Vec2VT, DL, {Elts[i * 2],
		Elts[i * 2 + 1]});
		SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
		PackedElts.push_back(IntPair);
		}

		// Pad using UNDEF
		PackedElts.resize(PackedElts.size() * 2, DAG.getUNDEF(MVT::i32));

		// Build final vector
		EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
		PackedElts.size());
		return DAG.getBuildVector(VecVT, DL, PackedElts);
		}

assert(isTypeLegal(StoreVT));		assert(isTypeLegal(StoreVT));
return VData;		return VData;
}		}

SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,		SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc DL(Op);		SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);		SDValue Chain = Op.getOperand(0);
▲ Show 20 Lines • Show All 4,287 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll

; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs \| FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s		; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs \| FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,PACKED,GFX81,GFX89 %s		; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,GFX81,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s		; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s
; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,GFX10 %s		; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,GFX10 %s

; GCN-LABEL: {{^}}image_load_f16:		; GCN-LABEL: {{^}}image_load_f16:
; GFX89: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}}		; GFX89: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm d16{{$}}
; GFX10: image_load v0, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm d16{{$}}		; GFX10: image_load v0, v[0:1], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps half @image_load_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {		define amdgpu_ps half @image_load_f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
main_body:		main_body:
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	main_body:
ret void		ret void
}		}

; GCN-LABEL: {{^}}image_store_v2f16		; GCN-LABEL: {{^}}image_store_v2f16
; UNPACKED: v_lshrrev_b32_e32		; UNPACKED: v_lshrrev_b32_e32
; UNPACKED: v_and_b32_e32		; UNPACKED: v_and_b32_e32
; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}		; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
; PACKED: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}		; PACKED: image_store v2, v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
		; GFX81: image_store v[2:3], v[0:1], s[0:7] dmask:0x3 unorm d16{{$}}
; GFX10: image_store v2, v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm d16{{$}}		; GFX10: image_store v2, v[0:1], s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, float %in) {		define amdgpu_ps void @image_store_v2f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, float %in) {
main_body:		main_body:
%data = bitcast float %in to <2 x half>		%data = bitcast float %in to <2 x half>
call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %data, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)		call void @llvm.amdgcn.image.store.2d.v2f16.i32(<2 x half> %data, i32 3, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void		ret void
}		}

; GCN-LABEL: {{^}}image_store_v4f16		; GCN-LABEL: {{^}}image_store_v4f16
; UNPACKED: v_lshrrev_b32_e32		; UNPACKED: v_lshrrev_b32_e32
; UNPACKED: v_and_b32_e32		; UNPACKED: v_and_b32_e32
; UNPACKED: v_lshrrev_b32_e32		; UNPACKED: v_lshrrev_b32_e32
; UNPACKED: v_and_b32_e32		; UNPACKED: v_and_b32_e32
; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}		; UNPACKED: image_store v[{{[0-9:]+}}], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; PACKED: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}		; PACKED: image_store v[2:3], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
		; GFX81: image_store v[2:5], v[0:1], s[0:7] dmask:0xf unorm d16{{$}}
; GFX10: image_store v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}		; GFX10: image_store v[2:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm d16{{$}}
define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) {		define amdgpu_ps void @image_store_v4f16(<8 x i32> inreg %rsrc, i32 %s, i32 %t, <2 x float> %in) {
main_body:		main_body:
%data = bitcast <2 x float> %in to <4 x half>		%data = bitcast <2 x float> %in to <4 x half>
call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %data, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)		call void @llvm.amdgcn.image.store.2d.v4f16.i32(<4 x half> %data, i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
ret void		ret void
}		}

Show All 30 Lines

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll

	; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs \| FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s			; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs \| FileCheck -check-prefixes=GCN,UNPACKED,GFX89 %s
	; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,PACKED,GFX81,GFX89 %s			; RUN: llc < %s -march=amdgcn -mcpu=gfx810 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,GFX81,GFX89 %s
	; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,PACKED,GFX9,GFX89 %s			; RUN: llc < %s -march=amdgcn -mcpu=gfx900 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,GFX9,GFX89 %s
	; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,GFX10 %s			; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs \| FileCheck -check-prefixes=GCN,GFX10 %s

	; GCN-LABEL: {{^}}image_gather4_b_2d_v4f16:			; GCN-LABEL: {{^}}image_gather4_b_2d_v4f16:
	; UNPACKED: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}			; UNPACKED: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
	; PACKED: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}			; PACKED: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
				; GFX81: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
				; GFX9: image_gather4_b v[0:3], v[0:2], s[0:7], s[8:11] dmask:0x4 d16{{$}}
	; GFX10: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D d16{{$}}			; GFX10: image_gather4_b v[0:1], v[0:2], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D d16{{$}}
	define amdgpu_ps <2 x float> @image_gather4_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {			define amdgpu_ps <2 x float> @image_gather4_b_2d_v4f16(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
	main_body:			main_body:
	%tex = call <4 x half> @llvm.amdgcn.image.gather4.b.2d.v4f16.f32.f32(i32 4, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)			%tex = call <4 x half> @llvm.amdgcn.image.gather4.b.2d.v4f16.f32.f32(i32 4, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
	%r = bitcast <4 x half> %tex to <2 x float>			%r = bitcast <4 x half> %tex to <2 x float>
	ret <2 x float> %r			ret <2 x float> %r
	}			}

	declare <4 x half> @llvm.amdgcn.image.gather4.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1			declare <4 x half> @llvm.amdgcn.image.gather4.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

	attributes #0 = { nounwind }			attributes #0 = { nounwind }
	attributes #1 = { nounwind readonly }			attributes #1 = { nounwind readonly }
	attributes #2 = { nounwind readnone }			attributes #2 = { nounwind readnone }

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Implement hardware bug workaround for image instructions
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 268499

llvm/lib/Target/AMDGPU/AMDGPU.td

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.h

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Implement hardware bug workaround for image instructions (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 268499

llvm/lib/Target/AMDGPU/AMDGPU.td

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

llvm/lib/Target/AMDGPU/SIISelLowering.h

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.d16.dim.ll

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.d16.dim.ll

[AMDGPU] Implement hardware bug workaround for image instructions
ClosedPublic