This is an archive of the discontinued LLVM Phabricator instance.

llvm/lib/Target/AMDGPU/SIISelLowering.cpp
10431	Calling this "Src" is a bit confusing given you handle cast operations here and this is the Dest for them
10599	I'd hope this is unreachable
10700	there are stripBitcast helpers around
10889	I think isByteSized works

jrbyrnes added a child revision: D155995: [AMDGPU]: Allow combining into v_dot4.Jul 21 2023, 2:29 PM

Address comments + rework "hasEightBitAccesses".

hasEightBitAccesses is really just a heuristic to stop combinations of type:

s_mov mask, 0x01000504
v_perm d, v1, v0, mask

when we can instead do:

v_lshl_or d, v0, 16, v1

These will occur iff both operands have ultimate sources that are exactly 16 bits wide and they are addressed as 16-bit operands in the relevant `or` (that is, they haven't been byte-extracted or shuffled).

jrbyrnes edited the summary of this revision. (Show Details)Jul 24 2023, 1:38 PM

Harbormaster completed remote builds in B247780: Diff 543685.Jul 24 2023, 10:18 PM

arsenm accepted this revision.Jul 26 2023, 2:19 PM

This revision is now accepted and ready to land.Jul 26 2023, 2:19 PM

Closed by commit rG391249d1afe4: [AMDGPU] Allow 8,16 bit sources in calculateSrcByte (authored by jrbyrnes). · Explain WhyJul 28 2023, 9:51 AM

This revision was automatically updated to reflect the committed changes.

jrbyrnes added a commit: rG391249d1afe4: [AMDGPU] Allow 8,16 bit sources in calculateSrcByte.

jrbyrnes added a child revision: D157133: [AMDGPU] Extend CalculateByteProvider to capture vectors and signed.Aug 4 2023, 1:26 PM

jrbyrnes removed a child revision: D155995: [AMDGPU]: Allow combining into v_dot4.Aug 11 2023, 9:33 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SIISelLowering.cpp

85 lines

test/

CodeGen/

AMDGPU/

12 lines

36 lines

8 lines

34 lines

4 lines

Diff 545209

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 10,422 Lines • ▼ Show 20 Lines
// performed.		// performed.
static const std::optional<ByteProvider<SDValue>>		static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,		calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
unsigned Depth = 0) {		unsigned Depth = 0) {
// We may need to recursively traverse a series of SRLs		// We may need to recursively traverse a series of SRLs
if (Depth >= 6)		if (Depth >= 6)
return std::nullopt;		return std::nullopt;

		auto ValueSize = Op.getValueSizeInBits();
		arsenmUnsubmitted Done Reply Inline Actions Calling this "Src" is a bit confusing given you handle cast operations here and this is the Dest for them arsenm: Calling this "Src" is a bit confusing given you handle cast operations here and this is the…
		if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
		return std::nullopt;

switch (Op->getOpcode()) {		switch (Op->getOpcode()) {
case ISD::TRUNCATE: {		case ISD::TRUNCATE: {
if (Op->getOperand(0).getScalarValueSizeInBits() != 32)
return std::nullopt;
return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);		return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
}		}

case ISD::SRL: {		case ISD::SRL: {
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));		auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!ShiftOp)		if (!ShiftOp)
return std::nullopt;		return std::nullopt;

uint64_t BitShift = ShiftOp->getZExtValue();		uint64_t BitShift = ShiftOp->getZExtValue();

if (BitShift % 8 != 0)		if (BitShift % 8 != 0)
return std::nullopt;		return std::nullopt;

SrcIndex += BitShift / 8;		SrcIndex += BitShift / 8;

return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);		return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
}		}

default: {		default: {
if (Op.getScalarValueSizeInBits() != 32)
return std::nullopt;

return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);		return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
}		}
}		}
llvm_unreachable("fully handled switch");		llvm_unreachable("fully handled switch");
}		}

// For a byte position in the result of an Or, traverse the tree and find the		// For a byte position in the result of an Or, traverse the tree and find the
// node (and the byte of the node) which ultimately provides this {Or,		// node (and the byte of the node) which ultimately provides this {Or,
▲ Show 20 Lines • Show All 125 Lines • ▼ Show 20 Lines	case ISD::TRUNCATE: {
if (NarrowByteWidth >= Index) {		if (NarrowByteWidth >= Index) {
return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,		return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
StartingIndex);		StartingIndex);
}		}

return std::nullopt;		return std::nullopt;
}		}

		case ISD::CopyFromReg: {
		auto BitWidth = Op.getScalarValueSizeInBits();
		if (BitWidth % 8)
		arsenmUnsubmitted Done Reply Inline Actions I'd hope this is unreachable arsenm: I'd hope this is unreachable
		llvm_unreachable("Invalid type in CopyFromReg");

		if (BitWidth / 8 > Index)
		return calculateSrcByte(Op, StartingIndex, Index);

		return std::nullopt;
		}

case ISD::LOAD: {		case ISD::LOAD: {
auto L = cast<LoadSDNode>(Op.getNode());		auto L = cast<LoadSDNode>(Op.getNode());
unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();		unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
if (NarrowBitWidth % 8 != 0)		if (NarrowBitWidth % 8 != 0)
return std::nullopt;		return std::nullopt;
uint64_t NarrowByteWidth = NarrowBitWidth / 8;		uint64_t NarrowByteWidth = NarrowBitWidth / 8;

// If the width of the load does not reach byte we are trying to provide for		// If the width of the load does not reach byte we are trying to provide for
Show All 20 Lines	default: {
return std::nullopt;		return std::nullopt;
}		}
}		}

llvm_unreachable("fully handled switch");		llvm_unreachable("fully handled switch");
}		}

// Returns true if the Operand is a scalar and is 16 bits		// Returns true if the Operand is a scalar and is 16 bits
static bool is16BitScalarOp(SDValue &Operand) {		static bool isExtendedFrom16Bits(SDValue &Operand) {

switch (Operand.getOpcode()) {		switch (Operand.getOpcode()) {
case ISD::ANY_EXTEND:		case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:		case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND: {		case ISD::ZERO_EXTEND: {
auto OpVT = Operand.getOperand(0).getValueType();		auto OpVT = Operand.getOperand(0).getValueType();
return !OpVT.isVector() && OpVT.getSizeInBits() == 16;		return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
}		}
case ISD::LOAD: {		case ISD::LOAD: {
LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());		LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
auto ExtType = cast<LoadSDNode>(L)->getExtensionType();		auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
if (ExtType == ISD::ZEXTLOAD \|\| ExtType == ISD::SEXTLOAD \|\|		if (ExtType == ISD::ZEXTLOAD \|\| ExtType == ISD::SEXTLOAD \|\|
ExtType == ISD::EXTLOAD) {		ExtType == ISD::EXTLOAD) {
auto MemVT = L->getMemoryVT();		auto MemVT = L->getMemoryVT();
return !MemVT.isVector() && MemVT.getSizeInBits() == 16;		return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
}		}
return false;		return L->getMemoryVT().getSizeInBits() == 16;
}		}
default:		default:
return false;		return false;
}		}
}		}

// Returns true if the mask matches consecutive bytes, and the first byte		// Returns true if the mask matches consecutive bytes, and the first byte
// begins at a power of 2 byte offset from 0th byte		// begins at a power of 2 byte offset from 0th byte
Show All 11 Lines	static bool addresses16Bits(int Mask) {
bool Is16Aligned = !(Low8 % 2);		bool Is16Aligned = !(Low8 % 2);

return IsConsecutive && Is16Aligned;		return IsConsecutive && Is16Aligned;
}		}

// Do not lower into v_perm if the operands are actually 16 bit		// Do not lower into v_perm if the operands are actually 16 bit
// and the selected bits (based on PermMask) correspond with two		// and the selected bits (based on PermMask) correspond with two
// easily addressable 16 bit operands.		// easily addressable 16 bit operands.
static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op,		static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
SDValue &OtherOp) {		SDValue &OtherOp) {
int Low16 = PermMask & 0xffff;		int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;		int Hi16 = (PermMask & 0xffff0000) >> 16;

// ByteProvider only accepts 32 bit operands		assert(Op.getValueType().isByteSized());
assert(Op.getValueType().getSizeInBits() == 32);		assert(OtherOp.getValueType().isByteSized());
assert(OtherOp.getValueType().getSizeInBits() == 32);

auto OpIs16Bit = is16BitScalarOp(Op);		auto TempOp = peekThroughBitcasts(Op);
auto OtherOpIs16Bit = is16BitScalarOp(Op);		auto TempOtherOp = peekThroughBitcasts(OtherOp);

// If there is a size mismatch, then we must use masking on at least one		auto OpIs16Bit =
		arsenmUnsubmitted Done Reply Inline Actions there are stripBitcast helpers around arsenm: there are stripBitcast helpers around
// operand		TempOtherOp.getValueSizeInBits() == 16 \|\| isExtendedFrom16Bits(TempOp);
if (OpIs16Bit != OtherOpIs16Bit)		if (!OpIs16Bit)
return true;		return true;

// If both operands are 16 bit, return whether or not we cleanly address both		auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 \|\|
if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp))		isExtendedFrom16Bits(TempOtherOp);
return !addresses16Bits(Low16) \|\| !addresses16Bits(Hi16);		if (!OtherOpIs16Bit)

// Both are 32 bit operands
return true;		return true;

		// Do we cleanly address both
		return !addresses16Bits(Low16) \|\| !addresses16Bits(Hi16);
}		}

SDValue SITargetLowering::performOrCombine(SDNode *N,		SDValue SITargetLowering::performOrCombine(SDNode *N,
DAGCombinerInfo &DCI) const {		DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;		SelectionDAG &DAG = DCI.DAG;
SDValue LHS = N->getOperand(0);		SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);		SDValue RHS = N->getOperand(1);

▲ Show 20 Lines • Show All 108 Lines • ▼ Show 20 Lines	if (LHSMask == ~0u \|\| RHSMask == ~0u) {

// VT is known to be MVT::i32, so we need to provide 4 bytes.		// VT is known to be MVT::i32, so we need to provide 4 bytes.
assert(VT == MVT::i32);		assert(VT == MVT::i32);
for (int i = 0; i < 4; i++) {		for (int i = 0; i < 4; i++) {
// Find the ByteProvider that provides the ith byte of the result of OR		// Find the ByteProvider that provides the ith byte of the result of OR
std::optional<ByteProvider<SDValue>> P =		std::optional<ByteProvider<SDValue>> P =
calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);		calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
// TODO support constantZero		// TODO support constantZero
if (!P \|\| P->isConstantZero())		if (!P \|\| P->isConstantZero()) {
return SDValue();		return SDValue();
		}

PermNodes.push_back(*P);		PermNodes.push_back(*P);
}		}
if (PermNodes.size() != 4)		if (PermNodes.size() != 4)
return SDValue();		return SDValue();

int FirstSrc = 0;		int FirstSrc = 0;
std::optional<int> SecondSrc;		std::optional<int> SecondSrc;
uint64_t permMask = 0x00000000;		uint64_t PermMask = 0x00000000;
for (size_t i = 0; i < PermNodes.size(); i++) {		for (size_t i = 0; i < PermNodes.size(); i++) {
auto PermOp = PermNodes[i];		auto PermOp = PermNodes[i];
// Since the mask is applied to Src1:Src2, Src1 bytes must be offset		// Since the mask is applied to Src1:Src2, Src1 bytes must be offset
// by sizeof(Src2) = 4		// by sizeof(Src2) = 4
int SrcByteAdjust = 4;		int SrcByteAdjust = 4;

if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {		if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
if (SecondSrc.has_value())		if (SecondSrc.has_value())
if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))		if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
return SDValue();		return SDValue();

// Set the index of the second distinct Src node		// Set the index of the second distinct Src node
SecondSrc = i;		SecondSrc = i;
assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() ==		assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
32);
SrcByteAdjust = 0;		SrcByteAdjust = 0;
}		}
assert(PermOp.SrcOffset + SrcByteAdjust < 8);		assert(PermOp.SrcOffset + SrcByteAdjust < 8);
assert(!DAG.getDataLayout().isBigEndian());		assert(!DAG.getDataLayout().isBigEndian());
permMask \|= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);		PermMask \|= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
}		}

SDValue Op = *PermNodes[FirstSrc].Src;		SDValue Op = *PermNodes[FirstSrc].Src;
SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src		SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
: *PermNodes[FirstSrc].Src;		: *PermNodes[FirstSrc].Src;

// Check that we are not just extracting the bytes in order from an op		// Check that we are not just extracting the bytes in order from an op
if (Op == OtherOp) {		if (Op == OtherOp) {
int Low16 = permMask & 0xffff;		int Low16 = PermMask & 0xffff;
int Hi16 = (permMask & 0xffff0000) >> 16;		int Hi16 = (PermMask & 0xffff0000) >> 16;

bool WellFormedLow = (Low16 == 0x0504) \|\| (Low16 == 0x0100);		bool WellFormedLow = (Low16 == 0x0504) \|\| (Low16 == 0x0100);
bool WellFormedHi = (Hi16 == 0x0706) \|\| (Hi16 == 0x0302);		bool WellFormedHi = (Hi16 == 0x0706) \|\| (Hi16 == 0x0302);

// The perm op would really just produce Op. So combine into Op		// The perm op would really just produce Op. So combine into Op
if (WellFormedLow && WellFormedHi)		if (WellFormedLow && WellFormedHi)
return Op;		return Op;
}		}

if (hasEightBitAccesses(permMask, Op, OtherOp)) {		if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
SDLoc DL(N);		SDLoc DL(N);
		assert(Op.getValueType().isByteSized() &&
		OtherOp.getValueType().isByteSized());
		arsenmUnsubmitted Done Reply Inline Actions I think isByteSized works arsenm: I think isByteSized works
		if (Op.getValueSizeInBits() < 32)
		// If the ultimate src is less than 32 bits, then we will only be
		// using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
		// CalculateByteProvider would not have returned Op as source if we
		// used a byte that is outside its ValueType. Thus, we are free to
		// ANY_EXTEND as the extended bits are dont-cares.
		Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op);

		if (OtherOp.getValueSizeInBits() < 32)
		OtherOp = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);

return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,		return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
DAG.getConstant(permMask, DL, MVT::i32));		DAG.getConstant(PermMask, DL, MVT::i32));
}		}
}		}
}		}

if (VT != MVT::i64 \|\| DCI.isBeforeLegalizeOps())		if (VT != MVT::i64 \|\| DCI.isBeforeLegalizeOps())
return SDValue();		return SDValue();

// TODO: This could be a generic combine with a predicate for extracting the		// TODO: This could be a generic combine with a predicate for extracting the
▲ Show 20 Lines • Show All 3,606 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/bf16.ll

	Show First 20 Lines • Show All 1,257 Lines • ▼ Show 20 Lines
	; GFX8: ; %bb.0: ; %entry			; GFX8: ; %bb.0: ; %entry
	; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1			; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
	; GFX8-NEXT: s_setpc_b64 s[30:31]			; GFX8-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX9-LABEL: test_ret_v3bf16:			; GFX9-LABEL: test_ret_v3bf16:
	; GFX9: ; %bb.0: ; %entry			; GFX9: ; %bb.0: ; %entry
	; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
	; GFX9-NEXT: s_mov_b32 s4, 0xffff
	; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2
	; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1			; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
	; GFX9-NEXT: s_setpc_b64 s[30:31]			; GFX9-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX10-LABEL: test_ret_v3bf16:			; GFX10-LABEL: test_ret_v3bf16:
	; GFX10: ; %bb.0: ; %entry			; GFX10: ; %bb.0: ; %entry
	; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
	; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1			; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
	; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
	; GFX10-NEXT: s_setpc_b64 s[30:31]			; GFX10-NEXT: s_setpc_b64 s[30:31]
	entry:			entry:
	ret <3 x bfloat> %in			ret <3 x bfloat> %in
	}			}

	define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {			define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {
	; GCN-LABEL: test_ret_v4bf16:			; GCN-LABEL: test_ret_v4bf16:
	; GCN: ; %bb.0: ; %entry			; GCN: ; %bb.0: ; %entry
	▲ Show 20 Lines • Show All 511 Lines • ▼ Show 20 Lines
	; GFX9: ; %bb.0: ; %entry			; GFX9: ; %bb.0: ; %entry
	; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX9-NEXT: s_mov_b32 s8, s33			; GFX9-NEXT: s_mov_b32 s8, s33
	; GFX9-NEXT: s_mov_b32 s33, s32			; GFX9-NEXT: s_mov_b32 s33, s32
	; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1			; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
	; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill			; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
	; GFX9-NEXT: s_mov_b64 exec, s[4:5]			; GFX9-NEXT: s_mov_b64 exec, s[4:5]
	; GFX9-NEXT: s_addk_i32 s32, 0x400			; GFX9-NEXT: s_addk_i32 s32, 0x400
	; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
	; GFX9-NEXT: s_mov_b32 s4, 0xffff
	; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v4
	; GFX9-NEXT: s_getpc_b64 s[4:5]			; GFX9-NEXT: s_getpc_b64 s[4:5]
	; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4			; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
	; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12			; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
	; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0			; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
	; GFX9-NEXT: v_writelane_b32 v3, s30, 0			; GFX9-NEXT: v_writelane_b32 v3, s30, 0
	; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1			; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
	; GFX9-NEXT: v_writelane_b32 v3, s31, 1			; GFX9-NEXT: v_writelane_b32 v3, s31, 1
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	Show All 20 Lines
	; GFX10-NEXT: s_xor_saveexec_b32 s4, -1			; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
	; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill			; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
	; GFX10-NEXT: s_waitcnt_depctr 0xffe3			; GFX10-NEXT: s_waitcnt_depctr 0xffe3
	; GFX10-NEXT: s_mov_b32 exec_lo, s4			; GFX10-NEXT: s_mov_b32 exec_lo, s4
	; GFX10-NEXT: s_addk_i32 s32, 0x200			; GFX10-NEXT: s_addk_i32 s32, 0x200
	; GFX10-NEXT: s_getpc_b64 s[4:5]			; GFX10-NEXT: s_getpc_b64 s[4:5]
	; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4			; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
	; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12			; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
	; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
	; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
	; GFX10-NEXT: v_writelane_b32 v3, s30, 0			; GFX10-NEXT: v_writelane_b32 v3, s30, 0
				; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
	; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1			; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
	; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v4
	; GFX10-NEXT: v_writelane_b32 v3, s31, 1			; GFX10-NEXT: v_writelane_b32 v3, s31, 1
	; GFX10-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]			; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
	; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4			; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
	; GFX10-NEXT: s_waitcnt_vscnt null, 0x0			; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
	; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen			; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
	; GFX10-NEXT: s_waitcnt_vscnt null, 0x0			; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
	; GFX10-NEXT: v_readlane_b32 s31, v3, 1			; GFX10-NEXT: v_readlane_b32 s31, v3, 1
	▲ Show 20 Lines • Show All 1,278 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/load-hi16.ll

	Show First 20 Lines • Show All 260 Lines • ▼ Show 20 Lines
	; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4			; GFX906-NEXT: v_perm_b32 v0, v0, v1, s4
	; GFX906-NEXT: s_setpc_b64 s[30:31]			; GFX906-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX803-LABEL: load_local_hi_v2i16_reglo:			; GFX803-LABEL: load_local_hi_v2i16_reglo:
	; GFX803: ; %bb.0: ; %entry			; GFX803: ; %bb.0: ; %entry
	; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX803-NEXT: s_mov_b32 m0, -1			; GFX803-NEXT: s_mov_b32 m0, -1
	; GFX803-NEXT: ds_read_u16 v0, v0			; GFX803-NEXT: ds_read_u16 v0, v0
				; GFX803-NEXT: s_mov_b32 s4, 0x1000504
	; GFX803-NEXT: s_waitcnt lgkmcnt(0)			; GFX803-NEXT: s_waitcnt lgkmcnt(0)
	; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0			; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
	; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	; GFX803-NEXT: s_setpc_b64 s[30:31]			; GFX803-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:			; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo:
	; GFX900-FLATSCR: ; %bb.0: ; %entry			; GFX900-FLATSCR: ; %bb.0: ; %entry
	; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v1, v0			; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v1, v0
	; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)			; GFX900-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
	; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v1			; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v1
	Show All 26 Lines
	; GFX906-NEXT: s_waitcnt vmcnt(0)			; GFX906-NEXT: s_waitcnt vmcnt(0)
	; GFX906-NEXT: s_setpc_b64 s[30:31]			; GFX906-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX803-LABEL: load_local_hi_v2i16_reglo_vreg:			; GFX803-LABEL: load_local_hi_v2i16_reglo_vreg:
	; GFX803: ; %bb.0: ; %entry			; GFX803: ; %bb.0: ; %entry
	; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX803-NEXT: s_mov_b32 m0, -1			; GFX803-NEXT: s_mov_b32 m0, -1
	; GFX803-NEXT: ds_read_u16 v0, v0			; GFX803-NEXT: ds_read_u16 v0, v0
				; GFX803-NEXT: s_mov_b32 s4, 0x1000504
	; GFX803-NEXT: s_waitcnt lgkmcnt(0)			; GFX803-NEXT: s_waitcnt lgkmcnt(0)
	; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0			; GFX803-NEXT: v_perm_b32 v0, v1, v0, s4
	; GFX803-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	; GFX803-NEXT: flat_store_dword v[0:1], v0			; GFX803-NEXT: flat_store_dword v[0:1], v0
	; GFX803-NEXT: s_waitcnt vmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0)
	; GFX803-NEXT: s_setpc_b64 s[30:31]			; GFX803-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo_vreg:			; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_reglo_vreg:
	; GFX900-FLATSCR: ; %bb.0: ; %entry			; GFX900-FLATSCR: ; %bb.0: ; %entry
	; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v1, v0			; GFX900-FLATSCR-NEXT: ds_read_u16_d16_hi v1, v0
	▲ Show 20 Lines • Show All 366 Lines • ▼ Show 20 Lines
	; GFX906-NEXT: s_setpc_b64 s[30:31]			; GFX906-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX803-LABEL: load_global_hi_v2i16_reglo_vreg:			; GFX803-LABEL: load_global_hi_v2i16_reglo_vreg:
	; GFX803: ; %bb.0: ; %entry			; GFX803: ; %bb.0: ; %entry
	; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0			; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
	; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc			; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
	; GFX803-NEXT: flat_load_ushort v0, v[0:1]			; GFX803-NEXT: flat_load_ushort v0, v[0:1]
				; GFX803-NEXT: s_mov_b32 s4, 0x1000504
	; GFX803-NEXT: s_waitcnt vmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0)
	; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0			; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
	; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	; GFX803-NEXT: flat_store_dword v[0:1], v0			; GFX803-NEXT: flat_store_dword v[0:1], v0
	; GFX803-NEXT: s_waitcnt vmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0)
	; GFX803-NEXT: s_setpc_b64 s[30:31]			; GFX803-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX900-FLATSCR-LABEL: load_global_hi_v2i16_reglo_vreg:			; GFX900-FLATSCR-LABEL: load_global_hi_v2i16_reglo_vreg:
	; GFX900-FLATSCR: ; %bb.0: ; %entry			; GFX900-FLATSCR: ; %bb.0: ; %entry
	; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX900-FLATSCR-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094			; GFX900-FLATSCR-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
	▲ Show 20 Lines • Show All 291 Lines • ▼ Show 20 Lines
	; GFX906-NEXT: global_store_dword v[0:1], v0, off			; GFX906-NEXT: global_store_dword v[0:1], v0, off
	; GFX906-NEXT: s_waitcnt vmcnt(0)			; GFX906-NEXT: s_waitcnt vmcnt(0)
	; GFX906-NEXT: s_setpc_b64 s[30:31]			; GFX906-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX803-LABEL: load_flat_hi_v2i16_reglo_vreg:			; GFX803-LABEL: load_flat_hi_v2i16_reglo_vreg:
	; GFX803: ; %bb.0: ; %entry			; GFX803: ; %bb.0: ; %entry
	; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX803-NEXT: flat_load_ushort v0, v[0:1]			; GFX803-NEXT: flat_load_ushort v0, v[0:1]
				; GFX803-NEXT: s_mov_b32 s4, 0x1000504
	; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0			; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
	; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	; GFX803-NEXT: flat_store_dword v[0:1], v0			; GFX803-NEXT: flat_store_dword v[0:1], v0
	; GFX803-NEXT: s_waitcnt vmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0)
	; GFX803-NEXT: s_setpc_b64 s[30:31]			; GFX803-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX900-FLATSCR-LABEL: load_flat_hi_v2i16_reglo_vreg:			; GFX900-FLATSCR-LABEL: load_flat_hi_v2i16_reglo_vreg:
	; GFX900-FLATSCR: ; %bb.0: ; %entry			; GFX900-FLATSCR: ; %bb.0: ; %entry
	; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX900-FLATSCR-NEXT: flat_load_short_d16_hi v2, v[0:1]			; GFX900-FLATSCR-NEXT: flat_load_short_d16_hi v2, v[0:1]
	▲ Show 20 Lines • Show All 275 Lines • ▼ Show 20 Lines
	; GFX906-NEXT: global_store_dword v[0:1], v0, off			; GFX906-NEXT: global_store_dword v[0:1], v0, off
	; GFX906-NEXT: s_waitcnt vmcnt(0)			; GFX906-NEXT: s_waitcnt vmcnt(0)
	; GFX906-NEXT: s_setpc_b64 s[30:31]			; GFX906-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg:			; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg:
	; GFX803: ; %bb.0: ; %entry			; GFX803: ; %bb.0: ; %entry
	; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094			; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4094
				; GFX803-NEXT: s_mov_b32 s4, 0x1000504
	; GFX803-NEXT: s_waitcnt vmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0)
	; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1			; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
	; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	; GFX803-NEXT: flat_store_dword v[0:1], v0			; GFX803-NEXT: flat_store_dword v[0:1], v0
	; GFX803-NEXT: s_waitcnt vmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0)
	; GFX803-NEXT: s_setpc_b64 s[30:31]			; GFX803-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg:			; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg:
	; GFX900-FLATSCR: ; %bb.0: ; %entry			; GFX900-FLATSCR: ; %bb.0: ; %entry
	; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:4094			; GFX900-FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s32 offset:4094
	▲ Show 20 Lines • Show All 80 Lines • ▼ Show 20 Lines
	; GFX906-NEXT: s_waitcnt vmcnt(0)			; GFX906-NEXT: s_waitcnt vmcnt(0)
	; GFX906-NEXT: s_setpc_b64 s[30:31]			; GFX906-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:			; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
	; GFX803: ; %bb.0: ; %entry			; GFX803: ; %bb.0: ; %entry
	; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc			; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], 0 offset:4094 glc
	; GFX803-NEXT: s_waitcnt vmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0)
	; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1			; GFX803-NEXT: s_mov_b32 s4, 0x1000504
	; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD			; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
	; GFX803-NEXT: flat_store_dword v[0:1], v0			; GFX803-NEXT: flat_store_dword v[0:1], v0
	; GFX803-NEXT: s_waitcnt vmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0)
	; GFX803-NEXT: s_setpc_b64 s[30:31]			; GFX803-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:			; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_nooff:
	; GFX900-FLATSCR: ; %bb.0: ; %entry			; GFX900-FLATSCR: ; %bb.0: ; %entry
	; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe			; GFX900-FLATSCR-NEXT: s_movk_i32 s0, 0xffe
	▲ Show 20 Lines • Show All 434 Lines • ▼ Show 20 Lines
	; GFX906-NEXT: s_setpc_b64 s[30:31]			; GFX906-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX803-LABEL: load_constant_hi_v2i16_reglo_vreg:			; GFX803-LABEL: load_constant_hi_v2i16_reglo_vreg:
	; GFX803: ; %bb.0: ; %entry			; GFX803: ; %bb.0: ; %entry
	; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0			; GFX803-NEXT: v_add_u32_e32 v0, vcc, 0xfffff002, v0
	; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc			; GFX803-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
	; GFX803-NEXT: flat_load_ushort v0, v[0:1]			; GFX803-NEXT: flat_load_ushort v0, v[0:1]
				; GFX803-NEXT: s_mov_b32 s4, 0x1000504
	; GFX803-NEXT: s_waitcnt vmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0)
	; GFX803-NEXT: v_lshlrev_b32_e32 v0, 16, v0			; GFX803-NEXT: v_perm_b32 v0, v2, v0, s4
	; GFX803-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	; GFX803-NEXT: flat_store_dword v[0:1], v0			; GFX803-NEXT: flat_store_dword v[0:1], v0
	; GFX803-NEXT: s_waitcnt vmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0)
	; GFX803-NEXT: s_setpc_b64 s[30:31]			; GFX803-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX900-FLATSCR-LABEL: load_constant_hi_v2i16_reglo_vreg:			; GFX900-FLATSCR-LABEL: load_constant_hi_v2i16_reglo_vreg:
	; GFX900-FLATSCR: ; %bb.0: ; %entry			; GFX900-FLATSCR: ; %bb.0: ; %entry
	; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX900-FLATSCR-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094			; GFX900-FLATSCR-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
	▲ Show 20 Lines • Show All 199 Lines • ▼ Show 20 Lines
	;			;
	; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:			; GFX803-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
	; GFX803: ; %bb.0: ; %entry			; GFX803: ; %bb.0: ; %entry
	; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX803-NEXT: v_mov_b32_e32 v2, 0x7b			; GFX803-NEXT: v_mov_b32_e32 v2, 0x7b
	; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen			; GFX803-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
	; GFX803-NEXT: s_waitcnt vmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0)
	; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058			; GFX803-NEXT: buffer_load_ushort v1, off, s[0:3], s32 offset:4058
				; GFX803-NEXT: s_mov_b32 s4, 0x1000504
	; GFX803-NEXT: s_waitcnt vmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0)
	; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1			; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
	; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	; GFX803-NEXT: flat_store_dword v[0:1], v0			; GFX803-NEXT: flat_store_dword v[0:1], v0
	; GFX803-NEXT: s_waitcnt vmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0)
	; GFX803-NEXT: s_setpc_b64 s[30:31]			; GFX803-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:			; GFX900-FLATSCR-LABEL: load_private_hi_v2i16_reglo_vreg_to_offset:
	; GFX900-FLATSCR: ; %bb.0: ; %entry			; GFX900-FLATSCR: ; %bb.0: ; %entry
	; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x7b			; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, 0x7b
	▲ Show 20 Lines • Show All 590 Lines • ▼ Show 20 Lines
	; GFX906-NEXT: s_waitcnt lgkmcnt(0)			; GFX906-NEXT: s_waitcnt lgkmcnt(0)
	; GFX906-NEXT: s_setpc_b64 s[30:31]			; GFX906-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX803-LABEL: load_local_hi_v2i16_store_local_lo:			; GFX803-LABEL: load_local_hi_v2i16_store_local_lo:
	; GFX803: ; %bb.0: ; %entry			; GFX803: ; %bb.0: ; %entry
	; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX803-NEXT: s_mov_b32 m0, -1			; GFX803-NEXT: s_mov_b32 m0, -1
	; GFX803-NEXT: ds_read_u16 v2, v1			; GFX803-NEXT: ds_read_u16 v2, v1
				; GFX803-NEXT: s_mov_b32 s4, 0x1000504
	; GFX803-NEXT: ds_write_b16 v1, v0			; GFX803-NEXT: ds_write_b16 v1, v0
	; GFX803-NEXT: s_waitcnt lgkmcnt(1)			; GFX803-NEXT: s_waitcnt lgkmcnt(1)
	; GFX803-NEXT: v_lshlrev_b32_e32 v2, 16, v2			; GFX803-NEXT: v_perm_b32 v2, v0, v2, s4
	; GFX803-NEXT: v_or_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
	; GFX803-NEXT: v_mov_b32_e32 v0, v2			; GFX803-NEXT: v_mov_b32_e32 v0, v2
	; GFX803-NEXT: s_waitcnt lgkmcnt(0)			; GFX803-NEXT: s_waitcnt lgkmcnt(0)
	; GFX803-NEXT: s_setpc_b64 s[30:31]			; GFX803-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_store_local_lo:			; GFX900-FLATSCR-LABEL: load_local_hi_v2i16_store_local_lo:
	; GFX900-FLATSCR: ; %bb.0: ; %entry			; GFX900-FLATSCR: ; %bb.0: ; %entry
	; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, v0			; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v2, v0
	Show All 16 Lines

llvm/test/CodeGen/AMDGPU/load-lo16.ll

	Show First 20 Lines • Show All 615 Lines • ▼ Show 20 Lines
	; GFX906-NEXT: s_setpc_b64 s[30:31]			; GFX906-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:			; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
	; GFX803: ; %bb.0: ; %entry			; GFX803: ; %bb.0: ; %entry
	; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX803-NEXT: s_mov_b32 m0, -1			; GFX803-NEXT: s_mov_b32 m0, -1
	; GFX803-NEXT: ds_read_u16 v0, v0			; GFX803-NEXT: ds_read_u16 v0, v0
	; GFX803-NEXT: v_mov_b32_e32 v2, 0			; GFX803-NEXT: v_mov_b32_e32 v2, 0
	; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1			; GFX803-NEXT: s_mov_b32 s4, 0x3020504
	; GFX803-NEXT: s_waitcnt lgkmcnt(0)			; GFX803-NEXT: s_waitcnt lgkmcnt(0)
	; GFX803-NEXT: ds_write_b16 v2, v0			; GFX803-NEXT: ds_write_b16 v2, v0
	; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD			; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
	; GFX803-NEXT: flat_store_dword v[0:1], v0			; GFX803-NEXT: flat_store_dword v[0:1], v0
	; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GFX803-NEXT: s_setpc_b64 s[30:31]			; GFX803-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:			; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lo:
	; GFX900-FLATSCR: ; %bb.0: ; %entry			; GFX900-FLATSCR: ; %bb.0: ; %entry
	; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0			; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0
	▲ Show 20 Lines • Show All 93 Lines • ▼ Show 20 Lines
	; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)			; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GFX906-NEXT: s_setpc_b64 s[30:31]			; GFX906-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:			; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
	; GFX803: ; %bb.0: ; %entry			; GFX803: ; %bb.0: ; %entry
	; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX803-NEXT: s_mov_b32 m0, -1			; GFX803-NEXT: s_mov_b32 m0, -1
	; GFX803-NEXT: ds_read_u16 v0, v0			; GFX803-NEXT: ds_read_u16 v0, v0
				; GFX803-NEXT: s_mov_b32 s4, 0x3020504
	; GFX803-NEXT: v_lshrrev_b32_e32 v4, 16, v1			; GFX803-NEXT: v_lshrrev_b32_e32 v4, 16, v1
	; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
	; GFX803-NEXT: s_waitcnt lgkmcnt(0)			; GFX803-NEXT: s_waitcnt lgkmcnt(0)
	; GFX803-NEXT: ds_write_b16 v2, v0			; GFX803-NEXT: ds_write_b16 v2, v0
	; GFX803-NEXT: ds_write_b16 v3, v4			; GFX803-NEXT: ds_write_b16 v3, v4
	; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD			; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4
	; GFX803-NEXT: flat_store_dword v[0:1], v0			; GFX803-NEXT: flat_store_dword v[0:1], v0
	; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GFX803-NEXT: s_setpc_b64 s[30:31]			; GFX803-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:			; GFX900-FLATSCR-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
	; GFX900-FLATSCR: ; %bb.0: ; %entry			; GFX900-FLATSCR: ; %bb.0: ; %entry
	; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0			; GFX900-FLATSCR-NEXT: ds_read_u16 v0, v0
	▲ Show 20 Lines • Show All 1,587 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/permute_i8.ll

Show First 20 Lines • Show All 2,711 Lines • ▼ Show 20 Lines	; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4		%vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
%vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4		%vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
%shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>		%shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
%insvec = zext <4 x i8> %shuffle0_0 to <4 x i16>		%insvec = zext <4 x i8> %shuffle0_0 to <4 x i16>
store <4 x i16> %insvec, ptr addrspace(1) %out1		store <4 x i16> %insvec, ptr addrspace(1) %out1
store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0		store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
ret void		ret void
}		}

		define void @Source16Bit(i16 %in, <2 x i16> %reg) {
		; GFX10-LABEL: Source16Bit:
		; GFX10: ; %bb.0: ; %entry
		; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3050204
		; GFX10-NEXT: global_store_dword v[0:1], v0, off
		; GFX10-NEXT: s_setpc_b64 s[30:31]
		;
		; GFX9-LABEL: Source16Bit:
		; GFX9: ; %bb.0: ; %entry
		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GFX9-NEXT: s_mov_b32 s4, 0x3050204
		; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
		; GFX9-NEXT: global_store_dword v[0:1], v0, off
		; GFX9-NEXT: s_waitcnt vmcnt(0)
		; GFX9-NEXT: s_setpc_b64 s[30:31]
		entry:
		%elt0 = extractelement <2 x i16> %reg, i32 1
		%e0b0 = and i16 %elt0, 255
		%e0b1 = and i16 %elt0, -256
		%e1b0 = and i16 %in, 255
		%e1b1 = and i16 %in, -256
		%tmp0 = shl i16 %e0b0, 8
		%byte0 = or i16 %tmp0, %e1b0
		%tmp2 = lshr i16 %e1b1, 8
		%byte1 = or i16 %e0b1, %tmp2
		%ext0 = zext i16 %byte0 to i32
		%ext1 = zext i16 %byte1 to i32
		%shifted = shl i32 %ext1, 16
		%result = or i32 %shifted, %ext0
		store i32 %result, ptr addrspace(1) undef
		ret void
		}

llvm/test/CodeGen/AMDGPU/trunc-combine.ll

	Show First 20 Lines • Show All 144 Lines • ▼ Show 20 Lines
	; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0			; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
	; SI-NEXT: v_or_b32_e32 v0, v0, v1			; SI-NEXT: v_or_b32_e32 v0, v0, v1
	; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2			; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
	; SI-NEXT: s_setpc_b64 s[30:31]			; SI-NEXT: s_setpc_b64 s[30:31]
	;			;
	; VI-LABEL: trunc_v2i64_arg_to_v2i16:			; VI-LABEL: trunc_v2i64_arg_to_v2i16:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2			; VI-NEXT: s_mov_b32 s4, 0x1000504
	; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD			; VI-NEXT: v_perm_b32 v0, v0, v2, s4
	; VI-NEXT: s_setpc_b64 s[30:31]			; VI-NEXT: s_setpc_b64 s[30:31]
	%trunc = trunc <2 x i64> %arg0 to <2 x i16>			%trunc = trunc <2 x i64> %arg0 to <2 x i16>
	ret <2 x i16> %trunc			ret <2 x i16> %trunc
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Allow 8,16 bit sources in calculateSrcByte (Closed, Public)

Details

Diff Detail

Event Timeline