This is an archive of the discontinued LLVM Phabricator instance.

llvm/lib/Target/AMDGPU/SIISelLowering.cpp
10212	Calling this "Src" is a bit confusing given you handle cast operations here and this is the Dest for them
10380	I'd hope this is unreachable
10475	there are stripBitcast helpers around
10677	I think isByteSized works

jrbyrnes added a child revision: D155995: [AMDGPU]: Allow combining into v_dot4.Jul 21 2023, 2:29 PM

Address comments + rework "hasEightBitAccesses".

hasEightBitAccesses is really just a heuristic to stop combinations of type:

s_mov mask, 0x01000504
v_perm d, v1, v0, mask

when we can instead do:

v_lshl_or d, v0, 16, v1

These will occur iff both operands have ultimate sources that are exactly 16 bits wide and they are addressed as 16-bit operands in the relevant `or` (that is, they haven't been byte-extracted or shuffled).

jrbyrnes edited the summary of this revision. (Show Details)Jul 24 2023, 1:38 PM

Harbormaster completed remote builds in B247780: Diff 543685.Jul 24 2023, 10:18 PM

arsenm accepted this revision.Jul 26 2023, 2:19 PM

This revision is now accepted and ready to land.Jul 26 2023, 2:19 PM

Closed by commit rG391249d1afe4: [AMDGPU] Allow 8,16 bit sources in calculateSrcByte (authored by jrbyrnes). · Explain WhyJul 28 2023, 9:51 AM

This revision was automatically updated to reflect the committed changes.

jrbyrnes added a commit: rG391249d1afe4: [AMDGPU] Allow 8,16 bit sources in calculateSrcByte.

jrbyrnes added a child revision: D157133: [AMDGPU] Extend CalculateByteProvider to capture vectors and signed.Aug 4 2023, 1:26 PM

jrbyrnes removed a child revision: D155995: [AMDGPU]: Allow combining into v_dot4.Aug 11 2023, 9:33 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

SIISelLowering.cpp

56 lines

test/

CodeGen/

AMDGPU/

12 lines

4 lines

38 lines

4 lines

Diff 542580

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 10,203 Lines • ▼ Show 20 Lines
// performed.		// performed.
static const std::optional<ByteProvider<SDValue>>		static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,		calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
unsigned Depth = 0) {		unsigned Depth = 0) {
// We may need to recursively traverse a series of SRLs		// We may need to recursively traverse a series of SRLs
if (Depth >= 6)		if (Depth >= 6)
return std::nullopt;		return std::nullopt;

		auto SrcSize = Op.getValueSizeInBits();
		arsenmUnsubmitted Done Reply Inline Actions Calling this "Src" is a bit confusing given you handle cast operations here and this is the Dest for them arsenm: Calling this "Src" is a bit confusing given you handle cast operations here and this is the…
		if (SrcSize != 8 && SrcSize != 16 && SrcSize != 32)
		return std::nullopt;

switch (Op->getOpcode()) {		switch (Op->getOpcode()) {
case ISD::TRUNCATE: {		case ISD::TRUNCATE: {
if (Op->getOperand(0).getScalarValueSizeInBits() != 32)
return std::nullopt;
return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);		return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
}		}

case ISD::SRL: {		case ISD::SRL: {
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));		auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!ShiftOp)		if (!ShiftOp)
return std::nullopt;		return std::nullopt;

uint64_t BitShift = ShiftOp->getZExtValue();		uint64_t BitShift = ShiftOp->getZExtValue();

if (BitShift % 8 != 0)		if (BitShift % 8 != 0)
return std::nullopt;		return std::nullopt;

SrcIndex += BitShift / 8;		SrcIndex += BitShift / 8;

return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);		return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
}		}

default: {		default: {
if (Op.getScalarValueSizeInBits() != 32)
return std::nullopt;

return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);		return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
}		}
}		}
llvm_unreachable("fully handled switch");		llvm_unreachable("fully handled switch");
}		}

// For a byte position in the result of an Or, traverse the tree and find the		// For a byte position in the result of an Or, traverse the tree and find the
// node (and the byte of the node) which ultimately provides this {Or,		// node (and the byte of the node) which ultimately provides this {Or,
▲ Show 20 Lines • Show All 125 Lines • ▼ Show 20 Lines	case ISD::TRUNCATE: {
if (NarrowByteWidth >= Index) {		if (NarrowByteWidth >= Index) {
return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,		return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
StartingIndex);		StartingIndex);
}		}

return std::nullopt;		return std::nullopt;
}		}

		case ISD::CopyFromReg: {
		auto BitWidth = Op.getScalarValueSizeInBits();
		if (BitWidth % 8)
		arsenmUnsubmitted Done Reply Inline Actions I'd hope this is unreachable arsenm: I'd hope this is unreachable
		return std::nullopt;

		if (BitWidth / 8 > Index)
		return calculateSrcByte(Op, StartingIndex, Index);

		return std::nullopt;
		}

case ISD::LOAD: {		case ISD::LOAD: {
auto L = cast<LoadSDNode>(Op.getNode());		auto L = cast<LoadSDNode>(Op.getNode());
unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();		unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
if (NarrowBitWidth % 8 != 0)		if (NarrowBitWidth % 8 != 0)
return std::nullopt;		return std::nullopt;
uint64_t NarrowByteWidth = NarrowBitWidth / 8;		uint64_t NarrowByteWidth = NarrowBitWidth / 8;

// If the width of the load does not reach byte we are trying to provide for		// If the width of the load does not reach byte we are trying to provide for
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines
// and the selected bits (based on PermMask) correspond with two		// and the selected bits (based on PermMask) correspond with two
// easily addressable 16 bit operands.		// easily addressable 16 bit operands.
static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op,		static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op,
SDValue &OtherOp) {		SDValue &OtherOp) {
int Low16 = PermMask & 0xffff;		int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;		int Hi16 = (PermMask & 0xffff0000) >> 16;

// ByteProvider only accepts 32 bit operands		// ByteProvider only accepts 32 bit operands
assert(Op.getValueType().getSizeInBits() == 32);		assert(!(Op.getValueType().getSizeInBits() % 8));
assert(OtherOp.getValueType().getSizeInBits() == 32);		assert(!(OtherOp.getValueType().getSizeInBits() % 8));

auto OpIs16Bit = is16BitScalarOp(Op);		auto TempOp = Op.getOpcode() == ISD::BITCAST ? Op.getOperand(0) : Op;
		arsenmUnsubmitted Done Reply Inline Actions there are stripBitcast helpers around arsenm: there are stripBitcast helpers around
auto OtherOpIs16Bit = is16BitScalarOp(Op);		auto TempOtherOp =
		OtherOp.getOpcode() == ISD::BITCAST ? OtherOp.getOperand(0) : OtherOp;

		// Vectors of 16 bit ops should be counted as 16 bit ops. If they are cleanly
		// addressed, then there are no Eight bit accesses
		auto OpIs16Bit = TempOtherOp.getValueType().getSizeInBits() == 16 \|\|
		is16BitScalarOp(TempOp) \|\|
		TempOp.getScalarValueSizeInBits() == 16;
		auto OtherOpIs16Bit = TempOtherOp.getValueType().getSizeInBits() == 16 \|\|
		is16BitScalarOp(TempOtherOp) \|\|
		TempOtherOp.getScalarValueSizeInBits() == 16;

// If there is a size mismatch, then we must use masking on at least one		// If there is a size mismatch, then we must use masking on at least one
// operand		// operand
if (OpIs16Bit != OtherOpIs16Bit)		if (OpIs16Bit != OtherOpIs16Bit)
return true;		return true;

// If both operands are 16 bit, return whether or not we cleanly address both		// If both operands are 16 bit, return whether or not we cleanly address both
if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp))		if (OpIs16Bit && OtherOpIs16Bit) {
return !addresses16Bits(Low16) \|\| !addresses16Bits(Hi16);		return !addresses16Bits(Low16) \|\| !addresses16Bits(Hi16);
		}

// Both are 32 bit operands		// Both are 32 bit operands
return true;		return true;
}		}

SDValue SITargetLowering::performOrCombine(SDNode *N,		SDValue SITargetLowering::performOrCombine(SDNode *N,
DAGCombinerInfo &DCI) const {		DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;		SelectionDAG &DAG = DCI.DAG;
▲ Show 20 Lines • Show All 111 Lines • ▼ Show 20 Lines	if (LHSMask == ~0u \|\| RHSMask == ~0u) {

// VT is known to be MVT::i32, so we need to provide 4 bytes.		// VT is known to be MVT::i32, so we need to provide 4 bytes.
assert(VT == MVT::i32);		assert(VT == MVT::i32);
for (int i = 0; i < 4; i++) {		for (int i = 0; i < 4; i++) {
// Find the ByteProvider that provides the ith byte of the result of OR		// Find the ByteProvider that provides the ith byte of the result of OR
std::optional<ByteProvider<SDValue>> P =		std::optional<ByteProvider<SDValue>> P =
calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);		calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
// TODO support constantZero		// TODO support constantZero
if (!P \|\| P->isConstantZero())		if (!P \|\| P->isConstantZero()) {
return SDValue();		return SDValue();
		}

PermNodes.push_back(*P);		PermNodes.push_back(*P);
}		}
if (PermNodes.size() != 4)		if (PermNodes.size() != 4)
return SDValue();		return SDValue();

int FirstSrc = 0;		int FirstSrc = 0;
std::optional<int> SecondSrc;		std::optional<int> SecondSrc;
uint64_t permMask = 0x00000000;		uint64_t permMask = 0x00000000;
for (size_t i = 0; i < PermNodes.size(); i++) {		for (size_t i = 0; i < PermNodes.size(); i++) {
auto PermOp = PermNodes[i];		auto PermOp = PermNodes[i];
// Since the mask is applied to Src1:Src2, Src1 bytes must be offset		// Since the mask is applied to Src1:Src2, Src1 bytes must be offset
// by sizeof(Src2) = 4		// by sizeof(Src2) = 4
int SrcByteAdjust = 4;		int SrcByteAdjust = 4;

if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {		if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
if (SecondSrc.has_value())		if (SecondSrc.has_value())
if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))		if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
return SDValue();		return SDValue();

// Set the index of the second distinct Src node		// Set the index of the second distinct Src node
SecondSrc = i;		SecondSrc = i;
assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() ==		assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
32);
SrcByteAdjust = 0;		SrcByteAdjust = 0;
}		}
assert(PermOp.SrcOffset + SrcByteAdjust < 8);		assert(PermOp.SrcOffset + SrcByteAdjust < 8);
assert(!DAG.getDataLayout().isBigEndian());		assert(!DAG.getDataLayout().isBigEndian());
permMask \|= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);		permMask \|= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
}		}

SDValue Op = *PermNodes[FirstSrc].Src;		SDValue Op = *PermNodes[FirstSrc].Src;
Show All 10 Lines	if (LHSMask == ~0u \|\| RHSMask == ~0u) {

// The perm op would really just produce Op. So combine into Op		// The perm op would really just produce Op. So combine into Op
if (WellFormedLow && WellFormedHi)		if (WellFormedLow && WellFormedHi)
return Op;		return Op;
}		}

if (hasEightBitAccesses(permMask, Op, OtherOp)) {		if (hasEightBitAccesses(permMask, Op, OtherOp)) {
SDLoc DL(N);		SDLoc DL(N);
		assert(
		!(Op.getValueSizeInBits() % 8 \|\| OtherOp.getValueSizeInBits() % 8));
		arsenmUnsubmitted Done Reply Inline Actions I think isByteSized works arsenm: I think isByteSized works
		if (Op.getValueSizeInBits() < 32)
		Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
		if (OtherOp.getValueSizeInBits() < 32)
		OtherOp = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,		return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
DAG.getConstant(permMask, DL, MVT::i32));		DAG.getConstant(permMask, DL, MVT::i32));
}		}
}		}
}		}

if (VT != MVT::i64 \|\| DCI.isBeforeLegalizeOps())		if (VT != MVT::i64 \|\| DCI.isBeforeLegalizeOps())
return SDValue();		return SDValue();
▲ Show 20 Lines • Show All 3,588 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/bf16.ll

	Show First 20 Lines • Show All 1,299 Lines • ▼ Show 20 Lines
	; GFX8: ; %bb.0: ; %entry			; GFX8: ; %bb.0: ; %entry
	; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1			; GFX8-NEXT: v_and_b32_e32 v1, 0xffff, v1
	; GFX8-NEXT: s_setpc_b64 s[30:31]			; GFX8-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX9-LABEL: test_ret_v3bf16:			; GFX9-LABEL: test_ret_v3bf16:
	; GFX9: ; %bb.0: ; %entry			; GFX9: ; %bb.0: ; %entry
	; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX9-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
	; GFX9-NEXT: s_mov_b32 s4, 0xffff
	; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v2
	; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1			; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
	; GFX9-NEXT: s_setpc_b64 s[30:31]			; GFX9-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX10-LABEL: test_ret_v3bf16:			; GFX10-LABEL: test_ret_v3bf16:
	; GFX10: ; %bb.0: ; %entry			; GFX10: ; %bb.0: ; %entry
	; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX10-NEXT: s_waitcnt_vscnt null, 0x0			; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
	; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
	; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1			; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
	; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v2
	; GFX10-NEXT: s_setpc_b64 s[30:31]			; GFX10-NEXT: s_setpc_b64 s[30:31]
	entry:			entry:
	ret <3 x bfloat> %in			ret <3 x bfloat> %in
	}			}

	define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {			define <4 x bfloat> @test_ret_v4bf16(<4 x bfloat> %in) {
	; GCN-LABEL: test_ret_v4bf16:			; GCN-LABEL: test_ret_v4bf16:
	; GCN: ; %bb.0: ; %entry			; GCN: ; %bb.0: ; %entry
	▲ Show 20 Lines • Show All 516 Lines • ▼ Show 20 Lines
	; GFX9: ; %bb.0: ; %entry			; GFX9: ; %bb.0: ; %entry
	; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX9-NEXT: s_mov_b32 s6, s33			; GFX9-NEXT: s_mov_b32 s6, s33
	; GFX9-NEXT: s_mov_b32 s33, s32			; GFX9-NEXT: s_mov_b32 s33, s32
	; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1			; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1
	; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill			; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
	; GFX9-NEXT: s_mov_b64 exec, s[4:5]			; GFX9-NEXT: s_mov_b64 exec, s[4:5]
	; GFX9-NEXT: s_addk_i32 s32, 0x400			; GFX9-NEXT: s_addk_i32 s32, 0x400
	; GFX9-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
	; GFX9-NEXT: s_mov_b32 s4, 0xffff
	; GFX9-NEXT: v_and_or_b32 v0, v0, s4, v4
	; GFX9-NEXT: s_getpc_b64 s[4:5]			; GFX9-NEXT: s_getpc_b64 s[4:5]
	; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4			; GFX9-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
	; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12			; GFX9-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
	; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0			; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
	; GFX9-NEXT: v_writelane_b32 v3, s30, 0			; GFX9-NEXT: v_writelane_b32 v3, s30, 0
	; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1			; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
	; GFX9-NEXT: v_writelane_b32 v3, s31, 1			; GFX9-NEXT: v_writelane_b32 v3, s31, 1
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	Show All 21 Lines
	; GFX10-NEXT: s_xor_saveexec_b32 s4, -1			; GFX10-NEXT: s_xor_saveexec_b32 s4, -1
	; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill			; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill
	; GFX10-NEXT: s_waitcnt_depctr 0xffe3			; GFX10-NEXT: s_waitcnt_depctr 0xffe3
	; GFX10-NEXT: s_mov_b32 exec_lo, s4			; GFX10-NEXT: s_mov_b32 exec_lo, s4
	; GFX10-NEXT: s_addk_i32 s32, 0x200			; GFX10-NEXT: s_addk_i32 s32, 0x200
	; GFX10-NEXT: s_getpc_b64 s[4:5]			; GFX10-NEXT: s_getpc_b64 s[4:5]
	; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4			; GFX10-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4
	; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12			; GFX10-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12
	; GFX10-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
	; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
	; GFX10-NEXT: v_writelane_b32 v3, s30, 0			; GFX10-NEXT: v_writelane_b32 v3, s30, 0
				; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
	; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1			; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1
	; GFX10-NEXT: v_and_or_b32 v0, 0xffff, v0, v4
	; GFX10-NEXT: v_writelane_b32 v3, s31, 1			; GFX10-NEXT: v_writelane_b32 v3, s31, 1
	; GFX10-NEXT: s_waitcnt lgkmcnt(0)			; GFX10-NEXT: s_waitcnt lgkmcnt(0)
	; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]			; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
	; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4			; GFX10-NEXT: buffer_store_short v1, v2, s[0:3], 0 offen offset:4
	; GFX10-NEXT: s_waitcnt_vscnt null, 0x0			; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
	; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen			; GFX10-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen
	; GFX10-NEXT: s_waitcnt_vscnt null, 0x0			; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
	; GFX10-NEXT: v_readlane_b32 s31, v3, 1			; GFX10-NEXT: v_readlane_b32 s31, v3, 1
	▲ Show 20 Lines • Show All 1,284 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/load-lo16.ll

	Show First 20 Lines • Show All 676 Lines • ▼ Show 20 Lines
	; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)			; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GFX906-NEXT: s_setpc_b64 s[30:31]			; GFX906-NEXT: s_setpc_b64 s[30:31]
	;			;
	; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:			; GFX803-LABEL: load_local_lo_v2i16_reghi_vreg_multi_use_hi:
	; GFX803: ; %bb.0: ; %entry			; GFX803: ; %bb.0: ; %entry
	; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; GFX803-NEXT: s_mov_b32 m0, -1			; GFX803-NEXT: s_mov_b32 m0, -1
	; GFX803-NEXT: ds_read_u16 v0, v0			; GFX803-NEXT: ds_read_u16 v0, v0
	; GFX803-NEXT: s_mov_b32 s4, 0x3020504
	; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v1			; GFX803-NEXT: v_lshrrev_b32_e32 v2, 16, v1
				; GFX803-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
	; GFX803-NEXT: v_mov_b32_e32 v3, 0			; GFX803-NEXT: v_mov_b32_e32 v3, 0
	; GFX803-NEXT: ds_write_b16 v3, v2			; GFX803-NEXT: ds_write_b16 v3, v2
	; GFX803-NEXT: s_waitcnt lgkmcnt(1)			; GFX803-NEXT: s_waitcnt lgkmcnt(1)
	; GFX803-NEXT: v_perm_b32 v0, v0, v1, s4			; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
	; GFX803-NEXT: flat_store_dword v[0:1], v0			; GFX803-NEXT: flat_store_dword v[0:1], v0
	; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)			; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GFX803-NEXT: s_setpc_b64 s[30:31]			; GFX803-NEXT: s_setpc_b64 s[30:31]
	entry:			entry:
	%load = load i16, ptr addrspace(3) %in			%load = load i16, ptr addrspace(3) %in
	%elt1 = extractelement <2 x i16> %reg, i32 1			%elt1 = extractelement <2 x i16> %reg, i32 1
	store i16 %elt1, ptr addrspace(3) null			store i16 %elt1, ptr addrspace(3) null
	%build1 = insertelement <2 x i16> %reg, i16 %load, i32 0			%build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
	▲ Show 20 Lines • Show All 1,639 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/permute_i8.ll

Show First 20 Lines • Show All 2,803 Lines • ▼ Show 20 Lines	; GFX9-NEXT: s_setpc_b64 s[30:31]
%vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4		%vec0 = load <4 x i8>, ptr addrspace(1) %gep0, align 4
%vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4		%vec1 = load <4 x i8>, ptr addrspace(1) %gep1, align 4
%shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>		%shuffle0_0 = shufflevector <4 x i8> %vec0, <4 x i8> %vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
%insvec = zext <4 x i8> %shuffle0_0 to <4 x i16>		%insvec = zext <4 x i8> %shuffle0_0 to <4 x i16>
store <4 x i16> %insvec, ptr addrspace(1) %out1		store <4 x i16> %insvec, ptr addrspace(1) %out1
store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0		store <4 x i8> %shuffle0_0, ptr addrspace(1) %out0
ret void		ret void
}		}

		define void @Source16Bit(i16 %in, <2 x i16> %reg) {
		; GFX10-LABEL: Source16Bit:
		; GFX10: ; %bb.0: ; %entry
		; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
		; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
		; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x3050204
		; GFX10-NEXT: global_store_dword v[0:1], v0, off
		; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
		; GFX10-NEXT: s_setpc_b64 s[30:31]
		;
		; GFX9-LABEL: Source16Bit:
		; GFX9: ; %bb.0: ; %entry
		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
		; GFX9-NEXT: s_mov_b32 s4, 0x3050204
		; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4
		; GFX9-NEXT: global_store_dword v[0:1], v0, off
		; GFX9-NEXT: s_waitcnt vmcnt(0)
		; GFX9-NEXT: s_setpc_b64 s[30:31]
		entry:
		%elt0 = extractelement <2 x i16> %reg, i32 1
		%e0b0 = and i16 %elt0, 255
		%e0b1 = and i16 %elt0, -256
		%e1b0 = and i16 %in, 255
		%e1b1 = and i16 %in, -256
		%tmp0 = shl i16 %e0b0, 8
		%byte0 = or i16 %tmp0, %e1b0
		%tmp2 = lshr i16 %e1b1, 8
		%byte1 = or i16 %e0b1, %tmp2
		%ext0 = zext i16 %byte0 to i32
		%ext1 = zext i16 %byte1 to i32
		%shifted = shl i32 %ext1, 16
		%result = or i32 %shifted, %ext0
		store i32 %result, ptr addrspace(1) undef
		ret void
		}

llvm/test/CodeGen/AMDGPU/trunc-combine.ll

	Show First 20 Lines • Show All 144 Lines • ▼ Show 20 Lines
	; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0			; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
	; SI-NEXT: v_or_b32_e32 v0, v0, v1			; SI-NEXT: v_or_b32_e32 v0, v0, v1
	; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2			; SI-NEXT: v_and_b32_e32 v1, 0xffff, v2
	; SI-NEXT: s_setpc_b64 s[30:31]			; SI-NEXT: s_setpc_b64 s[30:31]
	;			;
	; VI-LABEL: trunc_v2i64_arg_to_v2i16:			; VI-LABEL: trunc_v2i64_arg_to_v2i16:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)			; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
	; VI-NEXT: v_lshlrev_b32_e32 v1, 16, v2			; VI-NEXT: s_mov_b32 s4, 0x1000504
	; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD			; VI-NEXT: v_perm_b32 v0, v0, v2, s4
	; VI-NEXT: s_setpc_b64 s[30:31]			; VI-NEXT: s_setpc_b64 s[30:31]
	%trunc = trunc <2 x i64> %arg0 to <2 x i16>			%trunc = trunc <2 x i64> %arg0 to <2 x i16>
	ret <2 x i16> %trunc			ret <2 x i16> %trunc
	}			}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Allow 8,16 bit sources in calculateSrcByte (Closed, Public)

Details

Diff Detail

Event Timeline