This is an archive of the discontinued LLVM Phabricator instance.

[X86][SSE] Simplify extract(shuffle(load())) handling (PR43971)
AbandonedPublic

Authored by RKSimon on Nov 14 2019, 12:20 PM.

Download Raw Diff

Details

Reviewers

craig.topper
spatel
wolfgangp
deadalnix

Summary

PR43971 showed how XFormVExtractWithShuffleIntoLoad was relying on a later call to DAGCombiner::visitEXTRACT_VECTOR_ELT to succeed before the regenerated VECTOR_SHUFFLE was re-lowered to a target shuffle again.

This patch removes XFormVExtractWithShuffleIntoLoad entirely, which avoids creating the VECTOR_SHUFFLE. Instead, it uses combineExtractWithShuffle to extract directly from the load (stripping any bitcasts).

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

RKSimon created this revision.Nov 14 2019, 12:20 PM

Herald added a project: Restricted Project. · View Herald TranscriptNov 14 2019, 12:20 PM

Herald added subscribers: dmgreen, hiraditya. · View Herald Transcript

I applied your patch to ToT, but now I'm seeing a loop with the following IR (on Linux). It still seems to be stuck in DAGCombine.

define void @test() local_unnamed_addr {
entry:
  %id34847 = alloca <2 x double>, align 16
  %id34846 = alloca double, align 8
  %id34847.0.id34847.0. = load volatile <2 x double>, <2 x double>* %id34847, align 16
  %vecext = extractelement <2 x double> %id34847.0.id34847.0., i32 1
  store volatile double %vecext, double* %id34846, align 8
  ret void
}

Thanks @wolfgangp

Ensure the load is simple

RKSimon mentioned this in rGc3607f52b1fd: [X86][SSE] Add test for extractelement from volatile vector load.Nov 15 2019, 8:00 AM

rebase

Unfortunately, I'm still getting a loop in DAGCombiner with this one (llc -mattr=+avx on Linux). If you change the 1.000 in the select instruction to 0.000, it finishes.

define float @test(<8 x float> *%a0) {
entry:
  %0 = load <8 x float>, <8 x float>* %a0, align 32
  %vecext = extractelement <8 x float> %0, i32 1
  %cmp = fcmp oeq float %vecext, 0.000000e+00
  %cond = select i1 %cmp, float 1.000000e+00, float %vecext
  ret float %cond
}

RKSimon mentioned this in rGb68191e729e0: [X86][SSE] Add test for extractelement with multiple uses.Nov 18 2019, 3:59 AM

RKSimon mentioned this in rGbbf4af3109d1: [X86][SSE] Remove XFormVExtractWithShuffleIntoLoad to prevent legalization….Nov 19 2019, 4:05 AM

RKSimon planned changes to this revision.Nov 22 2019, 8:37 AM

RKSimon abandoned this revision.Jan 6 2020, 5:45 AM

RKSimon mentioned this in rGde735247c8b6: [X86] Add extra PR43971 test case mentioned in D70267.Jan 6 2020, 5:47 AM

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86ISelLowering.cpp

155 lines

test/

CodeGen/

X86/

extractelement-load.ll

46 lines

Diff 229555

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 35,119 Lines • ▼ Show 20 Lines	if (ShuffleMask.size() == (unsigned)NumElts &&
return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);		return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countTrailingZeros()]);
}		}
}		}

return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(		return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
Op, DemandedBits, DemandedElts, DAG, Depth);		Op, DemandedBits, DemandedElts, DAG, Depth);
}		}

/// Check if a vector extract from a target-specific shuffle of a load can be
/// folded into a single element load.
///
/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
/// shuffles have been custom lowered so we need to handle those here. When
/// every check passes, the target shuffle is rebuilt as a generic
/// ISD::VECTOR_SHUFFLE feeding a fresh ISD::EXTRACT_VECTOR_ELT so that
/// DAGCombiner can finish the fold.
///
/// \param N   The vector extract node being combined; operand 0 is the source
///            vector and operand 1 is the element index.
/// \param DAG The selection DAG used to create replacement nodes.
/// \param DCI Combiner state; nothing is done before operation legalization.
/// \returns A zero constant if the extracted lane is SM_SentinelZero, UNDEF if
///          the lane is SM_SentinelUndef or the index is out of range, the
///          rebuilt extract-of-shuffle when the source is a suitable load, or
///          an empty SDValue if the pattern does not match.
static SDValue
XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDValue InVec = N->getOperand(0);
  SDValue EltNo = N->getOperand(1);
  EVT EltVT = N->getValueType(0);

  // A variable index cannot be mapped through the shuffle mask.
  if (!isa<ConstantSDNode>(EltNo))
    return SDValue();

  EVT OriginalVT = InVec.getValueType();
  unsigned NumOriginalElts = OriginalVT.getVectorNumElements();

  // Peek through bitcasts, don't duplicate a load with other uses.
  InVec = peekThroughOneUseBitcasts(InVec);

  EVT CurrentVT = InVec.getValueType();
  if (!CurrentVT.isVector())
    return SDValue();

  // The mask is only rescaled from wider to narrower elements, so the
  // original element count must be a multiple of the shuffle's.
  unsigned NumCurrentElts = CurrentVT.getVectorNumElements();
  if ((NumOriginalElts % NumCurrentElts) != 0)
    return SDValue();

  if (!isTargetShuffle(InVec.getOpcode()))
    return SDValue();

  // Don't duplicate a load with other uses.
  if (!InVec.hasOneUse())
    return SDValue();

  SmallVector<int, 16> ShuffleMask;
  SmallVector<SDValue, 2> ShuffleOps;
  bool UnaryShuffle;
  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
                            ShuffleOps, ShuffleMask, UnaryShuffle))
    return SDValue();

  // Re-express the mask in terms of the original (extract) element type.
  unsigned Scale = NumOriginalElts / NumCurrentElts;
  if (Scale > 1) {
    SmallVector<int, 16> ScaledMask;
    scaleShuffleMask<int>(Scale, ShuffleMask, ScaledMask);
    ShuffleMask = std::move(ScaledMask);
  }
  assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch");

  // Select the input vector, guarding against out of range extract vector.
  // The index is kept unsigned and compared with >= so that neither
  // Elt == NumOriginalElts nor a value that would wrap when narrowed to int
  // can index past the end of ShuffleMask.
  const uint64_t Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
  int Idx = (Elt >= NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt];

  if (Idx == SM_SentinelZero)
    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
  if (Idx == SM_SentinelUndef)
    return DAG.getUNDEF(EltVT);

  // Bail if any mask element is SM_SentinelZero - getVectorShuffle below
  // won't handle it.
  if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
    return SDValue();

  assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) &&
         "Shuffle index out of range");
  SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1];

  // If inputs to shuffle are the same for both ops, then allow 2 uses
  unsigned AllowedUses =
      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

  if (LdNode.getOpcode() == ISD::BITCAST) {
    // Don't duplicate a load with other uses.
    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
      return SDValue();

    AllowedUses = 1; // only allow 1 load use if we have a bitcast
    LdNode = LdNode.getOperand(0);
  }

  if (!ISD::isNormalLoad(LdNode.getNode()))
    return SDValue();

  // isNormalLoad() succeeded above, so this cast cannot fail and no null
  // check is needed.
  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

  // Only simple loads without extra users may be narrowed.
  if (!LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple())
    return SDValue();

  // If there's a bitcast before the shuffle, check if the load type and
  // alignment is valid.
  unsigned Align = LN0->getAlignment();
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
      EltVT.getTypeForEVT(*DAG.getContext()));

  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
    return SDValue();

  // All checks match so transform back to vector_shuffle so that DAG combiner
  // can finish the job
  SDLoc dl(N);

  // Create shuffle node taking into account the case that it's a unary shuffle
  SDValue Shuffle = UnaryShuffle ? DAG.getUNDEF(OriginalVT)
                                 : DAG.getBitcast(OriginalVT, ShuffleOps[1]);
  Shuffle = DAG.getVectorShuffle(OriginalVT, dl,
                                 DAG.getBitcast(OriginalVT, ShuffleOps[0]),
                                 Shuffle, ShuffleMask);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
                     EltNo);
}

// Helper to peek through bitops/setcc to determine size of source vector.		// Helper to peek through bitops/setcc to determine size of source vector.
// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.		// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {		static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
switch (Src.getOpcode()) {		switch (Src.getOpcode()) {
case ISD::SETCC:		case ISD::SETCC:
return Src.getOperand(0).getValueSizeInBits() == Size;		return Src.getOperand(0).getValueSizeInBits() == Size;
case ISD::AND:		case ISD::AND:
case ISD::XOR:		case ISD::XOR:
▲ Show 20 Lines • Show All 970 Lines • ▼ Show 20 Lines	assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
"Unexpected extraction type");		"Unexpected extraction type");
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);		unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
SrcOp = DAG.getBitcast(SrcVT, SrcOp);		SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,		SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));		DAG.getIntPtrConstant(SrcIdx, dl));
return DAG.getZExtOrTrunc(ExtOp, dl, VT);		return DAG.getZExtOrTrunc(ExtOp, dl, VT);
}		}

		// Count how many times the target shuffle used SrcOp.
		unsigned RepeatedOps =
		count_if(Ops, [SrcOp](SDValue V) { return V == SrcOp; });

		// If this is a one-use load, attempt to extract the element directly by
		// removing the bitcasts between the extract, shuffle and the load.
		const TargetLowering &TLI = DAG.getTargetLoweringInfo();
		bool SrcOneUse = SrcBC == peekThroughOneUseBitcasts(Src);
		if (SrcOneUse && VT == SrcVT.getScalarType() &&
		SrcOp->hasNUsesOfValue(RepeatedOps, 0) &&
		TLI.isOperationLegalOrCustom(ISD::LOAD, VT) &&
		TLI.isOperationLegalOrCustom(ISD::LOAD, SrcVT)) {
		SDValue SrcOpBC = peekThroughOneUseBitcasts(SrcOp);
		if (ISD::isNormalLoad(SrcOpBC.getNode())) {
		auto *LD = cast<LoadSDNode>(SrcOpBC);
		unsigned Align = LD->getAlignment();
		unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
		SrcVT.getTypeForEVT(*DAG.getContext()));
		if (LD->isSimple() && Align >= NewAlign) {
		if (SrcVT != SrcOpBC.getValueType()) {
		SrcOpBC = DAG.getLoad(SrcVT, dl, LD->getChain(), LD->getBasePtr(),
		LD->getPointerInfo(), Align,
		LD->getMemOperand()->getFlags());
		DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), SrcOpBC.getValue(1));
		}
		return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, SrcOpBC,
		DAG.getConstant(SrcIdx, dl, Idx.getValueType()));
		}
		}
		}

return SDValue();		return SDValue();
}		}

/// Extracting a scalar FP value from vector element 0 is free, so extract each		/// Extracting a scalar FP value from vector element 0 is free, so extract each
/// operand first, then perform the math as a scalar op.		/// operand first, then perform the math as a scalar op.
static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {		static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");		assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract");
SDValue Vec = ExtElt->getOperand(0);		SDValue Vec = ExtElt->getOperand(0);
▲ Show 20 Lines • Show All 251 Lines • ▼ Show 20 Lines	if ((InputVector.getOpcode() == X86ISD::PINSRB \|\|
assert(SrcVT == InputVector.getOperand(0).getValueType() &&		assert(SrcVT == InputVector.getOperand(0).getValueType() &&
"Vector type mismatch");		"Vector type mismatch");
SDValue Scl = InputVector.getOperand(1);		SDValue Scl = InputVector.getOperand(1);
Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);		Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl);
return DAG.getZExtOrTrunc(Scl, dl, VT);		return DAG.getZExtOrTrunc(Scl, dl, VT);
}		}

// TODO - Remove this once we can handle the implicit zero-extension of		// TODO - Remove this once we can handle the implicit zero-extension of
// X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad,		// X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
// combineHorizontalPredicateResult and combineBasicSADPattern.		// combineBasicSADPattern.
return SDValue();		return SDValue();
}		}

if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;

// Detect mmx extraction of all bits as a i64. It works better as a bitcast.		// Detect mmx extraction of all bits as a i64. It works better as a bitcast.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&		if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {		VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
SDValue MMXSrc = InputVector.getOperand(0);		SDValue MMXSrc = InputVector.getOperand(0);

// The bitcast source is a direct mmx result.		// The bitcast source is a direct mmx result.
if (MMXSrc.getValueType() == MVT::x86mmx)		if (MMXSrc.getValueType() == MVT::x86mmx)
return DAG.getBitcast(VT, InputVector);		return DAG.getBitcast(VT, InputVector);
▲ Show 20 Lines • Show All 9,820 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/extractelement-load.ll

	Show First 20 Lines • Show All 113 Lines • ▼ Show 20 Lines
	; X64-AVX-NEXT: vmovaps (%rdi), %xmm0			; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
	; X64-AVX-NEXT: vmovhps %xmm0, (%rsi)			; X64-AVX-NEXT: vmovhps %xmm0, (%rsi)
	; X64-AVX-NEXT: retq			; X64-AVX-NEXT: retq
	%vecload = load volatile <2 x double>, <2 x double>* %a0, align 16			%vecload = load volatile <2 x double>, <2 x double>* %a0, align 16
	%vecext = extractelement <2 x double> %vecload, i32 1			%vecext = extractelement <2 x double> %vecload, i32 1
	store volatile double %vecext, double* %a1, align 8			store volatile double %vecext, double* %a1, align 8
	ret void			ret void
	}			}

				; PR43971: extract an element from a vector load where the extracted value
				; has more than one use (the fcmp and the select below).
				define void @PR43971(<8 x float> *%a0, float *%a1) {
				; X32-SSE2-LABEL: PR43971:
				; X32-SSE2: # %bb.0: # %entry
				; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
				; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
				; X32-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
				; X32-SSE2-NEXT: xorps %xmm1, %xmm1
				; X32-SSE2-NEXT: cmpltss %xmm0, %xmm1
				; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
				; X32-SSE2-NEXT: andps %xmm1, %xmm2
				; X32-SSE2-NEXT: andnps %xmm0, %xmm1
				; X32-SSE2-NEXT: orps %xmm2, %xmm1
				; X32-SSE2-NEXT: movss %xmm1, (%eax)
				; X32-SSE2-NEXT: retl
				;
				; X64-SSSE3-LABEL: PR43971:
				; X64-SSSE3: # %bb.0: # %entry
				; X64-SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
				; X64-SSSE3-NEXT: xorps %xmm1, %xmm1
				; X64-SSSE3-NEXT: cmpltss %xmm0, %xmm1
				; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
				; X64-SSSE3-NEXT: andps %xmm1, %xmm2
				; X64-SSSE3-NEXT: andnps %xmm0, %xmm1
				; X64-SSSE3-NEXT: orps %xmm2, %xmm1
				; X64-SSSE3-NEXT: movss %xmm1, (%rsi)
				; X64-SSSE3-NEXT: retq
				;
				; X64-AVX-LABEL: PR43971:
				; X64-AVX: # %bb.0: # %entry
				; X64-AVX-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0]
				; X64-AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
				; X64-AVX-NEXT: vcmpltss 24(%rdi), %xmm1, %xmm1
				; X64-AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
				; X64-AVX-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0
				; X64-AVX-NEXT: vmovss %xmm0, (%rsi)
				; X64-AVX-NEXT: retq
				entry:
				%0 = load <8 x float>, <8 x float>* %a0, align 32
				%vecext = extractelement <8 x float> %0, i32 6
				%cmp = fcmp ogt float %vecext, 0.000000e+00
				%1 = load float, float* %a1, align 4
				%cond = select i1 %cmp, float %1, float %vecext
				store float %cond, float* %a1, align 4
				ret void
				}