Skip to content

Commit 46696ef

Browse files
committed Jan 26, 2016
[X86][SSE] Add zero element and general 64-bit VZEXT_LOAD support to EltsFromConsecutiveLoads
This patch adds support for trailing zero elements to VZEXT_LOAD loads (and checks that no zero elts occur within the consecutive load). It also generalizes the 64-bit VZEXT_LOAD load matching to work for loads other than 2x32-bit loads. After this patch it will also be easier to add support for other basic load patterns like 32-bit VZEXT_LOAD loads, PMOVZX and subvector load insertion. Differential Revision: http://reviews.llvm.org/D16217 llvm-svn: 258798
1 parent c9655d9 commit 46696ef

File tree

2 files changed

+94
-101
lines changed

2 files changed

+94
-101
lines changed
 

‎llvm/lib/Target/X86/X86ISelLowering.cpp

+87-56
Original file line numberDiff line numberDiff line change
@@ -5480,55 +5480,84 @@ LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
54805480
/// elements can be replaced by a single large load which has the same value as
54815481
/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
54825482
///
5483-
/// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
5484-
///
5485-
/// FIXME: we'd also like to handle the case where the last elements are zero
5486-
/// rather than undef via VZEXT_LOAD, but we do not detect that case today.
5487-
/// There's even a handy isZeroNode for that purpose.
5483+
/// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
54885484
static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
54895485
SDLoc &DL, SelectionDAG &DAG,
54905486
bool isAfterLegalize) {
54915487
unsigned NumElems = Elts.size();
54925488

5493-
LoadSDNode *LDBase = nullptr;
5494-
unsigned LastLoadedElt = -1U;
5489+
int LastLoadedElt = -1;
5490+
SmallBitVector LoadMask(NumElems, false);
5491+
SmallBitVector ZeroMask(NumElems, false);
5492+
SmallBitVector UndefMask(NumElems, false);
5493+
5494+
auto PeekThroughBitcast = [](SDValue V) {
5495+
while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5496+
V = V.getOperand(0);
5497+
return V;
5498+
};
54955499

5496-
// For each element in the initializer, see if we've found a load or an undef.
5497-
// If we don't find an initial load element, or later load elements are
5498-
// non-consecutive, bail out.
5500+
// For each element in the initializer, see if we've found a load, zero or an
5501+
// undef.
54995502
for (unsigned i = 0; i < NumElems; ++i) {
5500-
SDValue Elt = Elts[i];
5501-
// Look through a bitcast.
5502-
if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
5503-
Elt = Elt.getOperand(0);
5504-
if (!Elt.getNode() ||
5505-
(Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
5503+
SDValue Elt = PeekThroughBitcast(Elts[i]);
5504+
if (!Elt.getNode())
55065505
return SDValue();
5507-
if (!LDBase) {
5508-
if (Elt.getNode()->getOpcode() == ISD::UNDEF)
5509-
return SDValue();
5510-
LDBase = cast<LoadSDNode>(Elt.getNode());
5511-
LastLoadedElt = i;
5512-
continue;
5513-
}
5514-
if (Elt.getOpcode() == ISD::UNDEF)
5515-
continue;
55165506

5517-
LoadSDNode *LD = cast<LoadSDNode>(Elt);
5518-
EVT LdVT = Elt.getValueType();
5519-
// Each loaded element must be the correct fractional portion of the
5520-
// requested vector load.
5521-
if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
5522-
return SDValue();
5523-
if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
5507+
if (Elt.isUndef())
5508+
UndefMask[i] = true;
5509+
else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
5510+
ZeroMask[i] = true;
5511+
else if (ISD::isNON_EXTLoad(Elt.getNode())) {
5512+
LoadMask[i] = true;
5513+
LastLoadedElt = i;
5514+
// Each loaded element must be the correct fractional portion of the
5515+
// requested vector load.
5516+
if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
5517+
return SDValue();
5518+
} else
55245519
return SDValue();
5525-
LastLoadedElt = i;
55265520
}
5521+
assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
5522+
"Incomplete element masks");
55275523

5524+
// Handle Special Cases - all undef or undef/zero.
5525+
if (UndefMask.count() == NumElems)
5526+
return DAG.getUNDEF(VT);
5527+
5528+
// FIXME: Should we return this as a BUILD_VECTOR instead?
5529+
if ((ZeroMask | UndefMask).count() == NumElems)
5530+
return VT.isInteger() ? DAG.getConstant(0, DL, VT)
5531+
: DAG.getConstantFP(0.0, DL, VT);
5532+
5533+
int FirstLoadedElt = LoadMask.find_first();
5534+
SDValue EltBase = PeekThroughBitcast(Elts[FirstLoadedElt]);
5535+
LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
5536+
EVT LDBaseVT = EltBase.getValueType();
5537+
5538+
// Consecutive loads can contain UNDEFS but not ZERO elements.
5539+
bool IsConsecutiveLoad = true;
5540+
for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
5541+
if (LoadMask[i]) {
5542+
SDValue Elt = PeekThroughBitcast(Elts[i]);
5543+
LoadSDNode *LD = cast<LoadSDNode>(Elt);
5544+
if (!DAG.isConsecutiveLoad(LD, LDBase,
5545+
Elt.getValueType().getStoreSizeInBits() / 8,
5546+
i - FirstLoadedElt)) {
5547+
IsConsecutiveLoad = false;
5548+
break;
5549+
}
5550+
} else if (ZeroMask[i]) {
5551+
IsConsecutiveLoad = false;
5552+
break;
5553+
}
5554+
}
5555+
5556+
// LOAD - all consecutive load/undefs (must start/end with a load).
55285557
// If we have found an entire vector of loads and undefs, then return a large
5529-
// load of the entire vector width starting at the base pointer. If we found
5530-
// consecutive loads for the low half, generate a vzext_load node.
5531-
if (LastLoadedElt == NumElems - 1) {
5558+
// load of the entire vector width starting at the base pointer.
5559+
if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
5560+
LastLoadedElt == (int)(NumElems - 1) && ZeroMask.none()) {
55325561
assert(LDBase && "Did not find base load for merging consecutive loads");
55335562
EVT EltVT = LDBase->getValueType(0);
55345563
// Ensure that the input vector size for the merged loads matches the
@@ -5548,9 +5577,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
55485577
LDBase->getAlignment());
55495578

55505579
if (LDBase->hasAnyUseOfValue(1)) {
5551-
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
5552-
SDValue(LDBase, 1),
5553-
SDValue(NewLd.getNode(), 1));
5580+
SDValue NewChain =
5581+
DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
5582+
SDValue(NewLd.getNode(), 1));
55545583
DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
55555584
DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
55565585
SDValue(NewLd.getNode(), 1));
@@ -5559,11 +5588,14 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
55595588
return NewLd;
55605589
}
55615590

5562-
//TODO: The code below fires only for for loading the low v2i32 / v2f32
5563-
//of a v4i32 / v4f32. It's probably worth generalizing.
5564-
EVT EltVT = VT.getVectorElementType();
5565-
if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
5566-
DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
5591+
int LoadSize =
5592+
(1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
5593+
5594+
// VZEXT_LOAD - consecutive load/undefs followed by zeros/undefs.
5595+
// TODO: The code below fires only for for loading the low 64-bits of a
5596+
// of a 128-bit vector. It's probably worth generalizing more.
5597+
if (IsConsecutiveLoad && FirstLoadedElt == 0 && VT.is128BitVector() &&
5598+
(LoadSize == 64 && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64))) {
55675599
SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
55685600
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
55695601
SDValue ResNode =
@@ -5577,8 +5609,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
55775609
// terms of dependency. We create a TokenFactor for LDBase and ResNode, and
55785610
// update uses of LDBase's output chain to use the TokenFactor.
55795611
if (LDBase->hasAnyUseOfValue(1)) {
5580-
SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
5581-
SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
5612+
SDValue NewChain =
5613+
DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
5614+
SDValue(ResNode.getNode(), 1));
55825615
DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
55835616
DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
55845617
SDValue(ResNode.getNode(), 1));
@@ -6551,15 +6584,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
65516584
if (IsAllConstants)
65526585
return SDValue();
65536586

6554-
// For AVX-length vectors, see if we can use a vector load to get all of the
6555-
// elements, otherwise build the individual 128-bit pieces and use
6556-
// shuffles to put them in place.
6557-
if (VT.is256BitVector() || VT.is512BitVector()) {
6587+
// See if we can use a vector load to get all of the elements.
6588+
if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
65586589
SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
6559-
6560-
// Check for a build vector of consecutive loads.
65616590
if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
65626591
return LD;
6592+
}
6593+
6594+
// For AVX-length vectors, build the individual 128-bit pieces and use
6595+
// shuffles to put them in place.
6596+
if (VT.is256BitVector() || VT.is512BitVector()) {
6597+
SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
65636598

65646599
EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
65656600

@@ -6648,10 +6683,6 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
66486683
for (unsigned i = 0; i < NumElems; ++i)
66496684
V[i] = Op.getOperand(i);
66506685

6651-
// Check for elements which are consecutive loads.
6652-
if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
6653-
return LD;
6654-
66556686
// Check for a build vector from mostly shuffle plus few inserting.
66566687
if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
66576688
return Sh;

‎llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll

+7-45
Original file line numberDiff line numberDiff line change
@@ -347,18 +347,12 @@ define <8 x i16> @merge_8i16_i16_34uuuuuu(i16* %ptr) nounwind uwtable noinline s
347347
define <8 x i16> @merge_8i16_i16_45u7zzzz(i16* %ptr) nounwind uwtable noinline ssp {
348348
; SSE-LABEL: merge_8i16_i16_45u7zzzz:
349349
; SSE: # BB#0:
350-
; SSE-NEXT: pxor %xmm0, %xmm0
351-
; SSE-NEXT: pinsrw $0, 8(%rdi), %xmm0
352-
; SSE-NEXT: pinsrw $1, 10(%rdi), %xmm0
353-
; SSE-NEXT: pinsrw $3, 14(%rdi), %xmm0
350+
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
354351
; SSE-NEXT: retq
355352
;
356353
; AVX-LABEL: merge_8i16_i16_45u7zzzz:
357354
; AVX: # BB#0:
358-
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
359-
; AVX-NEXT: vpinsrw $0, 8(%rdi), %xmm0, %xmm0
360-
; AVX-NEXT: vpinsrw $1, 10(%rdi), %xmm0, %xmm0
361-
; AVX-NEXT: vpinsrw $3, 14(%rdi), %xmm0, %xmm0
355+
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
362356
; AVX-NEXT: retq
363357
%ptr0 = getelementptr inbounds i16, i16* %ptr, i64 4
364358
%ptr1 = getelementptr inbounds i16, i16* %ptr, i64 5
@@ -478,46 +472,14 @@ define <16 x i8> @merge_16i8_i8_01u3uuzzuuuuuzzz(i8* %ptr) nounwind uwtable noin
478472
}
479473

480474
define <16 x i8> @merge_16i8_i8_0123uu67uuuuuzzz(i8* %ptr) nounwind uwtable noinline ssp {
481-
; SSE2-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
482-
; SSE2: # BB#0:
483-
; SSE2-NEXT: movzbl 2(%rdi), %eax
484-
; SSE2-NEXT: movzbl 3(%rdi), %ecx
485-
; SSE2-NEXT: shll $8, %ecx
486-
; SSE2-NEXT: orl %eax, %ecx
487-
; SSE2-NEXT: movzbl (%rdi), %eax
488-
; SSE2-NEXT: movzbl 1(%rdi), %edx
489-
; SSE2-NEXT: shll $8, %edx
490-
; SSE2-NEXT: orl %eax, %edx
491-
; SSE2-NEXT: pxor %xmm0, %xmm0
492-
; SSE2-NEXT: pinsrw $0, %edx, %xmm0
493-
; SSE2-NEXT: pinsrw $1, %ecx, %xmm0
494-
; SSE2-NEXT: movzbl 6(%rdi), %eax
495-
; SSE2-NEXT: movzbl 7(%rdi), %ecx
496-
; SSE2-NEXT: shll $8, %ecx
497-
; SSE2-NEXT: orl %eax, %ecx
498-
; SSE2-NEXT: pinsrw $3, %ecx, %xmm0
499-
; SSE2-NEXT: retq
500-
;
501-
; SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
502-
; SSE41: # BB#0:
503-
; SSE41-NEXT: pxor %xmm0, %xmm0
504-
; SSE41-NEXT: pinsrb $0, (%rdi), %xmm0
505-
; SSE41-NEXT: pinsrb $1, 1(%rdi), %xmm0
506-
; SSE41-NEXT: pinsrb $2, 2(%rdi), %xmm0
507-
; SSE41-NEXT: pinsrb $3, 3(%rdi), %xmm0
508-
; SSE41-NEXT: pinsrb $6, 6(%rdi), %xmm0
509-
; SSE41-NEXT: pinsrb $7, 7(%rdi), %xmm0
510-
; SSE41-NEXT: retq
475+
; SSE-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
476+
; SSE: # BB#0:
477+
; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
478+
; SSE-NEXT: retq
511479
;
512480
; AVX-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
513481
; AVX: # BB#0:
514-
; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
515-
; AVX-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0
516-
; AVX-NEXT: vpinsrb $1, 1(%rdi), %xmm0, %xmm0
517-
; AVX-NEXT: vpinsrb $2, 2(%rdi), %xmm0, %xmm0
518-
; AVX-NEXT: vpinsrb $3, 3(%rdi), %xmm0, %xmm0
519-
; AVX-NEXT: vpinsrb $6, 6(%rdi), %xmm0, %xmm0
520-
; AVX-NEXT: vpinsrb $7, 7(%rdi), %xmm0, %xmm0
482+
; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
521483
; AVX-NEXT: retq
522484
%ptr0 = getelementptr inbounds i8, i8* %ptr, i64 0
523485
%ptr1 = getelementptr inbounds i8, i8* %ptr, i64 1

0 commit comments

Comments (0)