Index: llvm/trunk/include/llvm/Target/TargetLowering.h
===================================================================
--- llvm/trunk/include/llvm/Target/TargetLowering.h
+++ llvm/trunk/include/llvm/Target/TargetLowering.h
@@ -1518,6 +1518,10 @@
     return false;
   }
 
+  /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+  /// extend node) is profitable.
+  virtual bool isVectorLoadExtDesirable(SDValue ExtVal) const { return false; }
+
   /// Return true if an fneg operation is free to the point where it is never
   /// worthwhile to replace it with a bitwise operation.
   virtual bool isFNegFree(EVT VT) const {
Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -327,6 +327,7 @@
     SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp, unsigned HiOp);
     SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
+    SDValue CombineExtLoad(SDNode *N);
     SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
     SDValue BuildSDIV(SDNode *N);
     SDValue BuildSDIVPow2(SDNode *N);
@@ -5307,6 +5308,102 @@
   }
 }
 
+// FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
+SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT DstVT = N->getValueType(0);
+  EVT SrcVT = N0.getValueType();
+
+  assert((N->getOpcode() == ISD::SIGN_EXTEND ||
+          N->getOpcode() == ISD::ZERO_EXTEND) &&
+         "Unexpected node type (not an extend)!");
+
+  // fold (sext (load x)) to multiple smaller sextloads; same for zext.
+  // For example, on a target with legal v4i32, but illegal v8i32, turn:
+  //   (v8i32 (sext (v8i16 (load x))))
+  // into:
+  //   (v8i32 (concat_vectors (v4i32 (sextload x)),
+  //                          (v4i32 (sextload (x + 16)))))
+  // Where uses of the original load, i.e.:
+  //   (v8i16 (load x))
+  // are replaced with:
+  //   (v8i16 (truncate
+  //     (v8i32 (concat_vectors (v4i32 (sextload x)),
+  //                            (v4i32 (sextload (x + 16)))))))
+  //
+  // This combine is only applicable to illegal, but splittable, vectors.
+  // All legal types, and illegal non-vector types, are handled elsewhere.
+  // This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
+  //
+  if (N0->getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+
+  if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
+      !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() ||
+      !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
+    return SDValue();
+
+  SmallVector<SDNode *, 4> SetCCs;
+  if (!ExtendUsesToFormExtLoad(N, N0, N->getOpcode(), SetCCs, TLI))
+    return SDValue();
+
+  ISD::LoadExtType ExtType =
+      N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+
+  // Try to split the vector types to get down to legal types.
+  EVT SplitSrcVT = SrcVT;
+  EVT SplitDstVT = DstVT;
+  while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
+         SplitSrcVT.getVectorNumElements() > 1) {
+    SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
+    SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
+  }
+
+  if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
+    return SDValue();
+
+  SDLoc DL(N);
+  const unsigned NumSplits =
+      DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
+  const unsigned Stride = SplitSrcVT.getStoreSize();
+  SmallVector<SDValue, 4> Loads;
+  SmallVector<SDValue, 4> Chains;
+
+  SDValue BasePtr = LN0->getBasePtr();
+  for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
+    const unsigned Offset = Idx * Stride;
+    const unsigned Align = MinAlign(LN0->getAlignment(), Offset);
+
+    SDValue SplitLoad = DAG.getExtLoad(
+        ExtType, DL, SplitDstVT, LN0->getChain(), BasePtr,
+        LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT,
+        LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(),
+        Align, LN0->getAAInfo());
+
+    BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
+                          DAG.getConstant(Stride, BasePtr.getValueType()));
+
+    Loads.push_back(SplitLoad.getValue(0));
+    Chains.push_back(SplitLoad.getValue(1));
+  }
+
+  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
+
+  CombineTo(N, NewValue);
+
+  // Replace uses of the original load (before extension)
+  // with a truncate of the concatenated sextloaded vectors.
+  SDValue Trunc =
+      DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
+  CombineTo(N0.getNode(), Trunc, NewChain);
+  ExtendSetCCUses(SetCCs, Trunc, NewValue, DL,
+                  (ISD::NodeType)N->getOpcode());
+  return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
 SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -5373,17 +5470,18 @@
   }
 
   // fold (sext (load x)) -> (sext (truncate (sextload x)))
-  // None of the supported targets knows how to perform load and sign extend
-  // on vectors in one instruction. We only perform this transformation on
-  // scalars.
-  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
-      ISD::isUNINDEXEDLoad(N0.getNode()) &&
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+  // Only generate vector extloads when 1) they're legal, and 2) they are
+  // deemed desirable by the target.
+  if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      ((!LegalOperations && !VT.isVector() &&
+        !cast<LoadSDNode>(N0)->isVolatile()) ||
       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) {
     bool DoXform = true;
     SmallVector<SDNode *, 4> SetCCs;
     if (!N0.hasOneUse())
       DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
+    if (VT.isVector())
+      DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
@@ -5400,6 +5498,11 @@
     }
   }
 
+  // fold (sext (load x)) to multiple smaller sextloads.
+  // Only on illegal but splittable vectors.
+  if (SDValue ExtLoad = CombineExtLoad(N))
+    return ExtLoad;
+
   // fold (sext (sextload x)) -> (sext (truncate (sextload x)))
   // fold (sext ( extload x)) -> (sext (truncate (sextload x)))
   if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
@@ -5663,17 +5766,18 @@
   }
 
   // fold (zext (load x)) -> (zext (truncate (zextload x)))
-  // None of the supported targets knows how to perform load and vector_zext
-  // on vectors in one instruction. We only perform this transformation on
-  // scalars.
-  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
-      ISD::isUNINDEXEDLoad(N0.getNode()) &&
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+  // Only generate vector extloads when 1) they're legal, and 2) they are
+  // deemed desirable by the target.
+  if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      ((!LegalOperations && !VT.isVector() &&
+        !cast<LoadSDNode>(N0)->isVolatile()) ||
       TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) {
     bool DoXform = true;
     SmallVector<SDNode *, 4> SetCCs;
     if (!N0.hasOneUse())
       DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
+    if (VT.isVector())
+      DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
@@ -5691,6 +5795,11 @@
     }
   }
 
+  // fold (zext (load x)) to multiple smaller zextloads.
+  // Only on illegal but splittable vectors.
+  if (SDValue ExtLoad = CombineExtLoad(N))
+    return ExtLoad;
+
   // fold (zext (and/or/xor (load x), cst)) ->
   //   (and/or/xor (zextload x), (zext cst))
   if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.h
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.h
@@ -744,6 +744,10 @@
     bool isZExtFree(EVT VT1, EVT VT2) const override;
     bool isZExtFree(SDValue Val, EVT VT2) const override;
 
+    /// Return true if folding a vector load into ExtVal (a sign, zero, or any
+    /// extend node) is profitable.
+    bool isVectorLoadExtDesirable(SDValue) const override;
+
     /// Return true if an FMA operation is faster than a pair of fmul and fadd
     /// instructions. fmuladd intrinsics will be expanded to FMAs when this
     /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -16294,6 +16294,7 @@
 // may emit an illegal shuffle but the expansion is still better than scalar
 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
 // we'll emit a shuffle and a arithmetic shift.
+// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
 // TODO: It is possible to support ZExt by zeroing the undef values during
 // the shuffle phase or after the shuffle.
 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
@@ -20399,6 +20400,8 @@
   return false;
 }
 
+bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
+
 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
Index: llvm/trunk/test/CodeGen/X86/vector-sext.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-sext.ll
+++ llvm/trunk/test/CodeGen/X86/vector-sext.ll
@@ -523,46 +523,35 @@
 define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
 ; SSE2-LABEL: sext_16i8_to_16i16:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
+; SSE2-NEXT: movq (%rdi), %xmm0
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psllw $8, %xmm0
 ; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: movq 8(%rdi), %xmm1
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT: psraw $8, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: sext_16i8_to_16i16:
 ; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
+; SSSE3-NEXT: movq (%rdi), %xmm0
 ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: psllw $8, %xmm0
 ; SSSE3-NEXT: psraw $8, %xmm0
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: psllw $8, %xmm1
+; SSSE3-NEXT: movq 8(%rdi), %xmm1
+; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT: psraw $8, %xmm1
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: sext_16i8_to_16i16:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxbw %xmm1, %xmm0
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
+; SSE41-NEXT: pmovsxbw 8(%rdi), %xmm1
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: sext_16i8_to_16i16:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: sext_16i8_to_16i16:
@@ -573,13 +562,8 @@
 ; X32-SSE41-LABEL: sext_16i8_to_16i16:
 ; X32-SSE41: # BB#0: # %entry
 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movdqa (%eax), %xmm1
-; X32-SSE41-NEXT: pmovzxbw %xmm1, %xmm0
-; X32-SSE41-NEXT: psllw $8, %xmm0
-; X32-SSE41-NEXT: psraw $8, %xmm0
-; X32-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X32-SSE41-NEXT: psllw $8, %xmm1
-; X32-SSE41-NEXT: psraw $8, %xmm1
+; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxbw 8(%eax), %xmm1
 ; X32-SSE41-NEXT: retl
 entry:
  %X = load <16 x i8>* %ptr
@@ -705,73 +689,36 @@
 define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
 ; SSE2-LABEL: load_sext_4i8_to_4i64:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movd (%rdi), %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: movsbq 1(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movsbq (%rdi), %rax
 ; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
-; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movsbq 3(%rdi), %rax
 ; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: movsbq 2(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: load_sext_4i8_to_4i64:
 ; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movd (%rdi), %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: movsbq 1(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movsbq (%rdi), %rax
 ; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
-; SSSE3-NEXT: movd %rax, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
-; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movsbq 3(%rdi), %rax
 ; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: movsbq 2(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: load_sext_4i8_to_4i64:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: pmovzxbd (%rdi), %xmm1
-; SSE41-NEXT: pmovzxdq %xmm1, %xmm0
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm1, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: load_sext_4i8_to_4i64:
@@ -791,30 +738,8 @@
 ; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
 ; X32-SSE41: # BB#0: # %entry
 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movd (%eax), %xmm0
-; X32-SSE41-NEXT: pmovzxbd %xmm0, %xmm1
-; X32-SSE41-NEXT: pmovzxbq %xmm0, %xmm2
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: movd %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: movd %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1
 ; X32-SSE41-NEXT: retl
 entry:
  %X = load <4 x i8>* %ptr
@@ -825,72 +750,36 @@
 define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
 ; SSE2-LABEL: load_sext_4i16_to_4i64:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movq (%rdi), %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: movswq 2(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movswq (%rdi), %rax
 ; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
-; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
-; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movswq 6(%rdi), %rax
 ; SSE2-NEXT: movd %rax, %xmm2
+; SSE2-NEXT: movswq 4(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: load_sext_4i16_to_4i64:
 ; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movq (%rdi), %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: movswq 2(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movswq (%rdi), %rax
 ; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
-; SSSE3-NEXT: movd %rax, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
-; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movswq 6(%rdi), %rax
 ; SSSE3-NEXT: movd %rax, %xmm2
+; SSSE3-NEXT: movswq 4(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: load_sext_4i16_to_4i64:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movq (%rdi), %xmm0
-; SSE41-NEXT: pmovzxwd %xmm0, %xmm1
-; SSE41-NEXT: pmovzxwq %xmm0, %xmm0
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm1, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
+; SSE41-NEXT: pmovsxwq 4(%rdi), %xmm1
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: load_sext_4i16_to_4i64:
@@ -910,30 +799,8 @@
 ; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
 ; X32-SSE41: # BB#0: # %entry
 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movsd (%eax), %xmm0
-; X32-SSE41-NEXT: pmovzxwd %xmm0, %xmm1
-; X32-SSE41-NEXT: pmovzxwq %xmm0, %xmm2
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: movd %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: movd %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxwq 4(%eax), %xmm1
 ; X32-SSE41-NEXT: retl
 entry:
  %X = load <4 x i16>* %ptr
Index: llvm/trunk/test/CodeGen/X86/vector-zext.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-zext.ll
+++ llvm/trunk/test/CodeGen/X86/vector-zext.ll
@@ -230,20 +230,14 @@
 ; SSE41-LABEL: load_zext_16i8_to_16i16:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxbw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: punpckhbw %xmm1, %xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pmovzxbw (%rdi), %xmm0
+; SSE41-NEXT: pmovzxbw 8(%rdi), %xmm1
 ; SSE41-NEXT: retq
 
 ; AVX1-LABEL: load_zext_16i8_to_16i16:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmovzxbw %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxbw (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxbw 8(%rdi), %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -282,20 +276,14 @@
 ; SSE41-LABEL: load_zext_8i16_to_8i32:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxwd %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: punpckhwd %xmm1, %xmm1 # xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pmovzxwd (%rdi), %xmm0
+; SSE41-NEXT: pmovzxwd 8(%rdi), %xmm1
 ; SSE41-NEXT: retq
 
 ; AVX1-LABEL: load_zext_8i16_to_8i32:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxwd (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxwd 8(%rdi), %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
@@ -332,20 +320,14 @@
 ; SSE41-LABEL: load_zext_4i32_to_4i64:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxdq %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pshufd $250, %xmm1, %xmm1 # xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pmovzxdq (%rdi), %xmm0
+; SSE41-NEXT: pmovzxdq 8(%rdi), %xmm1
 ; SSE41-NEXT: retq
 
 ; AVX1-LABEL: load_zext_4i32_to_4i64:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxdq (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxdq 8(%rdi), %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
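
For reference, a minimal IR sketch (not part of the patch) of the kind of case the new combine targets, written in the style of the existing vector-sext.ll tests; the function name and RUN line below are illustrative assumptions, not taken from the patch. On an SSE4.1 target, <8 x i32> is illegal but the two v4i32 sextloads produced by the split are legal, so the expected selection is two folded pmovsxwd loads rather than a 128-bit load followed by unpack/shift sequences.

; Illustrative only; assumed RUN line:
;   llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1
define <8 x i32> @load_sext_8i16_to_8i32(<8 x i16> *%ptr) {
entry:
 %X = load <8 x i16>* %ptr
 %Y = sext <8 x i16> %X to <8 x i32>
 ret <8 x i32> %Y
}
; Expected (roughly), analogous to the load_zext_8i16_to_8i32 checks above:
;   pmovsxwd (%rdi), %xmm0
;   pmovsxwd 8(%rdi), %xmm1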