Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -1508,6 +1508,11 @@
     return isZExtFree(Val.getValueType(), VT2);
   }
 
+  /// Return true if folding a load into ExtVal is profitable.
+  virtual bool isVectorExtLdDesirable(SDValue ExtVal) const {
+    return false;
+  }
+
   /// Return true if an fneg operation is free to the point where it is never
   /// worthwhile to replace it with a bitwise operation.
   virtual bool isFNegFree(EVT VT) const {
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5278,17 +5278,17 @@
   }
 
   // fold (sext (load x)) -> (sext (truncate (sextload x)))
-  // None of the supported targets knows how to perform load and sign extend
-  // on vectors in one instruction. We only perform this transformation on
-  // scalars.
-  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
+  if (ISD::isNON_EXTLoad(N0.getNode()) &&
       ISD::isUNINDEXEDLoad(N0.getNode()) &&
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+      ((!LegalOperations && !VT.isVector() &&
+        !cast<LoadSDNode>(N0)->isVolatile()) ||
        TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) {
     bool DoXform = true;
     SmallVector<SDNode*, 4> SetCCs;
     if (!N0.hasOneUse())
       DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
+    if (VT.isVector())
+      DoXform &= TLI.isVectorExtLdDesirable(SDValue(N, 0));
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
@@ -5305,6 +5305,68 @@
     }
   }
 
+  // fold (sext (load x)) -> (sext (truncate (sextload x)))
+  // Only on illegal but splittable vectors.
+  if (ISD::isNON_EXTLoad(N0.getNode()) && VT.isVector() &&
+      ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      (!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile())) {
+    bool DoXform = true;
+    SmallVector<SDNode*, 4> SetCCs;
+    if (VT.isVector())
+      DoXform = TLI.isVectorExtLdDesirable(SDValue(N, 0));
+    if (!N0.hasOneUse())
+      DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
+    if (DoXform) {
+      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+      EVT SrcVT = N0.getValueType();
+      EVT DstVT = VT;
+      EVT SplitSrcVT = SrcVT;
+      EVT SplitDstVT = DstVT;
+      ISD::LoadExtType ExtType = ISD::SEXTLOAD;
+      while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
+             SplitSrcVT.getVectorNumElements() > 1) {
+        SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
+        SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
+      }
+
+      if (TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT)) {
+        SDLoc dl(N);
+        unsigned NumSplits =
+            DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
+        unsigned Stride = SplitSrcVT.getSizeInBits() / 8;
+        SmallVector<SDValue, 4> SplitDsts;
+        SmallVector<SDValue, 4> LoadChains;
+
+        SDValue BasePTR = LN0->getBasePtr();
+        for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
+          SDValue SplitLoad = DAG.getExtLoad(
+              ExtType, dl, SplitDstVT, LN0->getChain(), BasePTR,
+              LN0->getPointerInfo().getWithOffset(Idx * Stride), SplitSrcVT,
+              LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(),
+              LN0->getAlignment(), LN0->getAAInfo());
+
+          BasePTR =
+              DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
+                          DAG.getConstant(Stride, BasePTR.getValueType()));
+
+          SplitDsts.push_back(SplitLoad.getValue(0));
+
+          LoadChains.push_back(SplitLoad.getValue(1));
+        }
+        SDValue NewChain =
+            DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+        SDValue Value = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, SplitDsts);
+
+        CombineTo(N, Value);
+        SDValue Trunc =
+            DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), Value);
+        CombineTo(N0.getNode(), Trunc, NewChain);
+        ExtendSetCCUses(SetCCs, Trunc, Value, SDLoc(N), ISD::SIGN_EXTEND);
+        return SDValue(N, 0); // Return N so it doesn't get rechecked!
+      }
+    }
+  }
+
   // fold (sext (sextload x)) -> (sext (truncate (sextload x)))
   // fold (sext ( extload x)) -> (sext (truncate (sextload x)))
   if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
@@ -5568,17 +5630,16 @@
   }
 
   // fold (zext (load x)) -> (zext (truncate (zextload x)))
-  // None of the supported targets knows how to perform load and vector_zext
-  // on vectors in one instruction. We only perform this transformation on
-  // scalars.
-  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
-      ISD::isUNINDEXEDLoad(N0.getNode()) &&
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+  if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      ((!LegalOperations && !VT.isVector() &&
+        !cast<LoadSDNode>(N0)->isVolatile()) ||
        TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) {
     bool DoXform = true;
     SmallVector<SDNode*, 4> SetCCs;
     if (!N0.hasOneUse())
       DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
+    if (VT.isVector())
+      DoXform &= TLI.isVectorExtLdDesirable(SDValue(N, 0));
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
@@ -5596,6 +5657,68 @@
     }
   }
 
+  // fold (zext (load x)) -> (zext (truncate (zextload x)))
+  // Only on illegal but splittable vectors.
+  if (ISD::isNON_EXTLoad(N0.getNode()) && VT.isVector() &&
+      ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      (!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile())) {
+    bool DoXform = true;
+    SmallVector<SDNode*, 4> SetCCs;
+    if (VT.isVector())
+      DoXform = TLI.isVectorExtLdDesirable(SDValue(N, 0));
+    if (!N0.hasOneUse())
+      DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
+    if (DoXform) {
+      LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+      EVT SrcVT = N0.getValueType();
+      EVT DstVT = VT;
+      EVT SplitSrcVT = SrcVT;
+      EVT SplitDstVT = DstVT;
+      ISD::LoadExtType ExtType = ISD::ZEXTLOAD;
+      while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
+             SplitSrcVT.getVectorNumElements() > 1) {
+        SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
+        SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
+      }
+
+      if (TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT)) {
+        SDLoc dl(N);
+        unsigned NumSplits =
+            DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
+        unsigned Stride = SplitSrcVT.getSizeInBits() / 8;
+        SmallVector<SDValue, 4> SplitDsts;
+        SmallVector<SDValue, 4> LoadChains;
+
+        SDValue BasePTR = LN0->getBasePtr();
+        for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
+          SDValue SplitLoad = DAG.getExtLoad(
+              ExtType, dl, SplitDstVT, LN0->getChain(), BasePTR,
+              LN0->getPointerInfo().getWithOffset(Idx * Stride), SplitSrcVT,
+              LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(),
+              LN0->getAlignment(), LN0->getAAInfo());
+
+          BasePTR =
+              DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
+                          DAG.getConstant(Stride, BasePTR.getValueType()));
+
+          SplitDsts.push_back(SplitLoad.getValue(0));
+
+          LoadChains.push_back(SplitLoad.getValue(1));
+        }
+        SDValue NewChain =
+            DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+        SDValue Value = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, SplitDsts);
+
+        CombineTo(N, Value);
+        SDValue Trunc =
+            DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), Value);
+        CombineTo(N0.getNode(), Trunc, NewChain);
+        ExtendSetCCUses(SetCCs, Trunc, Value, SDLoc(N), ISD::ZERO_EXTEND);
+        return SDValue(N, 0); // Return N so it doesn't get rechecked!
+      }
+    }
+  }
+
   // fold (zext (and/or/xor (load x), cst)) ->
   //      (and/or/xor (zextload x), (zext cst))
   if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -732,6 +732,8 @@
     bool isZExtFree(EVT VT1, EVT VT2) const override;
     bool isZExtFree(SDValue Val, EVT VT2) const override;
 
+    bool isVectorExtLdDesirable(SDValue) const override;
+
     /// Return true if an FMA operation is faster than a pair of fmul and fadd
     /// instructions. fmuladd intrinsics will be expanded to FMAs when this
     /// method returns true, otherwise fmuladd is expanded to fmul + fadd.
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -15931,6 +15931,7 @@
 // may emit an illegal shuffle but the expansion is still better than scalar
 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
 // we'll emit a shuffle and a arithmetic shift.
+// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
 // TODO: It is possible to support ZExt by zeroing the undef values during
 // the shuffle phase or after the shuffle.
 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget *Subtarget,
@@ -20105,6 +20106,10 @@
   return false;
 }
 
+bool X86TargetLowering::isVectorExtLdDesirable(SDValue) const {
+  return true;
+}
+
 bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   if (!(Subtarget->hasFMA() || Subtarget->hasFMA4()))
Index: test/CodeGen/X86/vector-sext.ll
===================================================================
--- test/CodeGen/X86/vector-sext.ll
+++ test/CodeGen/X86/vector-sext.ll
@@ -523,46 +523,35 @@
 define <16 x i16> @sext_16i8_to_16i16(<16 x i8> *%ptr) {
 ; SSE2-LABEL: sext_16i8_to_16i16:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movdqa (%rdi), %xmm1
-; SSE2-NEXT: movdqa %xmm1, %xmm0
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT: psllw $8, %xmm0
+; SSE2-NEXT: movq (%rdi), %xmm0
+; SSE2-NEXT: punpcklbw %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT: psraw $8, %xmm0
-; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT: psllw $8, %xmm1
+; SSE2-NEXT: movq 8(%rdi), %xmm1
+; SSE2-NEXT: punpcklbw %xmm1, %xmm1 # xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT: psraw $8, %xmm1
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: sext_16i8_to_16i16:
 ; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movdqa (%rdi), %xmm1
-; SSSE3-NEXT: movdqa %xmm1, %xmm0
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSSE3-NEXT: psllw $8, %xmm0
+; SSSE3-NEXT: movq (%rdi), %xmm0
+; SSSE3-NEXT: punpcklbw %xmm0, %xmm0 # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT: psraw $8, %xmm0
-; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSSE3-NEXT: psllw $8, %xmm1
+; SSSE3-NEXT: movq 8(%rdi), %xmm1
+; SSSE3-NEXT: punpcklbw %xmm1, %xmm1 # xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSSE3-NEXT: psraw $8, %xmm1
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: sext_16i8_to_16i16:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxbw %xmm1, %xmm0
-; SSE41-NEXT: psllw $8, %xmm0
-; SSE41-NEXT: psraw $8, %xmm0
-; SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: psllw $8, %xmm1
-; SSE41-NEXT: psraw $8, %xmm1
+; SSE41-NEXT: pmovsxbw (%rdi), %xmm0
+; SSE41-NEXT: pmovsxbw 8(%rdi), %xmm1
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: sext_16i8_to_16i16:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
-; AVX1-NEXT: vpmovsxbw %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpmovsxbw (%rdi), %xmm0
+; AVX1-NEXT: vpmovsxbw 8(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: sext_16i8_to_16i16:
@@ -573,13 +562,8 @@
 ; X32-SSE41-LABEL: sext_16i8_to_16i16:
 ; X32-SSE41: # BB#0: # %entry
 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movdqa (%eax), %xmm1
-; X32-SSE41-NEXT: pmovzxbw %xmm1, %xmm0
-; X32-SSE41-NEXT: psllw $8, %xmm0
-; X32-SSE41-NEXT: psraw $8, %xmm0
-; X32-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; X32-SSE41-NEXT: psllw $8, %xmm1
-; X32-SSE41-NEXT: psraw $8, %xmm1
+; X32-SSE41-NEXT: pmovsxbw (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxbw 8(%eax), %xmm1
 ; X32-SSE41-NEXT: retl
 entry:
   %X = load <16 x i8>* %ptr
@@ -705,73 +689,36 @@
 define <4 x i64> @load_sext_4i8_to_4i64(<4 x i8> *%ptr) {
 ; SSE2-LABEL: load_sext_4i8_to_4i64:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movd (%rdi), %xmm1
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: movsbq 1(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movsbq (%rdi), %rax
 ; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: punpcklqdq %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movsbq 3(%rdi), %rax
 ; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
+; SSE2-NEXT: movsbq 2(%rdi), %rax
 ; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movsbq %al, %rax
-; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: punpcklqdq %xmm2, %xmm1 # xmm1 = xmm1[0],xmm2[0]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: load_sext_4i8_to_4i64:
 ; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movd (%rdi), %xmm1
-; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: movsbq 1(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movsbq (%rdi), %rax
 ; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: punpcklqdq %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movsbq 3(%rdi), %rax
 ; SSSE3-NEXT: movd %rax, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
+; SSSE3-NEXT: movsbq 2(%rdi), %rax
 ; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movsbq %al, %rax
-; SSSE3-NEXT: movd %rax, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT: punpcklqdq %xmm2, %xmm1 # xmm1 = xmm1[0],xmm2[0]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: load_sext_4i8_to_4i64:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: pmovzxbd (%rdi), %xmm1
-; SSE41-NEXT: pmovzxdq %xmm1, %xmm0
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm1, %rax
-; SSE41-NEXT: movsbq %al, %rax
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT: pmovsxbq (%rdi), %xmm0
+; SSE41-NEXT: pmovsxbq 2(%rdi), %xmm1
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: load_sext_4i8_to_4i64:
@@ -791,30 +738,8 @@
 ; X32-SSE41-LABEL: load_sext_4i8_to_4i64:
 ; X32-SSE41: # BB#0: # %entry
 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movd (%eax), %xmm0
-; X32-SSE41-NEXT: pmovzxbd %xmm0, %xmm1
-; X32-SSE41-NEXT: pmovzxbq %xmm0, %xmm2
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: movd %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: movd %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: movsbl %al, %eax
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT: pmovsxbq (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxbq 2(%eax), %xmm1
 ; X32-SSE41-NEXT: retl
 entry:
   %X = load <4 x i8>* %ptr
@@ -825,72 +750,36 @@
 define <4 x i64> @load_sext_4i16_to_4i64(<4 x i16> *%ptr) {
 ; SSE2-LABEL: load_sext_4i16_to_4i64:
 ; SSE2: # BB#0: # %entry
-; SSE2-NEXT: movq (%rdi), %xmm1
-; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: movswq 2(%rdi), %rax
+; SSE2-NEXT: movd %rax, %xmm1
+; SSE2-NEXT: movswq (%rdi), %rax
 ; SSE2-NEXT: movd %rax, %xmm0
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: punpcklqdq %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT: movswq 6(%rdi), %rax
 ; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
+; SSE2-NEXT: movswq 4(%rdi), %rax
 ; SSE2-NEXT: movd %rax, %xmm1
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSE2-NEXT: movd %xmm2, %rax
-; SSE2-NEXT: movswq %ax, %rax
-; SSE2-NEXT: movd %rax, %xmm2
-; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE2-NEXT: punpcklqdq %xmm2, %xmm1 # xmm1 = xmm1[0],xmm2[0]
 ; SSE2-NEXT: retq
 ;
 ; SSSE3-LABEL: load_sext_4i16_to_4i64:
 ; SSSE3: # BB#0: # %entry
-; SSSE3-NEXT: movq (%rdi), %xmm1
-; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,1,1,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: movswq 2(%rdi), %rax
+; SSSE3-NEXT: movd %rax, %xmm1
+; SSSE3-NEXT: movswq (%rdi), %rax
 ; SSSE3-NEXT: movd %rax, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: punpcklqdq %xmm1, %xmm0 # xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT: movswq 6(%rdi), %rax
 ; SSSE3-NEXT: movd %rax, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
+; SSSE3-NEXT: movswq 4(%rdi), %rax
 ; SSSE3-NEXT: movd %rax, %xmm1
-; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
-; SSSE3-NEXT: movd %xmm2, %rax
-; SSSE3-NEXT: movswq %ax, %rax
-; SSSE3-NEXT: movd %rax, %xmm2
-; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSSE3-NEXT: punpcklqdq %xmm2, %xmm1 # xmm1 = xmm1[0],xmm2[0]
 ; SSSE3-NEXT: retq
 ;
 ; SSE41-LABEL: load_sext_4i16_to_4i64:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movq (%rdi), %xmm0
-; SSE41-NEXT: pmovzxwd %xmm0, %xmm1
-; SSE41-NEXT: pmovzxwq %xmm0, %xmm0
-; SSE41-NEXT: pextrq $1, %xmm0, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm0, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: pextrq $1, %xmm1, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm2
-; SSE41-NEXT: movd %xmm1, %rax
-; SSE41-NEXT: movswq %ax, %rax
-; SSE41-NEXT: movd %rax, %xmm1
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
+; SSE41-NEXT: pmovsxwq (%rdi), %xmm0
+; SSE41-NEXT: pmovsxwq 4(%rdi), %xmm1
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: load_sext_4i16_to_4i64:
@@ -910,30 +799,8 @@
 ; X32-SSE41-LABEL: load_sext_4i16_to_4i64:
 ; X32-SSE41: # BB#0: # %entry
 ; X32-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-SSE41-NEXT: movsd (%eax), %xmm0
-; X32-SSE41-NEXT: pmovzxwd %xmm0, %xmm1
-; X32-SSE41-NEXT: pmovzxwq %xmm0, %xmm2
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: movd %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm0
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm0
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm0
-; X32-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
-; X32-SSE41-NEXT: movd %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: movd %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $1, %eax, %xmm1
-; X32-SSE41-NEXT: pextrd $2, %xmm2, %eax
-; X32-SSE41-NEXT: cwtl
-; X32-SSE41-NEXT: pinsrd $2, %eax, %xmm1
-; X32-SSE41-NEXT: sarl $31, %eax
-; X32-SSE41-NEXT: pinsrd $3, %eax, %xmm1
+; X32-SSE41-NEXT: pmovsxwq (%eax), %xmm0
+; X32-SSE41-NEXT: pmovsxwq 4(%eax), %xmm1
 ; X32-SSE41-NEXT: retl
 entry:
   %X = load <4 x i16>* %ptr
Index: test/CodeGen/X86/vector-zext.ll
===================================================================
--- test/CodeGen/X86/vector-zext.ll
+++ test/CodeGen/X86/vector-zext.ll
@@ -230,21 +230,15 @@
 ; SSE41-LABEL: load_zext_16i8_to_16i16:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxbw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: punpckhbw %xmm1, %xmm1 # xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pmovzxbw (%rdi), %xmm0
+; SSE41-NEXT: pmovzxbw 8(%rdi), %xmm1
 ; SSE41-NEXT: retq
 
 ; AVX1-LABEL: load_zext_16i8_to_16i16:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhbw %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT: vpmovzxbw %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxbw (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxbw 8(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 
 ; AVX2-LABEL: load_zext_16i8_to_16i16:
@@ -282,21 +276,15 @@
 ; SSE41-LABEL: load_zext_8i16_to_8i32:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxwd %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: punpckhwd %xmm1, %xmm1 # xmm1 = xmm1[4,4,5,5,6,6,7,7]
-; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pmovzxwd (%rdi), %xmm0
+; SSE41-NEXT: pmovzxwd 8(%rdi), %xmm1
 ; SSE41-NEXT: retq
 
 ; AVX1-LABEL: load_zext_8i16_to_8i32:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhwd %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT: vpmovzxwd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpmovzxwd (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxwd 8(%rdi), %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 
 ; AVX2-LABEL: load_zext_8i16_to_8i32:
@@ -332,20 +320,14 @@
 ; SSE41-LABEL: load_zext_4i32_to_4i64:
 ; SSE41: # BB#0: # %entry
-; SSE41-NEXT: movdqa (%rdi), %xmm1
-; SSE41-NEXT: pmovzxdq %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,4294967295]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: pshufd $-6, %xmm1, %xmm1 # xmm1 = xmm1[2,2,3,3]
-; SSE41-NEXT: pand %xmm2, %xmm1
+; SSE41-NEXT: pmovzxdq (%rdi), %xmm0
+; SSE41-NEXT: pmovzxdq 8(%rdi), %xmm1
 ; SSE41-NEXT: retq
 
 ; AVX1-LABEL: load_zext_4i32_to_4i64:
 ; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vmovdqa (%rdi), %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckhdq %xmm1, %xmm0, %xmm1 # xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0
+; AVX1-NEXT: vpmovzxdq (%rdi), %xmm0
+; AVX1-NEXT: vpmovzxdq 8(%rdi), %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT: retq
Index: test/CodeGen/X86/widen_load-2.ll
===================================================================
--- test/CodeGen/X86/widen_load-2.ll
+++ test/CodeGen/X86/widen_load-2.ll
@@ -73,8 +73,10 @@
 %i16vec3 = type <3 x i16>
 define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp) nounwind {
 ; CHECK-LABEL: add3i16:
-; CHECK: pmovzxwd (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: pmovzxwd (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK: movq (%rsi), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: pmovzxwd %[[R0]], %[[R0]]
+; CHECK-NEXT: movq (%rdx), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: pmovzxwd %[[R1]], %[[R1]]
 ; CHECK-NEXT: paddd %[[R0]], %[[R1]]
 ; CHECK-NEXT: movdqa %[[R1]], %[[R0]]
 ; CHECK-NEXT: pshufb {{.*}}, %[[R0]]
@@ -141,8 +143,10 @@
 %i8vec3 = type <3 x i8>
 define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) nounwind {
 ; CHECK-LABEL: add3i8:
-; CHECK: pmovzxbd (%{{.*}}), %[[R0:xmm[0-9]+]]
-; CHECK-NEXT: pmovzxbd (%{{.*}}), %[[R1:xmm[0-9]+]]
+; CHECK: movd (%rsi), %[[R0:xmm[0-9]+]]
+; CHECK-NEXT: pmovzxbd %[[R0]], %[[R0]]
+; CHECK-NEXT: movd (%rdx), %[[R1:xmm[0-9]+]]
+; CHECK-NEXT: pmovzxbd %[[R1]], %[[R1]]
 ; CHECK-NEXT: paddd %[[R0]], %[[R1]]
 ; CHECK-NEXT: movdqa %[[R1]], %[[R0]]
 ; CHECK-NEXT: pshufb {{.*}}, %[[R0]]
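
Note (reviewer sketch, not part of the patch): the new combine targets a sign or zero extension of a loaded vector whose extended type is illegal but can be split into legal extending loads. A minimal IR example, mirroring the load_sext_4i16_to_4i64 test updated above; the function name is illustrative, and the old-style load syntax matches these tests:

define <4 x i64> @split_sextload_sketch(<4 x i16>* %p) {
entry:
  ; With this patch and SSE4.1, the <4 x i16> -> <4 x i64> extension is
  ; selected as two memory-operand pmovsxwq instructions (offsets 0 and 4)
  ; instead of being scalarized element by element.
  %v = load <4 x i16>* %p
  %ext = sext <4 x i16> %v to <4 x i64>
  ret <4 x i64> %ext
}

X86 opts in by overriding isVectorExtLdDesirable() to return true unconditionally; targets that do not override it keep the previous scalar-only behavior, since the default implementation returns false.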