Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -868,6 +868,13 @@
                                              const SelectionDAG &DAG,
                                              unsigned Depth) const override;
 
+    bool SimplifyDemandedVectorEltsForTargetNode(SDValue Op,
+                                                 const APInt &DemandedElts,
+                                                 APInt &KnownUndef,
+                                                 APInt &KnownZero,
+                                                 TargetLoweringOpt &TLO,
+                                                 unsigned Depth) const override;
+
     SDValue unwrapAddress(SDValue N) const override;
 
     bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -31748,11 +31748,104 @@
             {Op}, 0, Op, {0}, {}, /*Depth*/ 1, /*HasVarMask*/ false,
             /*AllowVarMask*/ true, DAG, Subtarget))
       return Res;
+
+    // Simplify source operands based on shuffle mask.
+    // TODO - merge this into combineX86ShufflesRecursively.
+    APInt KnownUndef, KnownZero;
+    APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+    if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, KnownUndef, KnownZero, DCI))
+      return SDValue(N, 0);
   }
 
   return SDValue();
 }
 
+bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
+    SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
+    TargetLoweringOpt &TLO, unsigned Depth) const {
+  int NumElts = DemandedElts.getBitWidth();
+  unsigned Opc = Op.getOpcode();
+  EVT VT = Op.getValueType();
+
+  // Handle special case opcodes.
+  switch (Opc) {
+  case X86ISD::VBROADCAST: {
+    SDValue Src = Op.getOperand(0);
+    MVT SrcVT = Src.getSimpleValueType();
+    if (!SrcVT.isVector())
+      return false;
+    APInt SrcUndef, SrcZero;
+    APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0);
+    if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
+                                   Depth + 1))
+      return true;
+    break;
+  }
+  }
+
+  // Simplify target shuffles.
+  if (!isTargetShuffle(Opc))
+    return false;
+
+  // Get target shuffle mask.
+  SmallVector<int, 64> OpMask;
+  SmallVector<SDValue, 2> OpInputs;
+  if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, TLO.DAG))
+    return false;
+
+  // Shuffle inputs must be the same type as the result.
+  if (llvm::any_of(OpInputs,
+                   [VT](SDValue V) { return VT != V.getValueType(); }))
+    return false;
+
+  // Attempt to simplify inputs.
+  int NumSrcs = OpInputs.size();
+  for (int Src = 0; Src != NumSrcs; ++Src) {
+    int Lo = Src * NumElts;
+    APInt SrcElts = APInt::getNullValue(NumElts);
+    for (int i = 0; i != NumElts; ++i)
+      if (DemandedElts[i]) {
+        int M = OpMask[i] - Lo;
+        if (0 <= M && M < NumElts)
+          SrcElts.setBit(M);
+      }
+
+    APInt SrcUndef, SrcZero;
+    if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero,
+                                   TLO, Depth + 1))
+      return true;
+  }
+
+  // Check if shuffle mask can be simplified to undef/zero/identity.
+  for (int i = 0; i != NumElts; ++i)
+    if (!DemandedElts[i])
+      OpMask[i] = SM_SentinelUndef;
+
+  if (isUndefInRange(OpMask, 0, NumElts)) {
+    KnownUndef.setAllBits();
+    return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
+  }
+  if (isUndefOrZeroInRange(OpMask, 0, NumElts)) {
+    KnownZero.setAllBits();
+    return TLO.CombineTo(
+        Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op)));
+  }
+  for (int Src = 0; Src != NumSrcs; ++Src)
+    if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts))
+      return TLO.CombineTo(Op, OpInputs[Src]);
+
+  // Extract known zero/undef elements.
+  // TODO - Propagate input undef/zero elts.
+ for (int i = 0; i != NumElts; ++i) { + if (OpMask[i] == SM_SentinelUndef) + KnownUndef.setBit(i); + if (OpMask[i] == SM_SentinelZero) + KnownZero.setBit(i); + } + + return false; +} + /// Check if a vector extract from a target-specific shuffle of a load can be /// folded into a single element load. /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but Index: test/CodeGen/X86/2012-01-12-extract-sv.ll =================================================================== --- test/CodeGen/X86/2012-01-12-extract-sv.ll +++ test/CodeGen/X86/2012-01-12-extract-sv.ll @@ -6,9 +6,8 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovaps (%eax), %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1] -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 Index: test/CodeGen/X86/combine-sdiv.ll =================================================================== --- test/CodeGen/X86/combine-sdiv.ll +++ test/CodeGen/X86/combine-sdiv.ll @@ -1160,10 +1160,8 @@ ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: psrad $3, %xmm3 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $2, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] +; SSE2-NEXT: psrad $2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq @@ -1183,14 +1181,13 @@ ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] ; SSE41-NEXT: paddd %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm2, %xmm1 +; SSE41-NEXT: psrad $4, %xmm1 ; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: psrad $3, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: psrad $4, %xmm2 -; SSE41-NEXT: psrad $2, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3],xmm3[4,5],xmm1[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7] +; SSE41-NEXT: psrad $2, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_v4i32: @@ -1207,8 +1204,7 @@ ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpsrad $3, %xmm1, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX1-NEXT: retq @@ -1253,10 +1249,8 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm4 ; SSE2-NEXT: psrad $3, %xmm4 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: psrad $2, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3] +; 
SSE2-NEXT: psrad $2, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; SSE2-NEXT: movdqa %xmm1, %xmm2 ; SSE2-NEXT: psrad $31, %xmm2 @@ -1274,10 +1268,8 @@ ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: psrad $3, %xmm4 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm4 = xmm4[1],xmm3[1] -; SSE2-NEXT: movdqa %xmm2, %xmm3 -; SSE2-NEXT: psrad $2, %xmm3 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[0,3] +; SSE2-NEXT: psrad $2, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3] ; SSE2-NEXT: movaps %xmm2, %xmm1 ; SSE2-NEXT: retq @@ -1285,45 +1277,42 @@ ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_v8i32: ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm0, %xmm2 +; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm3 -; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: psrld $28, %xmm0 -; SSE41-NEXT: movdqa %xmm3, %xmm4 +; SSE41-NEXT: psrld $28, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 ; SSE41-NEXT: psrld $30, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm3 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; SSE41-NEXT: paddd %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm0 -; SSE41-NEXT: psrad $3, %xmm0 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: psrad $4, %xmm3 -; SSE41-NEXT: psrad $2, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; SSE41-NEXT: psrld $29, %xmm0 +; SSE41-NEXT: pxor %xmm3, %xmm3 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; SSE41-NEXT: paddd %xmm2, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrad $4, %xmm4 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psrad $2, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm3 -; SSE41-NEXT: psrad $31, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: psrld $28, %xmm2 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: psrld $30, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; SSE41-NEXT: paddd %xmm1, %xmm3 -; SSE41-NEXT: movdqa %xmm3, %xmm4 -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: psrad $3, %xmm2 +; SSE41-NEXT: movdqa %xmm1, %xmm2 +; SSE41-NEXT: psrad $31, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrld $28, %xmm4 +; SSE41-NEXT: movdqa %xmm2, %xmm5 +; SSE41-NEXT: psrld $30, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; SSE41-NEXT: psrld $29, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7] +; SSE41-NEXT: paddd %xmm1, %xmm2 +; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: psrad $4, %xmm3 +; SSE41-NEXT: movdqa %xmm2, %xmm4 ; SSE41-NEXT: psrad $2, %xmm4 ; 
SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0,1],xmm2[2,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm2, %xmm1 @@ -1414,10 +1403,8 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm6 ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE2-NEXT: movdqa %xmm0, %xmm5 -; SSE2-NEXT: psrad $2, %xmm5 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,3] +; SSE2-NEXT: psrad $2, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 @@ -1435,10 +1422,8 @@ ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrad $2, %xmm5 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm6[0,3] +; SSE2-NEXT: psrad $2, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm4[0],xmm1[1,2,3] ; SSE2-NEXT: movdqa %xmm2, %xmm4 ; SSE2-NEXT: psrad $31, %xmm4 @@ -1456,10 +1441,8 @@ ; SSE2-NEXT: movdqa %xmm4, %xmm6 ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm5[1] -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: psrad $2, %xmm5 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm6[0,3] +; SSE2-NEXT: psrad $2, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3] ; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 @@ -1477,10 +1460,8 @@ ; SSE2-NEXT: movdqa %xmm5, %xmm6 ; SSE2-NEXT: psrad $3, %xmm6 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm6 = xmm6[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: psrad $2, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm6[0,3] +; SSE2-NEXT: psrad $2, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm5 = xmm3[0],xmm5[1,2,3] ; SSE2-NEXT: movaps %xmm4, %xmm2 ; SSE2-NEXT: movaps %xmm5, %xmm3 @@ -1490,85 +1471,80 @@ ; SSE41: # %bb.0: ; SSE41-NEXT: movdqa %xmm1, %xmm4 ; SSE41-NEXT: movdqa %xmm0, %xmm1 -; SSE41-NEXT: movdqa %xmm0, %xmm6 -; SSE41-NEXT: psrad $31, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: psrld $28, %xmm0 -; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: psrad $31, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psrld $28, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm7 ; SSE41-NEXT: psrld $30, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm6 -; SSE41-NEXT: pxor %xmm5, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] -; SSE41-NEXT: paddd %xmm1, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: movdqa %xmm6, %xmm0 -; SSE41-NEXT: psrad $3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: psrld $29, %xmm0 +; SSE41-NEXT: pxor %xmm6, %xmm6 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm6[0,1,2,3],xmm0[4,5,6,7] -; SSE41-NEXT: psrad $4, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] +; SSE41-NEXT: 
paddd %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm5 +; SSE41-NEXT: psrad $4, %xmm5 +; SSE41-NEXT: movdqa %xmm0, %xmm7 ; SSE41-NEXT: psrad $2, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm0 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3],xmm0[4,5],xmm7[6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm4, %xmm6 -; SSE41-NEXT: psrad $31, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm1 -; SSE41-NEXT: psrld $28, %xmm1 -; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: movdqa %xmm4, %xmm1 +; SSE41-NEXT: psrad $31, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: psrld $28, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm7 ; SSE41-NEXT: psrld $30, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] -; SSE41-NEXT: paddd %xmm4, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: movdqa %xmm6, %xmm1 -; SSE41-NEXT: psrad $3, %xmm1 +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: psrld $29, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: psrad $4, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7] +; SSE41-NEXT: paddd %xmm4, %xmm1 +; SSE41-NEXT: movdqa %xmm1, %xmm5 +; SSE41-NEXT: psrad $4, %xmm5 +; SSE41-NEXT: movdqa %xmm1, %xmm7 ; SSE41-NEXT: psrad $2, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3],xmm1[4,5],xmm7[6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: psrad $31, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm4 -; SSE41-NEXT: psrld $28, %xmm4 -; SSE41-NEXT: movdqa %xmm6, %xmm7 +; SSE41-NEXT: movdqa %xmm2, %xmm4 +; SSE41-NEXT: psrad $31, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: psrld $28, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm7 ; SSE41-NEXT: psrld $30, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm4[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm6 -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm5[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7] -; SSE41-NEXT: paddd %xmm2, %xmm6 -; SSE41-NEXT: movdqa %xmm6, %xmm7 -; SSE41-NEXT: movdqa %xmm6, %xmm4 -; SSE41-NEXT: psrad $3, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: psrld $29, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7] -; SSE41-NEXT: psrad $4, %xmm6 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7] +; SSE41-NEXT: paddd %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm5 +; SSE41-NEXT: psrad $4, %xmm5 +; SSE41-NEXT: movdqa %xmm4, %xmm7 ; SSE41-NEXT: psrad $2, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm4 ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3],xmm4[4,5],xmm7[6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm2[0,1],xmm4[2,3,4,5,6,7] -; SSE41-NEXT: movdqa %xmm3, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: movdqa %xmm2, 
%xmm6 -; SSE41-NEXT: psrld $28, %xmm6 -; SSE41-NEXT: movdqa %xmm2, %xmm7 +; SSE41-NEXT: movdqa %xmm3, %xmm5 +; SSE41-NEXT: psrad $31, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm2 +; SSE41-NEXT: psrld $28, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm7 ; SSE41-NEXT: psrld $30, %xmm7 -; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm6[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3],xmm2[4,5],xmm7[6,7] -; SSE41-NEXT: paddd %xmm3, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm6 -; SSE41-NEXT: movdqa %xmm2, %xmm5 -; SSE41-NEXT: psrad $3, %xmm5 -; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm7 = xmm7[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: psrld $29, %xmm5 +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3],xmm5[4,5],xmm7[6,7] +; SSE41-NEXT: paddd %xmm3, %xmm5 +; SSE41-NEXT: movdqa %xmm5, %xmm2 ; SSE41-NEXT: psrad $4, %xmm2 +; SSE41-NEXT: movdqa %xmm5, %xmm6 ; SSE41-NEXT: psrad $2, %xmm6 ; SSE41-NEXT: pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm2[4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm5 ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] ; SSE41-NEXT: pblendw {{.*#+}} xmm5 = xmm3[0,1],xmm5[2,3,4,5,6,7] ; SSE41-NEXT: movdqa %xmm4, %xmm2 @@ -2250,10 +2226,8 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psrad $3, %xmm3 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1] -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $2, %xmm2 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm3[0,3] +; SSE2-NEXT: psrad $2, %xmm0 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,3] ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: psubd %xmm0, %xmm1 @@ -2264,30 +2238,28 @@ ; ; SSE41-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: ; SSE41: # %bb.0: +; SSE41-NEXT: movdqa %xmm0, %xmm1 +; SSE41-NEXT: psrad $31, %xmm0 ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: psrad $31, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrld $28, %xmm1 -; SSE41-NEXT: movdqa %xmm2, %xmm3 +; SSE41-NEXT: psrld $28, %xmm2 +; SSE41-NEXT: movdqa %xmm0, %xmm3 ; SSE41-NEXT: psrld $30, %xmm3 -; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: psrld $29, %xmm2 -; SSE41-NEXT: pxor %xmm4, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] -; SSE41-NEXT: paddd %xmm0, %xmm2 -; SSE41-NEXT: movdqa %xmm2, %xmm3 -; SSE41-NEXT: movdqa %xmm2, %xmm1 -; SSE41-NEXT: psrad $3, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; SSE41-NEXT: psrad $4, %xmm2 -; SSE41-NEXT: psrad $2, %xmm3 ; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE41-NEXT: psubd %xmm1, %xmm4 -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; SSE41-NEXT: movdqa %xmm1, %xmm0 +; SSE41-NEXT: psrld $29, %xmm0 +; SSE41-NEXT: pxor %xmm2, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; SSE41-NEXT: paddd %xmm1, %xmm0 +; SSE41-NEXT: movdqa %xmm0, %xmm3 +; SSE41-NEXT: 
psrad $4, %xmm3 +; SSE41-NEXT: movdqa %xmm0, %xmm4 +; SSE41-NEXT: psrad $2, %xmm4 +; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; SSE41-NEXT: psrad $3, %xmm0 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7] +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7] +; SSE41-NEXT: psubd %xmm0, %xmm2 +; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; SSE41-NEXT: retq ; ; AVX1-LABEL: combine_vec_sdiv_by_pow2b_PosAndNeg: @@ -2304,8 +2276,7 @@ ; AVX1-NEXT: vpsrad $4, %xmm1, %xmm2 ; AVX1-NEXT: vpsrad $2, %xmm1, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpsrad $3, %xmm1, %xmm4 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7] +; AVX1-NEXT: vpsrad $3, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX1-NEXT: vpsubd %xmm0, %xmm3, %xmm1 Index: test/CodeGen/X86/combine-srem.ll =================================================================== --- test/CodeGen/X86/combine-srem.ll +++ test/CodeGen/X86/combine-srem.ll @@ -308,16 +308,15 @@ ; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] ; SSE-NEXT: paddd %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: psrad $3, %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrad $2, %xmm3 -; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; SSE-NEXT: psrad $3, %xmm1 -; SSE-NEXT: psrad $1, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3,4,5,6,7] -; SSE-NEXT: pmulld {{.*}}(%rip), %xmm3 -; SSE-NEXT: psubd %xmm3, %xmm0 +; SSE-NEXT: psrad $1, %xmm3 +; SSE-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; SSE-NEXT: psrad $2, %xmm1 +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE-NEXT: pmulld {{.*}}(%rip), %xmm1 +; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: retq ; ; AVX1-LABEL: combine_vec_srem_by_pow2b: @@ -334,8 +333,7 @@ ; AVX1-NEXT: vpsrad $3, %xmm1, %xmm2 ; AVX1-NEXT: vpsrad $1, %xmm1, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; AVX1-NEXT: vpsrad $2, %xmm1, %xmm3 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] +; AVX1-NEXT: vpsrad $2, %xmm1, %xmm1 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 Index: test/CodeGen/X86/dagcombine-cse.ll =================================================================== --- test/CodeGen/X86/dagcombine-cse.ll +++ test/CodeGen/X86/dagcombine-cse.ll @@ -9,10 +9,7 @@ ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: imull {{[0-9]+}}(%esp), %ecx ; X32-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movzwl 4(%eax,%ecx), %edx ; X32-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: movd %edx, %xmm1 -; X32-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] ; X32-NEXT: movd %xmm0, %eax ; X32-NEXT: retl @@ -31,7 +28,6 @@ ; X64-NEXT: shlq $32, %rcx ; X64-NEXT: orq %rax, %rcx ; X64-NEXT: movq %rcx, %xmm0 -; X64-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7] ; X64-NEXT: movd %xmm0, %eax ; X64-NEXT: 
retq entry: Index: test/CodeGen/X86/extractelement-load.ll =================================================================== --- test/CodeGen/X86/extractelement-load.ll +++ test/CodeGen/X86/extractelement-load.ll @@ -55,26 +55,30 @@ ; load/store instructions will have a leading 'v', so we don't ; need to special-case the checks. -define void @t3() { +define void @t3(<2 x double>* %a0) { ; X32-SSE2-LABEL: t3: ; X32-SSE2: # %bb.0: # %bb +; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE2-NEXT: movupd (%eax), %xmm0 ; X32-SSE2-NEXT: movhpd %xmm0, (%eax) +; X32-SSE2-NEXT: retl ; ; X64-SSSE3-LABEL: t3: ; X64-SSSE3: # %bb.0: # %bb -; X64-SSSE3-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] -; X64-SSSE3-NEXT: movlpd %xmm0, (%rax) +; X64-SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-SSSE3-NEXT: movsd %xmm0, (%rax) +; X64-SSSE3-NEXT: retq ; ; X64-AVX-LABEL: t3: ; X64-AVX: # %bb.0: # %bb -; X64-AVX-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] -; X64-AVX-NEXT: vmovlpd %xmm0, (%rax) +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX-NEXT: vmovsd %xmm0, (%rax) +; X64-AVX-NEXT: retq bb: - %tmp13 = load <2 x double>, <2 x double>* undef, align 1 + %tmp13 = load <2 x double>, <2 x double>* %a0, align 1 %.sroa.3.24.vec.extract = extractelement <2 x double> %tmp13, i32 1 store double %.sroa.3.24.vec.extract, double* undef, align 8 - unreachable + ret void } ; Case where a load is unary shuffled, then bitcast (to a type with the same @@ -93,7 +97,8 @@ ; ; X64-SSSE3-LABEL: t4: ; X64-SSSE3: # %bb.0: -; X64-SSSE3-NEXT: movq (%rdi), %rax +; X64-SSSE3-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X64-SSSE3-NEXT: movq %xmm0, %rax ; X64-SSSE3-NEXT: retq ; ; X64-AVX-LABEL: t4: Index: test/CodeGen/X86/insertps-combine.ll =================================================================== --- test/CodeGen/X86/insertps-combine.ll +++ test/CodeGen/X86/insertps-combine.ll @@ -284,12 +284,13 @@ define float @extract_lane_insertps_6123(<4 x float> %a0, <4 x float> *%p1) { ; SSE-LABEL: extract_lane_insertps_6123: ; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movaps (%rdi), %xmm0 +; SSE-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] ; SSE-NEXT: retq ; ; AVX-LABEL: extract_lane_insertps_6123: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vpermilpd {{.*#+}} xmm0 = mem[1,0] ; AVX-NEXT: retq %a1 = load <4 x float>, <4 x float> *%p1 %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 128) Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -1269,48 +1269,44 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind { ; SSE2-LABEL: interleave_24i32_out: ; SSE2: # %bb.0: +; SSE2-NEXT: movdqu 64(%rdi), %xmm10 ; SSE2-NEXT: movups 80(%rdi), %xmm8 -; SSE2-NEXT: movups 64(%rdi), %xmm11 -; SSE2-NEXT: movups (%rdi), %xmm0 -; SSE2-NEXT: movups 16(%rdi), %xmm10 -; SSE2-NEXT: movups 32(%rdi), %xmm9 -; SSE2-NEXT: movdqu 48(%rdi), %xmm1 -; SSE2-NEXT: movaps %xmm0, %xmm6 -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm10[2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] -; SSE2-NEXT: movaps %xmm9, %xmm12 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[2,0] -; 
SSE2-NEXT: movaps %xmm0, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm9[2,0] -; SSE2-NEXT: movdqa %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm11[2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: movdqu (%rdi), %xmm0 +; SSE2-NEXT: movdqu 16(%rdi), %xmm11 +; SSE2-NEXT: movups 32(%rdi), %xmm5 +; SSE2-NEXT: movdqu 48(%rdi), %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1] +; SSE2-NEXT: movaps %xmm5, %xmm7 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm11[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[2,3] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm5[2,0] ; SSE2-NEXT: movaps %xmm8, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm1[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm8[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm11[0,0] -; SSE2-NEXT: movaps %xmm4, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm1[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm12[2,0] -; SSE2-NEXT: movups %xmm6, 16(%rsi) +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,3,0,1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm10[2,3] +; SSE2-NEXT: movdqa %xmm9, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,3],xmm8[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0] +; SSE2-NEXT: movaps %xmm2, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm10[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] +; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm11[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm7[2,0] +; SSE2-NEXT: movups %xmm9, 16(%rsi) ; SSE2-NEXT: movups %xmm3, (%rsi) -; SSE2-NEXT: movups %xmm4, 16(%rdx) +; SSE2-NEXT: movups %xmm2, 16(%rdx) ; SSE2-NEXT: movups %xmm0, (%rdx) -; SSE2-NEXT: movups %xmm2, 16(%rcx) -; SSE2-NEXT: movups %xmm7, (%rcx) +; SSE2-NEXT: movups %xmm1, 16(%rcx) +; SSE2-NEXT: movups %xmm6, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i32_out: Index: test/CodeGen/X86/pr13577.ll =================================================================== --- test/CodeGen/X86/pr13577.ll +++ test/CodeGen/X86/pr13577.ll @@ -30,7 +30,6 @@ ; CHECK-LABEL: pr26070: ; CHECK: ## %bb.0: ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; CHECK-NEXT: retq %c = call float @copysignf(float 1.0, float undef) readnone ret float %c Index: test/CodeGen/X86/pr34592.ll =================================================================== --- 
test/CodeGen/X86/pr34592.ll +++ test/CodeGen/X86/pr34592.ll @@ -20,32 +20,31 @@ ; CHECK-NEXT: vmovaps 48(%rbp), %ymm14 ; CHECK-NEXT: vmovaps 16(%rbp), %ymm15 ; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7] -; CHECK-NEXT: vxorps %xmm6, %xmm6, %xmm6 -; CHECK-NEXT: vpblendd {{.*#+}} ymm11 = ymm6[0,1,2,3],ymm11[4,5],ymm6[6,7] -; CHECK-NEXT: # kill: def $xmm9 killed $xmm9 killed $ymm9 +; CHECK-NEXT: vmovaps %xmm9, %xmm6 +; CHECK-NEXT: vmovdqa %xmm6, %xmm9 +; CHECK-NEXT: # kill: def $ymm9 killed $xmm9 ; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovdqa %xmm9, %xmm0 -; CHECK-NEXT: # kill: def $ymm0 killed $xmm0 -; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] -; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,0] +; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: # implicit-def: $ymm0 -; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 +; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 +; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm2[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23] +; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,0] ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5],ymm11[6,7] -; CHECK-NEXT: vmovaps %xmm2, %xmm9 +; CHECK-NEXT: vmovaps %xmm2, %xmm6 ; CHECK-NEXT: # implicit-def: $ymm2 -; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; CHECK-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1],ymm6[2,3],ymm7[4,5],ymm6[6,7] -; CHECK-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] -; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] -; CHECK-NEXT: vmovaps %xmm7, %xmm9 -; CHECK-NEXT: vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,xmm9[0,1,2,3,4,5,6,7] -; CHECK-NEXT: # implicit-def: $ymm6 -; CHECK-NEXT: vmovaps %xmm9, %xmm6 +; CHECK-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload -; CHECK-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] -; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,3] -; CHECK-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7] +; CHECK-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1],ymm11[2,3],ymm7[4,5],ymm11[6,7] +; CHECK-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,2,3] +; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] +; CHECK-NEXT: vmovaps %xmm7, %xmm6 +; CHECK-NEXT: vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm6[0,1,2,3,4,5,6,7] +; CHECK-NEXT: # implicit-def: $ymm11 +; CHECK-NEXT: vmovaps %xmm6, %xmm11 +; CHECK-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23] +; CHECK-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,3] +; CHECK-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7] ; CHECK-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6,7] ; CHECK-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,1,3] ; CHECK-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,1,4,5,4,5] @@ -53,11 +52,11 @@ ; CHECK-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vmovaps %ymm5, %ymm1 ; CHECK-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovaps %ymm6, %ymm3 +; CHECK-NEXT: vmovaps %ymm9, %ymm3 ; CHECK-NEXT: vmovaps %ymm15, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; CHECK-NEXT: vmovaps %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vmovaps %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vmovaps %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovaps %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vmovaps %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vmovaps %ymm14, (%rsp) # 32-byte Spill ; CHECK-NEXT: movq %rbp, %rsp Index: test/CodeGen/X86/reduce-trunc-shl.ll =================================================================== --- test/CodeGen/X86/reduce-trunc-shl.ll +++ test/CodeGen/X86/reduce-trunc-shl.ll @@ -57,9 +57,7 @@ ; ; AVX2-LABEL: trunc_shl_16_v8i16_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: retq %shl = shl <8 x i32> %a, %conv = trunc <8 x i32> %shl to <8 x i16> Index: test/CodeGen/X86/urem-seteq-vec-nonsplat.ll =================================================================== --- test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -431,12 +431,9 @@ ; CHECK-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] ; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm2 -; CHECK-SSE41-NEXT: psrld $2, %xmm2 -; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm3 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7] -; CHECK-SSE41-NEXT: psrld $3, %xmm1 -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-SSE41-NEXT: psrld $3, %xmm2 +; CHECK-SSE41-NEXT: psrld $2, %xmm1 +; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; CHECK-SSE41-NEXT: pmulld {{.*}}(%rip), %xmm1 ; CHECK-SSE41-NEXT: psubd %xmm1, %xmm0 @@ -456,11 +453,9 @@ ; CHECK-AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm2 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm2[4,5,6,7] -; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm1 -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] -; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] +; CHECK-AVX1-NEXT: vpsrld $3, %xmm1, %xmm2 +; CHECK-AVX1-NEXT: vpsrld $2, %xmm1, %xmm1 +; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] ; CHECK-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] ; CHECK-AVX1-NEXT: vpmulld {{.*}}(%rip), %xmm1, %xmm1 ; CHECK-AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 Index: test/CodeGen/X86/vec_extract-sse4.ll =================================================================== --- test/CodeGen/X86/vec_extract-sse4.ll +++ test/CodeGen/X86/vec_extract-sse4.ll @@ -27,7 +27,7 @@ ; X32: # %bb.0: ; X32-NEXT: pushl %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] +; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: movss %xmm0, (%esp) ; X32-NEXT: flds (%esp) 
; X32-NEXT: popl %eax @@ -35,7 +35,7 @@ ; ; X64-LABEL: t2: ; X64: # %bb.0: -; X64-NEXT: movddup {{.*#+}} xmm0 = mem[0,0] +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: retq %X = load <4 x float>, <4 x float>* %P1 %tmp = extractelement <4 x float> %X, i32 2 Index: test/CodeGen/X86/vec_insert-3.ll =================================================================== --- test/CodeGen/X86/vec_insert-3.ll +++ test/CodeGen/X86/vec_insert-3.ll @@ -6,7 +6,6 @@ ; X32-LABEL: t1: ; X32: # %bb.0: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] ; X32-NEXT: movaps %xmm0, %xmm2 ; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero Index: test/CodeGen/X86/vector-constrained-fp-intrinsics.ll =================================================================== --- test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -268,26 +268,26 @@ define <3 x double> @constrained_vector_frem_v3f64() { ; NO-FMA-LABEL: constrained_vector_frem_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $56, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 64 +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: callq fmod -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: callq fmod -; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: callq fmod ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NO-FMA-NEXT: addq $56, %rsp +; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; NO-FMA-NEXT: # xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; NO-FMA-NEXT: # xmm1 = mem[0],zero +; NO-FMA-NEXT: addq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -905,29 +905,29 @@ define <3 x double> @constrained_vector_fma_v3f64() { ; NO-FMA-LABEL: constrained_vector_fma_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $56, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 64 +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; NO-FMA-NEXT: callq fma -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; NO-FMA-NEXT: callq fma -; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; NO-FMA-NEXT: callq fma ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NO-FMA-NEXT: addq $56, %rsp +; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; NO-FMA-NEXT: # xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; NO-FMA-NEXT: # xmm1 = mem[0],zero +; NO-FMA-NEXT: addq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -1382,26 +1382,26 @@ define <3 x double> @constrained_vector_pow_v3f64() { ; NO-FMA-LABEL: constrained_vector_pow_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $56, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 64 +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: callq pow -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: callq pow -; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: callq pow ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NO-FMA-NEXT: addq $56, %rsp +; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; NO-FMA-NEXT: # xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; NO-FMA-NEXT: # xmm1 = mem[0],zero +; NO-FMA-NEXT: addq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -1638,26 +1638,26 @@ define <3 x double> @constrained_vector_powi_v3f64() { ; NO-FMA-LABEL: constrained_vector_powi_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $56, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 64 +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movl $3, %edi ; NO-FMA-NEXT: callq __powidf2 -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movl $3, %edi ; NO-FMA-NEXT: callq __powidf2 -; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movl $3, %edi ; NO-FMA-NEXT: callq __powidf2 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NO-FMA-NEXT: addq $56, %rsp +; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; NO-FMA-NEXT: # xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; NO-FMA-NEXT: # xmm1 = mem[0],zero +; NO-FMA-NEXT: addq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -1878,23 +1878,23 @@ define <3 x double> @constrained_vector_sin_v3f64() { ; NO-FMA-LABEL: constrained_vector_sin_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $56, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 64 +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq sin -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq sin -; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq sin ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NO-FMA-NEXT: addq $56, %rsp +; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; NO-FMA-NEXT: # xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; NO-FMA-NEXT: # xmm1 = mem[0],zero +; NO-FMA-NEXT: addq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -2102,23 +2102,23 @@ define <3 x double> @constrained_vector_cos_v3f64() { ; NO-FMA-LABEL: constrained_vector_cos_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $56, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 64 +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq cos -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq cos -; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq cos ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NO-FMA-NEXT: addq $56, %rsp +; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; NO-FMA-NEXT: # xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; NO-FMA-NEXT: # xmm1 = mem[0],zero +; NO-FMA-NEXT: addq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -2326,23 +2326,23 @@ define <3 x double> @constrained_vector_exp_v3f64() { ; NO-FMA-LABEL: constrained_vector_exp_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $56, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 64 +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 ; NO-FMA-NEXT: 
movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq exp -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq exp -; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq exp ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NO-FMA-NEXT: addq $56, %rsp +; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; NO-FMA-NEXT: # xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; NO-FMA-NEXT: # xmm1 = mem[0],zero +; NO-FMA-NEXT: addq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -2550,23 +2550,23 @@ define <3 x double> @constrained_vector_exp2_v3f64() { ; NO-FMA-LABEL: constrained_vector_exp2_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $56, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 64 +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq exp2 -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq exp2 -; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq exp2 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; NO-FMA-NEXT: addq $56, %rsp +; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload +; NO-FMA-NEXT: # xmm0 = mem[0],zero +; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload +; NO-FMA-NEXT: # xmm1 = mem[0],zero +; NO-FMA-NEXT: addq $24, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -2774,23 +2774,23 @@ define <3 x double> @constrained_vector_log_v3f64() { ; NO-FMA-LABEL: constrained_vector_log_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $56, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 64 +; NO-FMA-NEXT: subq $24, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 32 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq log -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq log -; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq log ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; NO-FMA-NEXT: fldl 
{{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
 ; NO-FMA-NEXT: retq
 ;
@@ -2998,23 +2998,23 @@
 define <3 x double> @constrained_vector_log10_v3f64() {
 ; NO-FMA-LABEL: constrained_vector_log10_v3f64:
 ; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; NO-FMA-NEXT: callq log10
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; NO-FMA-NEXT: callq log10
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; NO-FMA-NEXT: callq log10
 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
 ; NO-FMA-NEXT: retq
 ;
@@ -3222,23 +3222,23 @@
 define <3 x double> @constrained_vector_log2_v3f64() {
 ; NO-FMA-LABEL: constrained_vector_log2_v3f64:
 ; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; NO-FMA-NEXT: callq log2
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; NO-FMA-NEXT: callq log2
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; NO-FMA-NEXT: callq log2
 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
 ; NO-FMA-NEXT: retq
 ;
@@ -3424,23 +3424,23 @@
 define <3 x double> @constrained_vector_rint_v3f64() {
 ; NO-FMA-LABEL: constrained_vector_rint_v3f64:
 ; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; NO-FMA-NEXT: callq rint
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; NO-FMA-NEXT: callq rint
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; NO-FMA-NEXT: callq rint
 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
 ; NO-FMA-NEXT: retq
 ;
@@ -3594,23 +3594,23 @@
 define <3 x double> @constrained_vector_nearby_v3f64() {
 ; NO-FMA-LABEL: constrained_vector_nearby_v3f64:
 ; NO-FMA: # %bb.0: # %entry
-; NO-FMA-NEXT: subq $56, %rsp
-; NO-FMA-NEXT: .cfi_def_cfa_offset 64
+; NO-FMA-NEXT: subq $24, %rsp
+; NO-FMA-NEXT: .cfi_def_cfa_offset 32
 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; NO-FMA-NEXT: callq nearbyint
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; NO-FMA-NEXT: callq nearbyint
-; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0]
-; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; NO-FMA-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
 ; NO-FMA-NEXT: callq nearbyint
 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
 ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp)
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; NO-FMA-NEXT: addq $56, %rsp
+; NO-FMA-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
+; NO-FMA-NEXT: # xmm0 = mem[0],zero
+; NO-FMA-NEXT: movsd (%rsp), %xmm1 # 8-byte Reload
+; NO-FMA-NEXT: # xmm1 = mem[0],zero
+; NO-FMA-NEXT: addq $24, %rsp
 ; NO-FMA-NEXT: .cfi_def_cfa_offset 8
 ; NO-FMA-NEXT: retq
 ;
Index: test/CodeGen/X86/vector-rotate-128.ll
===================================================================
--- test/CodeGen/X86/vector-rotate-128.ll
+++ test/CodeGen/X86/vector-rotate-128.ll
@@ -763,7 +763,6 @@
 ; SSE2-NEXT: xorps %xmm2, %xmm2
 ; SSE2-NEXT: xorps %xmm3, %xmm3
 ; SSE2-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE2-NEXT: movdqa %xmm0, %xmm4
 ; SSE2-NEXT: pslld %xmm3, %xmm4
 ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32]
@@ -856,7 +855,6 @@
 ; X32-SSE-NEXT: xorps %xmm2, %xmm2
 ; X32-SSE-NEXT: xorps %xmm3, %xmm3
 ; X32-SSE-NEXT: movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm4
 ; X32-SSE-NEXT: pslld %xmm3, %xmm4
 ; X32-SSE-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32]
@@ -876,17 +874,15 @@
 define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE2-LABEL: splatvar_rotate_v8i16:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
+; SSE2-NEXT: psubw %xmm1, %xmm2
 ; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
 ; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE2-NEXT: movdqa %xmm0, %xmm3
 ; SSE2-NEXT: psllw %xmm1, %xmm3
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
-; SSE2-NEXT: psubw %xmm2, %xmm1
-; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; SSE2-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE2-NEXT: psrlw %xmm1, %xmm0
+; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
+; SSE2-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: psrlw %xmm2, %xmm0
 ; SSE2-NEXT: por %xmm3, %xmm0
 ; SSE2-NEXT: retq
 ;
@@ -992,17 +988,15 @@
 ;
 ; X32-SSE-LABEL: splatvar_rotate_v8i16:
 ; X32-SSE: # %bb.0:
-; X32-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,0,2,3,4,5,6,7]
+; X32-SSE-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
+; X32-SSE-NEXT: psubw %xmm1, %xmm2
 ; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
 ; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; X32-SSE-NEXT: movdqa %xmm0, %xmm3
 ; X32-SSE-NEXT: psllw %xmm1, %xmm3
-; X32-SSE-NEXT: movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16]
-; X32-SSE-NEXT: psubw %xmm2, %xmm1
-; X32-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
-; X32-SSE-NEXT: psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; X32-SSE-NEXT: psrlw %xmm1, %xmm0
+; X32-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1]
+; X32-SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT: psrlw %xmm2, %xmm0
 ; X32-SSE-NEXT: por %xmm3, %xmm0
 ; X32-SSE-NEXT: retl
 %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
Index: test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1115,7 +1115,7 @@
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
 ; SSE2-NEXT: xorps %xmm2, %xmm2
-; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
 ; SSE2-NEXT: movaps %xmm2, %xmm0
 ; SSE2-NEXT: retq
@@ -1124,7 +1124,7 @@
 ; SSE3: # %bb.0:
 ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
 ; SSE3-NEXT: xorps %xmm2, %xmm2
-; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0]
+; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
 ; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
 ; SSE3-NEXT: movaps %xmm2, %xmm0
 ; SSE3-NEXT: retq
@@ -1133,7 +1133,7 @@
 ; SSSE3: # %bb.0:
 ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
 ; SSSE3-NEXT: xorps %xmm2, %xmm2
-; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0]
+; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0]
 ; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0]
 ; SSSE3-NEXT: movaps %xmm2, %xmm0
 ; SSSE3-NEXT: retq
Index: test/CodeGen/X86/vector-trunc.ll
===================================================================
--- test/CodeGen/X86/vector-trunc.ll
+++ test/CodeGen/X86/vector-trunc.ll
@@ -1923,7 +1923,6 @@
 ; AVX2-SLOW-LABEL: PR32160:
 ; AVX2-SLOW: # %bb.0:
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,3,4,5,6,7]
 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0
 ; AVX2-SLOW-NEXT: vzeroupper
@@ -1932,7 +1931,6 @@
 ; AVX2-FAST-LABEL: PR32160:
 ; AVX2-FAST: # %bb.0:
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,4,5,4,5,4,5,4,5,4,5]
 ; AVX2-FAST-NEXT: vzeroupper
 ; AVX2-FAST-NEXT: retq