Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -29635,7 +29635,8 @@
 /// instruction but should only be used to replace chains over a certain depth.
 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                                       ArrayRef<int> BaseMask, int Depth,
-                                      bool HasVariableMask, SelectionDAG &DAG,
+                                      bool HasVariableMask,
+                                      bool AllowVariableMask, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
   assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
   assert((Inputs.size() == 1 || Inputs.size() == 2) &&
@@ -29848,7 +29849,7 @@
 
   // Depth threshold above which we can efficiently use variable mask shuffles.
   int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
-  bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;
+  AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
 
   bool MaskContainsZeros =
       any_of(Mask, [](int M) { return M == SM_SentinelZero; });
@@ -30182,7 +30183,8 @@
 static SDValue combineX86ShufflesRecursively(
     ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
     ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
-    bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+    bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
+    const X86Subtarget &Subtarget) {
   // Bound the depth of our recursive combine because this is ultimately
   // quadratic in nature.
   const unsigned MaxRecursionDepth = 8;
@@ -30337,18 +30339,23 @@
     CombinedNodes.push_back(Op.getNode());
 
   // See if we can recurse into each shuffle source op (if it's a target
-  // shuffle). The source op should only be combined if it either has a
-  // single use (i.e. current Op) or all its users have already been combined.
+  // shuffle). The source op should generally only be combined if it either
+  // has a single use (i.e. current Op) or all its users have already been
+  // combined; if not, we can still combine it but should prevent generation
+  // of variable shuffles to avoid constant pool bloat.
   // Don't recurse if we already have more source ops than we can combine in
   // the remaining recursion depth.
   if (Ops.size() < (MaxRecursionDepth - Depth)) {
-    for (int i = 0, e = Ops.size(); i < e; ++i)
+    for (int i = 0, e = Ops.size(); i < e; ++i) {
+      bool AllowVar = false;
       if (Ops[i].getNode()->hasOneUse() ||
           SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
-        if (SDValue Res = combineX86ShufflesRecursively(
-                Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
-                DAG, Subtarget))
-          return Res;
+        AllowVar = AllowVariableMask;
+      if (SDValue Res = combineX86ShufflesRecursively(
+              Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
+              AllowVar, DAG, Subtarget))
+        return Res;
+    }
   }
 
   // Attempt to constant fold all of the constant source ops.
@@ -30378,8 +30385,8 @@
   }
 
   // Finally, try to combine into a single shuffle instruction.
-  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
-                                Subtarget);
+  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
+                                AllowVariableMask, DAG, Subtarget);
 }
 
 /// Get the PSHUF-style mask from PSHUF node.
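Reviewer note on the AllowVariableMask threading above (illustrative only, not part of the patch): the standalone C++ sketch below models the new control flow under simplified assumptions -- node use counts and source operands are plain fields, and names such as FakeNode, emitShuffle and combineRecursively are invented for the example rather than LLVM APIs. The flag starts out true at each combine root, is cleared when recursing into a source op that still has other users, and is only then ANDed with the existing depth / fast-variable-shuffle heuristic before a variable (constant-pool-backed) shuffle mask may be emitted.

// Illustrative sketch only -- not part of the patch and not LLVM API.
#include <cstdio>
#include <vector>

struct FakeNode {
  int NumUses;                        // stand-in for SDNode use counts
  std::vector<FakeNode *> SourceOps;  // stand-in for shuffle source operands
};

// Stand-in for combineX86ShuffleChain: a "variable mask" shuffle (one that
// needs a constant pool mask) is only emitted when both the depth heuristic
// and the caller's permission allow it.
static bool emitShuffle(int Depth, bool HasVariableMask,
                        bool AllowVariableMask, bool FastVariableShuffle) {
  int VariableShuffleDepth = FastVariableShuffle ? 2 : 3;
  AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
  if (AllowVariableMask) {
    std::printf("depth %d: variable mask shuffle allowed\n", Depth);
    return true;
  }
  std::printf("depth %d: restricted to fixed shuffles\n", Depth);
  return false;
}

// Stand-in for combineX86ShufflesRecursively: recurse into multi-use source
// ops as well, but clear the permission for them so they never grow new
// constant pool entries that every other user would then duplicate.
static bool combineRecursively(FakeNode *N, int Depth, bool HasVariableMask,
                               bool AllowVariableMask) {
  const int MaxRecursionDepth = 8;
  if (Depth < MaxRecursionDepth)
    for (FakeNode *Op : N->SourceOps) {
      bool AllowVar = (Op->NumUses == 1) ? AllowVariableMask : false;
      if (combineRecursively(Op, Depth + 1, HasVariableMask, AllowVar))
        return true;
    }
  return emitShuffle(Depth, HasVariableMask, AllowVariableMask,
                     /*FastVariableShuffle=*/false);
}

int main() {
  FakeNode Shared{/*NumUses=*/2, {}};       // shared by two shuffle chains
  FakeNode Inner{/*NumUses=*/1, {&Shared}};
  FakeNode Outer{/*NumUses=*/1, {&Inner}};
  FakeNode Root{/*NumUses=*/1, {&Outer}};
  // Every top-level caller updated in this patch starts the recursion with
  // /*AllowVarMask*/ true; only multi-use source ops end up restricted.
  combineRecursively(&Root, /*Depth=*/1, /*HasVariableMask=*/false,
                     /*AllowVariableMask=*/true);
  return 0;
}

Built with any C++11 compiler, the sketch prints that the multi-use source op at depth 4 stays on fixed shuffles while its single-use user at depth 3 may still take a variable mask, which is exactly the constant-pool-bloat trade-off the new comment describes.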
@@ -30680,7 +30687,7 @@ DemandedMask[i] = i; if (SDValue Res = combineX86ShufflesRecursively( {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1, - /*HasVarMask*/ false, DAG, Subtarget)) + /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); } @@ -31299,7 +31306,7 @@ // a particular chain. if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, DAG, Subtarget)) + /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return Res; } @@ -34206,7 +34213,8 @@ SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, DAG, Subtarget)) + /*HasVarMask*/ false, + /*AllowVarMask*/ true, DAG, Subtarget)) return Res; return SDValue(); @@ -34266,7 +34274,7 @@ SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, DAG, Subtarget)) + /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return Res; } @@ -34305,7 +34313,8 @@ SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, DAG, Subtarget)) + /*HasVarMask*/ false, + /*AllowVarMask*/ true, DAG, Subtarget)) return Res; return SDValue(); @@ -34831,7 +34840,7 @@ SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, DAG, Subtarget)) + /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return Res; } @@ -34868,7 +34877,7 @@ if (SDValue Shuffle = combineX86ShufflesRecursively( {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2, - /*HasVarMask*/ false, DAG, Subtarget)) + /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle, N->getOperand(0).getOperand(1)); } @@ -37359,7 +37368,7 @@ SDValue Op(N, 0); if (SDValue Res = combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 1, - /*HasVarMask*/ false, DAG, Subtarget)) + /*HasVarMask*/ false, /*AllowVarMask*/ true, DAG, Subtarget)) return Res; } @@ -39620,6 +39629,15 @@ SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); + // Canonicalize constant to RHS. + if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) && + !DAG.isConstantIntBuildVectorOrConstantInt(RHS)) + return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS); + + // Multiply by zero. 
+ if (ISD::isBuildVectorAllZeros(RHS.getNode())) + return RHS; + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); Index: test/CodeGen/X86/2012-01-12-extract-sv.ll =================================================================== --- test/CodeGen/X86/2012-01-12-extract-sv.ll +++ test/CodeGen/X86/2012-01-12-extract-sv.ll @@ -6,10 +6,10 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vmovaps (%eax), %ymm0 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm0 -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1] -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1] +; CHECK-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,0,1,1] +; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3] +; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7] Index: test/CodeGen/X86/avx2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx2-intrinsics-fast-isel.ll @@ -1823,13 +1823,11 @@ define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_mul_epi32: ; CHECK: # %bb.0: -; CHECK-NEXT: vpsllq $32, %ymm0, %ymm0 -; CHECK-NEXT: vpsrad $31, %ymm0, %ymm2 -; CHECK-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vpsllq $32, %ymm0, %ymm2 +; CHECK-NEXT: vpsrad $31, %ymm2, %ymm2 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7] -; CHECK-NEXT: vpsllq $32, %ymm1, %ymm1 -; CHECK-NEXT: vpsrad $31, %ymm1, %ymm2 -; CHECK-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vpsllq $32, %ymm1, %ymm2 +; CHECK-NEXT: vpsrad $31, %ymm2, %ymm2 ; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7] ; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} Index: test/CodeGen/X86/avx512-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics-fast-isel.ll +++ test/CodeGen/X86/avx512-intrinsics-fast-isel.ll @@ -6491,11 +6491,11 @@ ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-NEXT: vpsrlq $32, %xmm0, %xmm2 -; X86-NEXT: vpmuludq %xmm2, %xmm1, %xmm2 -; X86-NEXT: vpsrlq $32, %xmm1, %xmm3 -; X86-NEXT: vpmuludq %xmm0, %xmm3, %xmm3 -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 +; X86-NEXT: vpsrlq $32, %xmm0, %xmm3 +; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 +; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2 ; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 @@ -6525,11 +6525,11 @@ ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-NEXT: vpsrlq $32, %xmm0, %xmm2 -; X64-NEXT: vpmuludq %xmm2, %xmm1, %xmm2 -; X64-NEXT: vpsrlq $32, %xmm1, %xmm3 -; X64-NEXT: vpmuludq %xmm0, %xmm3, %xmm3 -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; 
X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 +; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 +; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 +; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2 ; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 @@ -6696,11 +6696,11 @@ ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X86-NEXT: vpsrlq $32, %xmm0, %xmm2 -; X86-NEXT: vpmuludq %xmm2, %xmm1, %xmm2 -; X86-NEXT: vpsrlq $32, %xmm1, %xmm3 -; X86-NEXT: vpmuludq %xmm0, %xmm3, %xmm3 -; X86-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 +; X86-NEXT: vpsrlq $32, %xmm0, %xmm3 +; X86-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 +; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; X86-NEXT: vpsllq $32, %xmm2, %xmm2 ; X86-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 ; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 @@ -6733,11 +6733,11 @@ ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X64-NEXT: vpsrlq $32, %xmm0, %xmm2 -; X64-NEXT: vpmuludq %xmm2, %xmm1, %xmm2 -; X64-NEXT: vpsrlq $32, %xmm1, %xmm3 -; X64-NEXT: vpmuludq %xmm0, %xmm3, %xmm3 -; X64-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 +; X64-NEXT: vpsrlq $32, %xmm0, %xmm3 +; X64-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 +; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; X64-NEXT: vpsllq $32, %xmm2, %xmm2 ; X64-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 ; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 Index: test/CodeGen/X86/bitcast-and-setcc-128.ll =================================================================== --- test/CodeGen/X86/bitcast-and-setcc-128.ll +++ test/CodeGen/X86/bitcast-and-setcc-128.ll @@ -539,22 +539,18 @@ ; ; AVX1-LABEL: v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX1-NEXT: vpsrad $31, %xmm3, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX1-NEXT: vpsllq $32, %xmm3, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7] -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm4 -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX1-NEXT: vpsllq $32, %xmm2, %xmm4 +; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] ; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm3 +; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7] -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpsllq $32, %xmm0, %xmm3 +; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 @@ -564,22 +560,18 @@ ; ; AVX2-LABEL: v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm3, %xmm3 -; AVX2-NEXT: vpsrad $31, 
%xmm3, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; AVX2-NEXT: vpsllq $32, %xmm3, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3] -; AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX2-NEXT: vpsrad $31, %xmm2, %xmm4 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX2-NEXT: vpsllq $32, %xmm2, %xmm4 +; AVX2-NEXT: vpsrad $31, %xmm4, %xmm4 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3] ; AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm3 +; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2],xmm3[3] -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm3 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpsllq $32, %xmm0, %xmm3 +; AVX2-NEXT: vpsrad $31, %xmm3, %xmm3 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2],xmm3[3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vpand %xmm2, %xmm0, %xmm0 Index: test/CodeGen/X86/bitcast-setcc-128.ll =================================================================== --- test/CodeGen/X86/bitcast-setcc-128.ll +++ test/CodeGen/X86/bitcast-setcc-128.ll @@ -360,13 +360,11 @@ ; ; AVX1-LABEL: v2i32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] ; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vmovmskpd %xmm0, %eax @@ -375,13 +373,11 @@ ; ; AVX2-LABEL: v2i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX2-NEXT: vpsllq $32, %xmm1, %xmm2 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] -; AVX2-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX2-NEXT: vpsllq $32, %xmm0, %xmm2 +; AVX2-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3] ; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovmskpd %xmm0, %eax Index: test/CodeGen/X86/combine-shl.ll =================================================================== --- test/CodeGen/X86/combine-shl.ll +++ test/CodeGen/X86/combine-shl.ll @@ -406,21 +406,19 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrad $5, %xmm2 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $4, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrad $4, %xmm3 ; SSE2-NEXT: psrad $3, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,64,128,256] -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: 
pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_shl_ge_ashr_extact1: @@ -472,21 +470,19 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrad $7, %xmm2 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrad $6, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrad $6, %xmm3 ; SSE2-NEXT: psrad $5, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,16,32,256] -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_shl_lt_ashr_extact1: @@ -541,21 +537,19 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld $5, %xmm2 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $4, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrld $4, %xmm3 ; SSE2-NEXT: psrld $3, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32,64,128,256] -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_shl_gt_lshr1: @@ -610,21 +604,19 @@ ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psrld $7, %xmm2 ; SSE2-NEXT: punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $6, %xmm1 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: psrld $6, 
%xmm3 ; SSE2-NEXT: psrld $5, %xmm0 -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [8,16,32,256] -; SSE2-NEXT: movaps %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm2, %xmm1 +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: combine_vec_shl_le_lshr1: Index: test/CodeGen/X86/extractelement-load.ll =================================================================== --- test/CodeGen/X86/extractelement-load.ll +++ test/CodeGen/X86/extractelement-load.ll @@ -85,10 +85,9 @@ ; X32-SSE2-LABEL: t4: ; X32-SSE2: # %bb.0: ; X32-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0] -; X32-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; X32-SSE2-NEXT: movd %xmm1, %eax -; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; X32-SSE2-NEXT: movdqa (%eax), %xmm0 +; X32-SSE2-NEXT: movd %xmm0, %eax +; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1] ; X32-SSE2-NEXT: movd %xmm0, %edx ; X32-SSE2-NEXT: retl ; Index: test/CodeGen/X86/madd.ll =================================================================== --- test/CodeGen/X86/madd.ll +++ test/CodeGen/X86/madd.ll @@ -2077,26 +2077,20 @@ ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7] ; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,7,42,32] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,7,42,32] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [32768,4294934528,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm5[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm4[0,2] ; SSE2-NEXT: pmuludq %xmm3, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32768,4294934528,0,0] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pmuludq %xmm5, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] +; 
SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX1-LABEL: pmaddwd_negative2: Index: test/CodeGen/X86/mmx-arith.ll =================================================================== --- test/CodeGen/X86/mmx-arith.ll +++ test/CodeGen/X86/mmx-arith.ll @@ -214,32 +214,28 @@ ; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X32-NEXT: movdqa %xmm1, %xmm2 -; X32-NEXT: psrlq $32, %xmm2 ; X32-NEXT: pmuludq %xmm0, %xmm2 -; X32-NEXT: movdqa %xmm0, %xmm3 -; X32-NEXT: psrlq $32, %xmm3 -; X32-NEXT: pmuludq %xmm1, %xmm3 -; X32-NEXT: paddq %xmm2, %xmm3 -; X32-NEXT: psllq $32, %xmm3 -; X32-NEXT: pmuludq %xmm1, %xmm0 -; X32-NEXT: paddq %xmm3, %xmm0 -; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; X32-NEXT: movq %xmm1, (%eax) -; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X32-NEXT: andps %xmm0, %xmm1 +; X32-NEXT: psrlq $32, %xmm1 +; X32-NEXT: pmuludq %xmm0, %xmm1 +; X32-NEXT: psllq $32, %xmm1 +; X32-NEXT: paddq %xmm2, %xmm1 ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X32-NEXT: orps %xmm1, %xmm0 +; X32-NEXT: andps %xmm1, %xmm0 ; X32-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] ; X32-NEXT: movq %xmm1, (%eax) ; X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X32-NEXT: xorps %xmm0, %xmm1 +; X32-NEXT: orps %xmm0, %xmm1 ; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X32-NEXT: movq %xmm0, (%eax) +; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,1,3] +; X32-NEXT: xorps %xmm1, %xmm0 +; X32-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-NEXT: movq %xmm0, (%eax) ; X32-NEXT: emms ; X32-NEXT: retl ; @@ -255,15 +251,11 @@ ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-NEXT: movdqa %xmm1, %xmm2 -; X64-NEXT: psrlq $32, %xmm2 ; X64-NEXT: pmuludq %xmm0, %xmm2 -; X64-NEXT: movdqa %xmm0, %xmm3 -; X64-NEXT: psrlq $32, %xmm3 -; X64-NEXT: pmuludq %xmm1, %xmm3 -; X64-NEXT: paddq %xmm2, %xmm3 -; X64-NEXT: psllq $32, %xmm3 +; X64-NEXT: psrlq $32, %xmm1 ; X64-NEXT: pmuludq %xmm0, %xmm1 -; X64-NEXT: paddq %xmm3, %xmm1 +; X64-NEXT: psllq $32, %xmm1 +; X64-NEXT: paddq %xmm2, %xmm1 ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X64-NEXT: movq %xmm0, (%rdi) ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero Index: test/CodeGen/X86/oddshuffles.ll =================================================================== --- test/CodeGen/X86/oddshuffles.ll +++ test/CodeGen/X86/oddshuffles.ll @@ -563,24 +563,22 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, <12 x i32>* %p) nounwind { ; SSE2-LABEL: v12i32: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,2] -; SSE2-NEXT: movaps %xmm2, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0],xmm3[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] -; SSE2-NEXT: movaps %xmm2, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm1[1,0] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm1[2,2] +; SSE2-NEXT: movaps %xmm2, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[1,0] +; SSE2-NEXT: movaps %xmm0, %xmm4 +; SSE2-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[0,2] +; SSE2-NEXT: 
movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[2,1] +; SSE2-NEXT: movaps %xmm2, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm1[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm2[3,2] ; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,3,2,2] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] -; SSE2-NEXT: movaps %xmm0, 32(%rdi) -; SSE2-NEXT: movaps %xmm4, 16(%rdi) -; SSE2-NEXT: movaps %xmm3, (%rdi) +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm1[0,2] +; SSE2-NEXT: movaps %xmm2, 32(%rdi) +; SSE2-NEXT: movaps %xmm5, 16(%rdi) +; SSE2-NEXT: movaps %xmm4, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: v12i32: @@ -877,35 +875,33 @@ ; SSE2-LABEL: interleave_24i8_in: ; SSE2: # %bb.0: ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,2,2] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,2,2] ; SSE2-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE2-NEXT: pand %xmm5, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,3,3,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,6,7] -; SSE2-NEXT: pandn %xmm2, %xmm5 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,1,3,3,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,4,6,7] +; SSE2-NEXT: pandn %xmm3, %xmm5 ; SSE2-NEXT: por %xmm4, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,2,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,4,5] -; SSE2-NEXT: packuswb %xmm5, %xmm2 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,2,4,5,6,7] +; SSE2-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,4,5] +; SSE2-NEXT: packuswb %xmm5, %xmm3 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,0,255,255,0,255,255,0,255,255,0,255] -; SSE2-NEXT: pand %xmm4, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm0[0,1,0,1] ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,0,0,3,4,5,6,7] ; 
SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,6,6] ; SSE2-NEXT: pandn %xmm5, %xmm4 -; SSE2-NEXT: por %xmm2, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] +; SSE2-NEXT: por %xmm3, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,1,2,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,1,0,4,5,6,7] ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE2-NEXT: packuswb %xmm0, %xmm1 @@ -993,11 +989,11 @@ ; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] -; SSE2-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,1] -; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,0],xmm1[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,0] +; SSE2-NEXT: pshufhw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,7,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,2,1] +; SSE2-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,6,5] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,0],xmm4[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,0] ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] ; SSE2-NEXT: movdqa %xmm4, %xmm5 ; SSE2-NEXT: pandn %xmm2, %xmm5 @@ -1277,44 +1273,48 @@ define void @interleave_24i32_out(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind { ; SSE2-LABEL: interleave_24i32_out: ; SSE2: # %bb.0: -; SSE2-NEXT: movups 80(%rdi), %xmm9 -; SSE2-NEXT: movups 64(%rdi), %xmm10 +; SSE2-NEXT: movups 80(%rdi), %xmm8 +; SSE2-NEXT: movups 64(%rdi), %xmm11 ; SSE2-NEXT: movups (%rdi), %xmm0 -; SSE2-NEXT: movups 16(%rdi), %xmm11 -; SSE2-NEXT: movups 32(%rdi), %xmm8 -; SSE2-NEXT: movups 48(%rdi), %xmm2 -; SSE2-NEXT: movaps %xmm2, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm10[2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[2,3,0,1] -; SSE2-NEXT: movaps %xmm9, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm10[1,1,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] +; SSE2-NEXT: movups 16(%rdi), %xmm10 +; SSE2-NEXT: movups 32(%rdi), %xmm9 +; SSE2-NEXT: movdqu 48(%rdi), %xmm1 +; SSE2-NEXT: movaps %xmm0, %xmm6 +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm10[2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm0[2,3,0,1] +; SSE2-NEXT: movaps %xmm9, %xmm12 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm10[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm3[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm9[2,0] -; SSE2-NEXT: movaps %xmm0, %xmm5 -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm11[2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm11[1,1,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE2-NEXT: movaps %xmm8, %xmm4 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,3] -; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm5[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm11[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm0[2,0] -; SSE2-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[0,1],xmm4[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm10[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[3,3] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm6[2,0] -; SSE2-NEXT: movups %xmm3, 16(%rsi) -; SSE2-NEXT: movups %xmm5, (%rsi) -; SSE2-NEXT: movups %xmm2, 16(%rdx) +; SSE2-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[2,0] +; SSE2-NEXT: movaps %xmm0, %xmm3 +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm9[2,0] +; SSE2-NEXT: movdqa %xmm1, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm11[2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm11[1,1,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] +; SSE2-NEXT: movaps %xmm8, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm8[0,3] +; SSE2-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm8[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,0],xmm11[0,0] +; SSE2-NEXT: movaps %xmm4, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm11[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] +; SSE2-NEXT: movaps %xmm0, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm10[3,3] +; SSE2-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm1[2,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm12[2,0] +; SSE2-NEXT: movups %xmm6, 16(%rsi) +; SSE2-NEXT: movups %xmm3, (%rsi) +; SSE2-NEXT: movups %xmm4, 16(%rdx) ; SSE2-NEXT: movups %xmm0, (%rdx) -; SSE2-NEXT: movups %xmm7, 16(%rcx) -; SSE2-NEXT: movups %xmm1, (%rcx) +; SSE2-NEXT: movups %xmm2, 16(%rcx) +; SSE2-NEXT: movups %xmm7, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i32_out: @@ -1507,48 +1507,44 @@ define void @interleave_24i32_in(<24 x i32>* %p, <8 x i32>* %q1, <8 x i32>* %q2, <8 x i32>* %q3) nounwind { ; SSE2-LABEL: interleave_24i32_in: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqu (%rsi), %xmm5 -; SSE2-NEXT: movdqu 16(%rsi), %xmm2 -; SSE2-NEXT: movdqu (%rdx), %xmm6 -; SSE2-NEXT: movdqu 16(%rdx), %xmm1 -; SSE2-NEXT: movups (%rcx), %xmm7 +; SSE2-NEXT: movups (%rsi), %xmm5 +; SSE2-NEXT: movups 16(%rsi), %xmm8 +; SSE2-NEXT: movups (%rdx), %xmm6 +; SSE2-NEXT: movups 16(%rdx), %xmm3 +; SSE2-NEXT: movups (%rcx), %xmm0 ; SSE2-NEXT: movups 16(%rcx), %xmm4 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2] -; SSE2-NEXT: movaps %xmm7, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0],xmm0[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[0,2] -; SSE2-NEXT: movaps %xmm7, %xmm3 -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[1,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm6[2,2] -; SSE2-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,3,2,2] -; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,2],xmm5[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[0,2] -; SSE2-NEXT: movdqa %xmm2, %xmm6 -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] -; SSE2-NEXT: movaps %xmm4, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm6[3,0] -; SSE2-NEXT: shufps 
{{.*#+}} xmm6 = xmm6[0,1],xmm7[0,2] +; SSE2-NEXT: movaps %xmm0, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,0],xmm5[1,0] +; SSE2-NEXT: movaps %xmm5, %xmm1 +; SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] +; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm7[0,2] +; SSE2-NEXT: movaps %xmm5, %xmm7 +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm6[2,1] +; SSE2-NEXT: movaps %xmm0, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm6[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,2],xmm0[3,2] +; SSE2-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm6[0,2] +; SSE2-NEXT: movaps %xmm4, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,0],xmm8[1,0] +; SSE2-NEXT: movaps %xmm8, %xmm6 +; SSE2-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE2-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2] +; SSE2-NEXT: movaps %xmm8, %xmm5 +; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[2,1] ; SSE2-NEXT: movaps %xmm4, %xmm7 -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[1,0] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[2,2] -; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm2[0,2] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,2] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,2],xmm2[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] -; SSE2-NEXT: movups %xmm2, 80(%rdi) +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm3[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[0,2] +; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,2],xmm4[3,2] +; SSE2-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm8[2],xmm4[3],xmm8[3] +; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm3[0,2] +; SSE2-NEXT: movups %xmm4, 80(%rdi) ; SSE2-NEXT: movups %xmm7, 64(%rdi) ; SSE2-NEXT: movups %xmm6, 48(%rdi) -; SSE2-NEXT: movups %xmm5, 32(%rdi) -; SSE2-NEXT: movups %xmm3, 16(%rdi) -; SSE2-NEXT: movups %xmm0, (%rdi) +; SSE2-NEXT: movups %xmm0, 32(%rdi) +; SSE2-NEXT: movups %xmm2, 16(%rdi) +; SSE2-NEXT: movups %xmm1, (%rdi) ; SSE2-NEXT: retq ; ; SSE42-LABEL: interleave_24i32_in: Index: test/CodeGen/X86/pmul.ll =================================================================== --- test/CodeGen/X86/pmul.ll +++ test/CodeGen/X86/pmul.ll @@ -1318,83 +1318,76 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-LABEL: mul_v8i64_sext: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: psrad $16, %xmm9 -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 ; SSE2-NEXT: psrad $31, %xmm3 ; SSE2-NEXT: psrad $16, %xmm0 ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm8, 
%xmm3 -; SSE2-NEXT: psrad $31, %xmm3 -; SSE2-NEXT: psrad $16, %xmm8 -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm7 = xmm1[0,2,2,3,4,5,6,7] -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; SSE2-NEXT: movdqa %xmm3, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE2-NEXT: movdqa %xmm2, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,0,1] -; SSE2-NEXT: movdqa %xmm1, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; SSE2-NEXT: movdqa %xmm5, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1] +; SSE2-NEXT: pmuludq %xmm5, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE2-NEXT: pmuludq %xmm0, %xmm4 +; SSE2-NEXT: paddq %xmm3, %xmm4 +; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pmuludq %xmm5, %xmm0 +; SSE2-NEXT: movdqa %xmm3, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: psrad $16, %xmm3 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] +; SSE2-NEXT: psllq $32, %xmm4 +; SSE2-NEXT: paddq %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] +; SSE2-NEXT: pmuludq %xmm1, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE2-NEXT: pmuludq %xmm3, %xmm4 +; SSE2-NEXT: paddq %xmm5, %xmm4 +; SSE2-NEXT: movdqa %xmm6, %xmm5 ; SSE2-NEXT: psrad $31, %xmm5 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: movdqa %xmm4, %xmm5 -; SSE2-NEXT: psrlq $32, %xmm5 -; SSE2-NEXT: pmuludq %xmm0, %xmm5 -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: psrlq $32, %xmm6 -; SSE2-NEXT: pmuludq %xmm4, %xmm6 -; SSE2-NEXT: paddq %xmm5, %xmm6 -; SSE2-NEXT: psllq $32, %xmm6 -; SSE2-NEXT: pmuludq %xmm4, %xmm0 -; SSE2-NEXT: paddq %xmm6, %xmm0 -; SSE2-NEXT: movdqa %xmm7, %xmm4 -; SSE2-NEXT: psrlq $32, %xmm4 -; SSE2-NEXT: pmuludq %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm1, %xmm5 -; SSE2-NEXT: psrlq $32, %xmm5 -; SSE2-NEXT: pmuludq %xmm7, %xmm5 -; SSE2-NEXT: paddq %xmm4, %xmm5 -; SSE2-NEXT: psllq $32, %xmm5 -; SSE2-NEXT: pmuludq %xmm7, %xmm1 -; SSE2-NEXT: paddq %xmm5, %xmm1 +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] +; SSE2-NEXT: pmuludq %xmm3, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] +; SSE2-NEXT: psllq $32, %xmm4 +; SSE2-NEXT: paddq %xmm4, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: psrlq $32, %xmm4 -; SSE2-NEXT: pmuludq %xmm9, %xmm4 -; SSE2-NEXT: movdqa %xmm9, %xmm5 -; SSE2-NEXT: psrlq $32, %xmm5 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] ; SSE2-NEXT: pmuludq %xmm2, %xmm5 -; SSE2-NEXT: paddq %xmm4, %xmm5 -; SSE2-NEXT: psllq $32, %xmm5 -; 
SSE2-NEXT: pmuludq %xmm9, %xmm2 -; SSE2-NEXT: paddq %xmm5, %xmm2 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE2-NEXT: pmuludq %xmm6, %xmm4 +; SSE2-NEXT: paddq %xmm5, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm8[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pmuludq %xmm6, %xmm2 +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] +; SSE2-NEXT: psllq $32, %xmm4 +; SSE2-NEXT: paddq %xmm4, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 -; SSE2-NEXT: psrlq $32, %xmm4 -; SSE2-NEXT: pmuludq %xmm8, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm5 -; SSE2-NEXT: psrlq $32, %xmm5 -; SSE2-NEXT: pmuludq %xmm3, %xmm5 -; SSE2-NEXT: paddq %xmm4, %xmm5 -; SSE2-NEXT: psllq $32, %xmm5 -; SSE2-NEXT: pmuludq %xmm8, %xmm3 -; SSE2-NEXT: paddq %xmm5, %xmm3 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1] +; SSE2-NEXT: pmuludq %xmm3, %xmm6 +; SSE2-NEXT: pmuludq %xmm5, %xmm4 +; SSE2-NEXT: paddq %xmm6, %xmm4 +; SSE2-NEXT: pmuludq %xmm5, %xmm3 +; SSE2-NEXT: psllq $32, %xmm4 +; SSE2-NEXT: paddq %xmm4, %xmm3 ; SSE2-NEXT: retq ; ; SSE41-LABEL: mul_v8i64_sext: Index: test/CodeGen/X86/pr29112.ll =================================================================== --- test/CodeGen/X86/pr29112.ll +++ test/CodeGen/X86/pr29112.ll @@ -8,63 +8,61 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <4 x float> %a4, <16 x float>%c1, <16 x float>%c2) { ; CHECK-LABEL: bar: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $88, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 96 -; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: subq $72, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: vmovaps %xmm1, %xmm8 ; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm1 -; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm8 -; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm8[0],xmm1[0],xmm8[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[0,1],xmm2[1],xmm9[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm15 = xmm0[0,1,2],xmm3[1] -; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm8[0],xmm1[1],xmm8[2,3] +; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm5 +; CHECK-NEXT: vunpcklps {{.*#+}} xmm10 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm10[0,1],xmm2[1],xmm10[3] +; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm5[0],xmm1[1],xmm5[2,3] +; CHECK-NEXT: vmovshdup {{.*#+}} xmm6 = xmm5[1,1,3,3] +; CHECK-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm6[0,1],xmm2[1],xmm6[3] +; CHECK-NEXT: vextractf32x4 $3, %zmm3, %xmm7 +; CHECK-NEXT: vunpcklps {{.*#+}} xmm4 = xmm5[0],xmm7[0],xmm5[1],xmm7[1] ; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm2[1],xmm4[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm11[0,1,2],xmm3[1] +; CHECK-NEXT: vaddps %xmm4, %xmm6, %xmm12 +; CHECK-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] +; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm5[0],xmm7[2],zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm2[1],xmm7[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm4[0] +; CHECK-NEXT: vpermilps {{.*#+}} xmm4 = xmm2[3,1,2,3] +; CHECK-NEXT: vunpcklps 
{{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm5[0],xmm1[2],zero,zero +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1,2],xmm3[1] +; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm9[0,1],xmm2[1],xmm9[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm5[0,1,2],xmm3[1] ; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm4[0,1],xmm2[1],xmm4[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm9 = xmm7[0,1],xmm2[1],xmm7[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0,1],xmm2[3],xmm10[3] ; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4 ; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3] -; CHECK-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1] -; CHECK-NEXT: vmovshdup {{.*#+}} xmm7 = xmm8[1,1,3,3] -; CHECK-NEXT: vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm7[0,1],xmm2[1],xmm7[3] -; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm3[3] -; CHECK-NEXT: vblendps {{.*#+}} xmm11 = xmm0[0,1,2],xmm3[3] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm12 = xmm3[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm1[1,0] -; CHECK-NEXT: vextractf32x4 $3, %zmm3, %xmm0 -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1] -; CHECK-NEXT: vaddps %xmm14, %xmm1, %xmm10 -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1],xmm0[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm13[0] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm13[0],xmm8[2,3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm9[0,1],xmm2[3],xmm9[3] -; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm12[0] -; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm2 -; CHECK-NEXT: vmovaps %xmm15, %xmm1 -; CHECK-NEXT: vmovaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vaddps %xmm0, %xmm15, %xmm9 -; CHECK-NEXT: vaddps %xmm15, %xmm15, %xmm8 -; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm0 +; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3] +; CHECK-NEXT: vblendps {{.*#+}} xmm10 = xmm0[0,1,2],xmm3[3] +; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm9[0,1,2],xmm3[1] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm2 ; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm15, %xmm0 -; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vaddps %xmm13, %xmm1, %xmm9 +; CHECK-NEXT: vaddps %xmm12, %xmm0, %xmm0 +; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm3 +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovaps %xmm9, (%rsp) -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; CHECK-NEXT: vmovaps %xmm8, %xmm3 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo ; CHECK-NEXT: vmovaps 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: vaddps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 -; CHECK-NEXT: addq $88, %rsp +; CHECK-NEXT: addq $72, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq %a1 = shufflevector <16 x float>%c1, <16 x float>%c2, <4 x i32> Index: test/CodeGen/X86/pr34592.ll =================================================================== --- test/CodeGen/X86/pr34592.ll +++ test/CodeGen/X86/pr34592.ll @@ -10,7 +10,7 @@ ; CHECK-NEXT: movq %rsp, %rbp ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: andq $-32, %rsp -; CHECK-NEXT: subq $320, %rsp # imm = 0x140 +; CHECK-NEXT: subq $352, %rsp # imm = 0x160 ; CHECK-NEXT: vmovaps 240(%rbp), %ymm8 ; CHECK-NEXT: vmovaps 208(%rbp), %ymm9 ; CHECK-NEXT: vmovaps 176(%rbp), %ymm10 @@ -21,21 +21,23 @@ ; CHECK-NEXT: vmovaps 16(%rbp), %ymm15 ; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; CHECK-NEXT: vxorps %xmm6, %xmm6, %xmm6 -; CHECK-NEXT: vpblendd {{.*#+}} ymm8 = ymm6[0,1],ymm8[2,3,4,5,6,7] +; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1],ymm8[2,3,4,5,6,7] ; CHECK-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1],ymm11[2,3,4,5,6,7] ; CHECK-NEXT: # kill: def $xmm9 killed $xmm9 killed $ymm9 ; CHECK-NEXT: vmovdqa %xmm9, %xmm11 ; CHECK-NEXT: # kill: def $ymm11 killed $xmm11 ; CHECK-NEXT: vpalignr {{.*#+}} ymm6 = ymm2[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23] ; CHECK-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,0] -; CHECK-NEXT: vmovaps %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill +; CHECK-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: # implicit-def: $ymm0 ; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm0 ; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5],ymm6[6,7] ; CHECK-NEXT: vmovaps %xmm2, %xmm9 ; CHECK-NEXT: # implicit-def: $ymm2 ; CHECK-NEXT: vinserti128 $1, %xmm9, %ymm2, %ymm2 -; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm7[0],ymm8[0],ymm7[2],ymm8[2] +; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload +; CHECK-NEXT: vpunpcklqdq {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[2],ymm6[2] ; CHECK-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3] ; CHECK-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7] ; CHECK-NEXT: vmovaps %xmm7, %xmm9 @@ -49,15 +51,15 @@ ; CHECK-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,1,3] ; CHECK-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,0,1,4,5,4,5] ; CHECK-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7] -; CHECK-NEXT: vmovaps %ymm1, {{[0-9]+}}(%rsp) # 32-byte Spill +; CHECK-NEXT: vmovaps %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vmovaps %ymm5, %ymm1 -; CHECK-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp) # 32-byte Spill +; CHECK-NEXT: vmovaps %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vmovaps %ymm6, %ymm3 -; CHECK-NEXT: vmovaps %ymm15, {{[0-9]+}}(%rsp) # 32-byte Spill -; CHECK-NEXT: vmovaps %ymm10, {{[0-9]+}}(%rsp) # 32-byte Spill -; CHECK-NEXT: vmovaps %ymm13, {{[0-9]+}}(%rsp) # 32-byte Spill -; CHECK-NEXT: vmovaps %ymm12, {{[0-9]+}}(%rsp) # 32-byte Spill -; CHECK-NEXT: vmovaps %ymm4, {{[0-9]+}}(%rsp) # 32-byte Spill +; CHECK-NEXT: vmovaps %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovaps %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovaps %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill +; CHECK-NEXT: vmovaps %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 
32-byte Spill +; CHECK-NEXT: vmovaps %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-NEXT: vmovaps %ymm14, (%rsp) # 32-byte Spill ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp Index: test/CodeGen/X86/shrink_vmul.ll =================================================================== --- test/CodeGen/X86/shrink_vmul.ll +++ test/CodeGen/X86/shrink_vmul.ll @@ -1235,17 +1235,11 @@ ; X86-SSE-NEXT: pxor %xmm2, %xmm2 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X86-SSE-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: psrlq $32, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X86-SSE-NEXT: paddq %xmm2, %xmm3 -; X86-SSE-NEXT: psllq $32, %xmm3 ; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE-NEXT: paddq %xmm3, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X86-SSE-NEXT: psllq $32, %xmm2 +; X86-SSE-NEXT: paddq %xmm1, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: .cfi_def_cfa_offset 4 @@ -1282,17 +1276,11 @@ ; X64-SSE-NEXT: pxor %xmm2, %xmm2 ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] -; X64-SSE-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE-NEXT: psrlq $32, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X64-SSE-NEXT: movdqa %xmm0, %xmm3 -; X64-SSE-NEXT: psrlq $32, %xmm3 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm3 -; X64-SSE-NEXT: paddq %xmm2, %xmm3 -; X64-SSE-NEXT: psllq $32, %xmm3 ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X64-SSE-NEXT: paddq %xmm3, %xmm1 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 +; X64-SSE-NEXT: psllq $32, %xmm2 +; X64-SSE-NEXT: paddq %xmm1, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; @@ -2059,14 +2047,12 @@ ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65536,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm0 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [0,0,65536,0] +; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 +; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 +; X86-SSE-NEXT: psllq $32, %xmm1 +; X86-SSE-NEXT: paddq %xmm0, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -2091,15 +2077,13 @@ ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000 -; X64-SSE-NEXT: movq %rcx, %xmm1 -; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 +; X64-SSE-NEXT: movq %rcx, %xmm2 +; X64-SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; X64-SSE-NEXT: 
pmuludq %xmm2, %xmm0 ; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psrlq $32, %xmm0 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: psllq $32, %xmm2 +; X64-SSE-NEXT: paddq %xmm0, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; @@ -2145,13 +2129,12 @@ ; X86-SSE-NEXT: psrad $16, %xmm0 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,0,32768,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: psrlq $32, %xmm0 ; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: psllq $32, %xmm0 -; X86-SSE-NEXT: paddq %xmm2, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X86-SSE-NEXT: pxor %xmm2, %xmm2 +; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X86-SSE-NEXT: psllq $32, %xmm2 +; X86-SSE-NEXT: paddq %xmm0, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl ; @@ -2176,13 +2159,12 @@ ; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 ; X64-SSE-NEXT: movq %rcx, %xmm1 ; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-SSE-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psrlq $32, %xmm0 ; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: psllq $32, %xmm0 -; X64-SSE-NEXT: paddq %xmm2, %xmm0 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-SSE-NEXT: pxor %xmm2, %xmm2 +; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 +; X64-SSE-NEXT: psllq $32, %xmm2 +; X64-SSE-NEXT: paddq %xmm0, %xmm2 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; Index: test/CodeGen/X86/sse2-schedule.ll =================================================================== --- test/CodeGen/X86/sse2-schedule.ll +++ test/CodeGen/X86/sse2-schedule.ll @@ -15003,141 +15003,131 @@ define <2 x double> @test_unpcklpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; GENERIC-LABEL: test_unpcklpd: ; GENERIC: # %bb.0: -; GENERIC-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; GENERIC-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00] -; GENERIC-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00] -; GENERIC-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] -; GENERIC-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; GENERIC-NEXT: movapd %xmm0, %xmm2 # sched: [1:1.00] +; GENERIC-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00] +; GENERIC-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00] +; GENERIC-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00] ; GENERIC-NEXT: retq # sched: [1:1.00] ; ; ATOM-LABEL: test_unpcklpd: ; ATOM: # %bb.0: -; ATOM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; ATOM-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.50] -; ATOM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [1:1.00] -; ATOM-NEXT: addpd %xmm0, %xmm1 # sched: [6:3.00] -; ATOM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50] +; ATOM-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.50] +; ATOM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [1:1.00] +; ATOM-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00] +; ATOM-NEXT: addpd %xmm2, %xmm0 # sched: [6:3.00] ; ATOM-NEXT: retq # sched: [79:39.50] ; ; SLM-LABEL: test_unpcklpd: ; SLM: # %bb.0: -; SLM-NEXT: unpcklpd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0] sched: [1:1.00] -; SLM-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.50] -; SLM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00] -; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] -; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50] +; SLM-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.50] +; SLM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [4:1.00] +; SLM-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00] +; SLM-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-SSE-LABEL: test_unpcklpd: ; SANDY-SSE: # %bb.0: -; SANDY-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; SANDY-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00] -; SANDY-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00] -; SANDY-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] -; SANDY-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; SANDY-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:1.00] +; SANDY-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00] +; SANDY-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00] +; SANDY-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00] ; SANDY-SSE-NEXT: retq # sched: [1:1.00] ; ; SANDY-LABEL: test_unpcklpd: ; SANDY: # %bb.0: -; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00] -; SANDY-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; SANDY-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:1.00] +; SANDY-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00] +; SANDY-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; SANDY-NEXT: retq # sched: [1:1.00] ; ; HASWELL-SSE-LABEL: test_unpcklpd: ; HASWELL-SSE: # %bb.0: -; HASWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; HASWELL-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00] -; HASWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00] -; HASWELL-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] -; HASWELL-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; HASWELL-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:1.00] +; HASWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00] +; HASWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00] +; HASWELL-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00] ; HASWELL-SSE-NEXT: retq # sched: [7:1.00] ; ; HASWELL-LABEL: test_unpcklpd: ; HASWELL: # %bb.0: -; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00] -; HASWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:1.00] +; HASWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00] +; HASWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; HASWELL-NEXT: retq # sched: [7:1.00] ; ; BROADWELL-SSE-LABEL: test_unpcklpd: ; BROADWELL-SSE: # %bb.0: -; BROADWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; BROADWELL-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00] -; BROADWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] -; BROADWELL-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] -; BROADWELL-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00] +; BROADWELL-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:1.00] +; BROADWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00] +; BROADWELL-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: 
[6:1.00] +; BROADWELL-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00] ; BROADWELL-SSE-NEXT: retq # sched: [7:1.00] ; ; BROADWELL-LABEL: test_unpcklpd: ; BROADWELL: # %bb.0: -; BROADWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; BROADWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00] -; BROADWELL-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BROADWELL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:1.00] +; BROADWELL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [6:1.00] +; BROADWELL-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BROADWELL-NEXT: retq # sched: [7:1.00] ; ; SKYLAKE-SSE-LABEL: test_unpcklpd: ; SKYLAKE-SSE: # %bb.0: -; SKYLAKE-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; SKYLAKE-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.33] -; SKYLAKE-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00] -; SKYLAKE-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [4:0.50] -; SKYLAKE-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.33] +; SKYLAKE-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.33] +; SKYLAKE-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00] +; SKYLAKE-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00] +; SKYLAKE-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [4:0.50] ; SKYLAKE-SSE-NEXT: retq # sched: [7:1.00] ; ; SKYLAKE-LABEL: test_unpcklpd: ; SKYLAKE: # %bb.0: -; SKYLAKE-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; SKYLAKE-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00] -; SKYLAKE-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKYLAKE-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:1.00] +; SKYLAKE-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00] +; SKYLAKE-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50] ; SKYLAKE-NEXT: retq # sched: [7:1.00] ; ; SKX-SSE-LABEL: test_unpcklpd: ; SKX-SSE: # %bb.0: -; SKX-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; SKX-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.33] -; SKX-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [7:1.00] -; SKX-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [4:0.50] -; SKX-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.33] +; SKX-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.33] +; SKX-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:1.00] +; SKX-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00] +; SKX-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [4:0.50] ; SKX-SSE-NEXT: retq # sched: [7:1.00] ; ; SKX-LABEL: test_unpcklpd: ; SKX: # %bb.0: -; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00] -; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [7:1.00] -; SKX-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [4:0.50] +; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:1.00] +; SKX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [7:1.00] +; SKX-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [4:0.50] ; SKX-NEXT: retq # sched: [7:1.00] ; ; BTVER2-SSE-LABEL: test_unpcklpd: ; BTVER2-SSE: # %bb.0: -; BTVER2-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] -; BTVER2-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.50] -; BTVER2-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [6:1.00] -; BTVER2-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] -; BTVER2-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50] +; BTVER2-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.50] +; BTVER2-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [6:1.00] +; BTVER2-SSE-NEXT: 
unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:0.50] +; BTVER2-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00] ; BTVER2-SSE-NEXT: retq # sched: [4:1.00] ; ; BTVER2-LABEL: test_unpcklpd: ; BTVER2: # %bb.0: -; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] -; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [6:1.00] -; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:0.50] +; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [6:1.00] +; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: retq # sched: [4:1.00] ; ; ZNVER1-SSE-LABEL: test_unpcklpd: ; ZNVER1-SSE: # %bb.0: -; ZNVER1-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] -; ZNVER1-SSE-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.25] -; ZNVER1-SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50] -; ZNVER1-SSE-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00] -; ZNVER1-SSE-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.25] +; ZNVER1-SSE-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.25] +; ZNVER1-SSE-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [8:0.50] +; ZNVER1-SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] sched: [1:0.50] +; ZNVER1-SSE-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00] ; ZNVER1-SSE-NEXT: retq # sched: [1:0.50] ; ; ZNVER1-LABEL: test_unpcklpd: ; ZNVER1: # %bb.0: -; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50] -; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],mem[0] sched: [8:0.50] -; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0] sched: [1:0.50] +; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],mem[0] sched: [8:0.50] +; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00] ; ZNVER1-NEXT: retq # sched: [1:0.50] %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> %2 = load <2 x double>, <2 x double> *%a2, align 16 Index: test/CodeGen/X86/sse41-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse41-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse41-intrinsics-fast-isel.ll @@ -832,26 +832,25 @@ define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; SSE-LABEL: test_mm_mul_epi32: ; SSE: # %bb.0: +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: psllq $32, %xmm2 +; SSE-NEXT: psrad $31, %xmm2 +; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] ; SSE-NEXT: psrad $31, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7] -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] -; SSE-NEXT: pmuldq %xmm1, %xmm0 +; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] +; SSE-NEXT: pmuldq %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_mm_mul_epi32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm2 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2 -; AVX1-NEXT: 
vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2 +; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: ret{{[l|q]}} Index: test/CodeGen/X86/vec_insert-3.ll =================================================================== --- test/CodeGen/X86/vec_insert-3.ll +++ test/CodeGen/X86/vec_insert-3.ll @@ -7,9 +7,10 @@ ; X32: # %bb.0: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0] -; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] +; X32-NEXT: movaps %xmm0, %xmm2 +; X32-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] +; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm2[2,0] ; X32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X32-NEXT: retl ; Index: test/CodeGen/X86/vector-constrained-fp-intrinsics.ll =================================================================== --- test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -537,30 +537,29 @@ define <3 x double> @constrained_vector_fma_v3f64() { ; NO-FMA-LABEL: constrained_vector_fma_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $24, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: subq $56, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 64 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; NO-FMA-NEXT: callq fma -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; NO-FMA-NEXT: callq fma -; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero ; NO-FMA-NEXT: callq fma ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps %xmm0, %xmm1 -; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: addq $56, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -964,27 +963,26 @@ define <3 x double> @constrained_vector_pow_v3f64() { ; NO-FMA-LABEL: constrained_vector_pow_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $24, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: subq $56, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 64 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: callq pow -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd 
{{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: callq pow -; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; NO-FMA-NEXT: callq pow ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps %xmm0, %xmm1 -; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: addq $56, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -1190,27 +1188,26 @@ define <3 x double> @constrained_vector_powi_v3f64() { ; NO-FMA-LABEL: constrained_vector_powi_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $24, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: subq $56, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 64 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movl $3, %edi ; NO-FMA-NEXT: callq __powidf2 -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movl $3, %edi ; NO-FMA-NEXT: callq __powidf2 -; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: movl $3, %edi ; NO-FMA-NEXT: callq __powidf2 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps %xmm0, %xmm1 -; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: addq $56, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -1404,24 +1401,23 @@ define <3 x double> @constrained_vector_sin_v3f64() { ; NO-FMA-LABEL: constrained_vector_sin_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $24, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: subq $56, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 64 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq sin -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq sin -; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq sin ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 
16-byte Reload -; NO-FMA-NEXT: movaps %xmm0, %xmm1 -; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: addq $56, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -1601,24 +1597,23 @@ define <3 x double> @constrained_vector_cos_v3f64() { ; NO-FMA-LABEL: constrained_vector_cos_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $24, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: subq $56, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 64 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq cos -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq cos -; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq cos ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps %xmm0, %xmm1 -; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: addq $56, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -1798,24 +1793,23 @@ define <3 x double> @constrained_vector_exp_v3f64() { ; NO-FMA-LABEL: constrained_vector_exp_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $24, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: subq $56, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 64 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq exp -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq exp -; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq exp ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps %xmm0, %xmm1 -; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: addq $56, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -1995,24 +1989,23 @@ define <3 x double> @constrained_vector_exp2_v3f64() { ; NO-FMA-LABEL: constrained_vector_exp2_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $24, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: subq $56, %rsp +; 
NO-FMA-NEXT: .cfi_def_cfa_offset 64 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq exp2 -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq exp2 -; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq exp2 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps %xmm0, %xmm1 -; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: addq $56, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -2192,24 +2185,23 @@ define <3 x double> @constrained_vector_log_v3f64() { ; NO-FMA-LABEL: constrained_vector_log_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $24, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: subq $56, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 64 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq log -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq log -; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq log ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps %xmm0, %xmm1 -; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: addq $56, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -2389,24 +2381,23 @@ define <3 x double> @constrained_vector_log10_v3f64() { ; NO-FMA-LABEL: constrained_vector_log10_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $24, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: subq $56, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 64 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq log10 -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq log10 -; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; 
NO-FMA-NEXT: callq log10 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps %xmm0, %xmm1 -; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: addq $56, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -2586,24 +2577,23 @@ define <3 x double> @constrained_vector_log2_v3f64() { ; NO-FMA-LABEL: constrained_vector_log2_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $24, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: subq $56, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 64 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq log2 -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq log2 -; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq log2 ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps %xmm0, %xmm1 -; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: addq $56, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -2765,24 +2755,23 @@ define <3 x double> @constrained_vector_rint_v3f64() { ; NO-FMA-LABEL: constrained_vector_rint_v3f64: ; NO-FMA: # %bb.0: # %entry -; NO-FMA-NEXT: subq $24, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: subq $56, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 64 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq rint -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq rint -; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq rint ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps %xmm0, %xmm1 -; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: addq $56, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; @@ -2912,24 +2901,23 @@ define <3 x double> @constrained_vector_nearby_v3f64() { ; NO-FMA-LABEL: constrained_vector_nearby_v3f64: ; NO-FMA: # %bb.0: # 
%entry -; NO-FMA-NEXT: subq $24, %rsp -; NO-FMA-NEXT: .cfi_def_cfa_offset 32 +; NO-FMA-NEXT: subq $56, %rsp +; NO-FMA-NEXT: .cfi_def_cfa_offset 64 ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq nearbyint -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq nearbyint -; NO-FMA-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; NO-FMA-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; NO-FMA-NEXT: # xmm0 = xmm0[0],mem[0] -; NO-FMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; NO-FMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; NO-FMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; NO-FMA-NEXT: callq nearbyint ; NO-FMA-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload -; NO-FMA-NEXT: movaps %xmm0, %xmm1 -; NO-FMA-NEXT: movhlps {{.*#+}} xmm1 = xmm0[1],xmm1[1] ; NO-FMA-NEXT: fldl {{[0-9]+}}(%rsp) -; NO-FMA-NEXT: addq $24, %rsp +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; NO-FMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; NO-FMA-NEXT: addq $56, %rsp ; NO-FMA-NEXT: .cfi_def_cfa_offset 8 ; NO-FMA-NEXT: retq ; Index: test/CodeGen/X86/vector-reduce-mul.ll =================================================================== --- test/CodeGen/X86/vector-reduce-mul.ll +++ test/CodeGen/X86/vector-reduce-mul.ll @@ -19,8 +19,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -34,7 +34,7 @@ ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -48,7 +48,7 @@ ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX512BW-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX512BW-NEXT: vpsrlq $32, %xmm1, %xmm3 +; AVX512BW-NEXT: vpsrldq {{.*#+}} xmm3 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpaddq %xmm2, %xmm3, %xmm2 ; AVX512BW-NEXT: vpsllq $32, %xmm2, %xmm2 @@ -107,8 +107,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -153,11 +153,11 @@ ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq 
%ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 @@ -177,11 +177,11 @@ ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX512BW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 +; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVX512BW-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 +; AVX512BW-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 @@ -274,8 +274,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -345,11 +345,11 @@ ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 @@ -527,8 +527,8 @@ ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 ; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: pmuludq %xmm0, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 @@ -648,11 +648,11 @@ ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = 
ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 +; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 +; AVX2-NEXT: vpmuludq %ymm1, %ymm3, %ymm3 +; AVX2-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 @@ -779,13 +779,13 @@ ; SSE2-LABEL: test_v4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,3,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -829,13 +829,13 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] +; SSE2-NEXT: pmuludq %xmm0, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,2,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2] ; SSE2-NEXT: pmuludq %xmm2, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -901,29 +901,29 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm4, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: 
pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE2-NEXT: pmuludq %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,2,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,2,2] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 @@ -993,63 +993,63 @@ ; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm2[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm6, %xmm2 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm8, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm9 = xmm6[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm9 +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,2,2,3] ; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm8 = xmm0[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm4, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm6, %xmm4 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm3[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm8, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] +; SSE2-NEXT: pmuludq %xmm2, %xmm6 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm7, %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm7[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] ; SSE2-NEXT: pmuludq %xmm5, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm5 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3] +; SSE2-NEXT: pmuludq %xmm2, %xmm5 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: pmuludq %xmm3, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm4, %xmm3 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,0,2,2] +; SSE2-NEXT: pmuludq %xmm2, %xmm3 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pmuludq %xmm4, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] +; SSE2-NEXT: pmuludq %xmm2, %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] +; SSE2-NEXT: pmuludq %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,2] ; SSE2-NEXT: pmuludq %xmm2, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm3, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm1, %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; SSE2-NEXT: pmuludq %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: retq ; ; SSE41-LABEL: test_v32i32: @@ -1841,9 +1841,9 @@ ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero @@ -1851,15 +1851,14 @@ ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero -; AVX1-NEXT: vpmullw %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpand %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm2 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] @@ -2229,9 +2228,9 @@ ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm3, %xmm4, %xmm3 ; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero @@ -2239,15 +2238,14 @@ ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm1, %xmm3, %xmm1 
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] @@ -2782,9 +2780,9 @@ ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero @@ -2792,15 +2790,14 @@ ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX1-NEXT: vpmullw %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpmullw %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX1-NEXT: vpmullw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 -; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: vpsrld $16, %xmm0, %xmm1 ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] Index: test/CodeGen/X86/vector-sext.ll =================================================================== --- test/CodeGen/X86/vector-sext.ll +++ test/CodeGen/X86/vector-sext.ll @@ -2190,8 +2190,8 @@ ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $6, %ecx ; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $5, %ecx ; SSE2-NEXT: andl $1, %ecx @@ -2199,9 +2199,9 @@ ; 
SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $4, %ecx ; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $3, %ecx ; SSE2-NEXT: andl $1, %ecx @@ -2209,22 +2209,20 @@ ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $2, %ecx ; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: movd %ecx, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: shrl %eax ; SSE2-NEXT: andl $1, %eax -; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE2-NEXT: pslld $31, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: pslld $31, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: retq @@ -2238,8 +2236,8 @@ ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $6, %ecx ; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $5, %ecx ; SSSE3-NEXT: andl $1, %ecx @@ -2247,9 +2245,9 @@ ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $4, %ecx ; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $3, %ecx ; SSSE3-NEXT: andl $1, %ecx @@ -2257,22 +2255,20 @@ ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $2, %ecx ; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSSE3-NEXT: movd %ecx, %xmm2 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: andl $1, 
%ecx -; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: shrl %eax ; SSSE3-NEXT: andl $1, %eax -; SSSE3-NEXT: movd %eax, %xmm0 -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: movd %eax, %xmm3 +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSSE3-NEXT: pslld $31, %xmm0 ; SSSE3-NEXT: psrad $31, %xmm0 -; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSSE3-NEXT: pslld $31, %xmm1 ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: retq @@ -3017,62 +3013,60 @@ ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $10, %ecx ; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $9, %ecx ; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm3 +; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $8, %ecx ; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $7, %ecx ; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $6, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $5, %ecx ; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $4, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm3 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 
+; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $3, %ecx ; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: shrl $2, %ecx ; SSE2-NEXT: andl $1, %ecx ; SSE2-NEXT: movd %ecx, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: movl %eax, %ecx ; SSE2-NEXT: andl $1, %ecx -; SSE2-NEXT: movd %ecx, %xmm1 +; SSE2-NEXT: movd %ecx, %xmm0 ; SSE2-NEXT: shrl %eax ; SSE2-NEXT: andl $1, %eax ; SSE2-NEXT: movd %eax, %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: psllw $15, %xmm0 ; SSE2-NEXT: psraw $15, %xmm0 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: psllw $15, %xmm1 ; SSE2-NEXT: psraw $15, %xmm1 ; SSE2-NEXT: retq @@ -3105,62 +3099,60 @@ ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $10, %ecx ; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $9, %ecx ; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm3 +; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $8, %ecx ; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm0 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSSE3-NEXT: movd 
%ecx, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $7, %ecx ; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $6, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $5, %ecx ; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $4, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm3 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $3, %ecx ; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: shrl $2, %ecx ; SSSE3-NEXT: andl $1, %ecx ; SSSE3-NEXT: movd %ecx, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSSE3-NEXT: movl %eax, %ecx ; SSSE3-NEXT: andl $1, %ecx -; SSSE3-NEXT: movd %ecx, %xmm1 +; SSSE3-NEXT: movd %ecx, %xmm0 ; SSSE3-NEXT: shrl %eax ; SSSE3-NEXT: andl $1, %eax ; SSSE3-NEXT: movd %eax, %xmm4 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] -; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] -; SSSE3-NEXT: movdqa %xmm1, %xmm0 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; SSSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: psllw $15, %xmm0 ; SSSE3-NEXT: psraw $15, %xmm0 -; SSSE3-NEXT: punpckhbw {{.*#+}} xmm1 = 
xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSSE3-NEXT: psllw $15, %xmm1 ; SSSE3-NEXT: psraw $15, %xmm1 ; SSSE3-NEXT: retq Index: test/CodeGen/X86/vector-shuffle-128-v4.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-128-v4.ll +++ test/CodeGen/X86/vector-shuffle-128-v4.ll @@ -1114,31 +1114,28 @@ ; SSE2-LABEL: shuffle_v4f32_0z24: ; SSE2: # %bb.0: ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSE2-NEXT: movaps %xmm1, %xmm0 +; SSE2-NEXT: xorps %xmm2, %xmm2 +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0] +; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] +; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq ; ; SSE3-LABEL: shuffle_v4f32_0z24: ; SSE3: # %bb.0: ; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] -; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSE3-NEXT: xorps %xmm1, %xmm1 -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSE3-NEXT: movaps %xmm1, %xmm0 +; SSE3-NEXT: xorps %xmm2, %xmm2 +; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0] +; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] +; SSE3-NEXT: movaps %xmm2, %xmm0 ; SSE3-NEXT: retq ; ; SSSE3-LABEL: shuffle_v4f32_0z24: ; SSSE3: # %bb.0: ; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] -; SSSE3-NEXT: xorps %xmm1, %xmm1 -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] -; SSSE3-NEXT: movaps %xmm1, %xmm0 +; SSSE3-NEXT: xorps %xmm2, %xmm2 +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[0,0] +; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[2,0] +; SSSE3-NEXT: movaps %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: shuffle_v4f32_0z24: Index: test/CodeGen/X86/vector-shuffle-256-v4.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v4.ll +++ test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -1567,8 +1567,8 @@ define <4 x i64> @shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_1234: ; AVX1: # %bb.0: -; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] +; AVX1-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,0,1] ; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm1[0],ymm0[3],ymm1[2] ; AVX1-NEXT: retq ; Index: test/CodeGen/X86/vector-shuffle-256-v8.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v8.ll +++ test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -2443,9 +2443,9 @@ define <8 x i32> @shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b) { ; AVX1-LABEL: shuffle_v8i32_12345678: ; AVX1: # %bb.0: -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1] -; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[3,0],ymm1[4,4],ymm0[7,4] +; AVX1-NEXT: vblendps 
{{.*#+}} ymm1 = ymm1[0],ymm0[1,2,3,4,5,6,7] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1] +; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,0],ymm1[3,0],ymm2[4,4],ymm1[7,4] ; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,2],ymm1[2,0],ymm0[5,6],ymm1[6,4] ; AVX1-NEXT: retq ; Index: test/CodeGen/X86/vector-shuffle-combining.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-combining.ll +++ test/CodeGen/X86/vector-shuffle-combining.ll @@ -1849,10 +1849,10 @@ ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSE2-NEXT: retq ; @@ -1861,10 +1861,10 @@ ; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSSE3-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] ; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] ; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; SSSE3-NEXT: retq ; @@ -2764,33 +2764,12 @@ } define <8 x float> @PR22412(<8 x float> %a, <8 x float> %b) { -; SSE2-LABEL: PR22412: -; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSE2-NEXT: movapd %xmm2, %xmm0 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2] -; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2] -; SSE2-NEXT: movaps %xmm3, %xmm1 -; SSE2-NEXT: retq -; -; SSSE3-LABEL: PR22412: -; SSSE3: # %bb.0: # %entry -; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1] -; SSSE3-NEXT: movapd %xmm2, %xmm0 -; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2] -; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2] -; SSSE3-NEXT: movaps %xmm3, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: PR22412: -; SSE41: # %bb.0: # %entry -; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3] -; SSE41-NEXT: movaps %xmm0, %xmm1 -; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm3[3,2] -; SSE41-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm0[3,2] -; SSE41-NEXT: movaps %xmm1, %xmm0 -; SSE41-NEXT: movaps %xmm3, %xmm1 -; SSE41-NEXT: retq +; SSE-LABEL: PR22412: +; SSE: # %bb.0: # %entry +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[3,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[3,2] +; SSE-NEXT: movaps %xmm3, %xmm1 +; SSE-NEXT: retq ; ; AVX1-LABEL: PR22412: ; AVX1: # %bb.0: # %entry Index: test/CodeGen/X86/vector-trunc-math.ll =================================================================== --- 
test/CodeGen/X86/vector-trunc-math.ll +++ test/CodeGen/X86/vector-trunc-math.ll @@ -5510,32 +5510,20 @@ define <4 x i32> @mul_add_const_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { ; SSE-LABEL: mul_add_const_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm2, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm1, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm3, %xmm0 -; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2] +; SSE-NEXT: pmuludq %xmm1, %xmm3 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: psllq $32, %xmm1 +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: pmuludq %xmm4, %xmm2 +; SSE-NEXT: pmuludq %xmm4, %xmm0 +; SSE-NEXT: psllq $32, %xmm0 +; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq ; @@ -5559,37 +5547,34 @@ ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: psrad $31, %xmm3 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrad $31, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrad $31, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrad $31, %xmm4 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm0, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: psrad $31, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: psrad $31, %xmm5 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: psrad $31, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1] +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm1, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm0, %xmm7 +; SSE-NEXT: paddq %xmm6, %xmm7 +; SSE-NEXT: psllq $32, %xmm7 ; SSE-NEXT: pmuludq %xmm0, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: psrlq $32, %xmm0 -; SSE-NEXT: pmuludq %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; 
SSE-NEXT: psrlq $32, %xmm4 +; SSE-NEXT: paddq %xmm7, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm4, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] +; SSE-NEXT: pmuludq %xmm2, %xmm5 +; SSE-NEXT: paddq %xmm3, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 ; SSE-NEXT: pmuludq %xmm2, %xmm4 -; SSE-NEXT: paddq %xmm0, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: paddq %xmm4, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] +; SSE-NEXT: paddq %xmm5, %xmm4 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] ; SSE-NEXT: paddd %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -5614,28 +5599,17 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm3, %xmm6 -; SSE-NEXT: paddq %xmm5, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 ; SSE-NEXT: pmuludq %xmm1, %xmm3 -; SSE-NEXT: paddq %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: pmuludq %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm2, %xmm5 -; SSE-NEXT: paddq %xmm1, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: pmuludq %xmm5, %xmm1 +; SSE-NEXT: psllq $32, %xmm1 +; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: pmuludq %xmm4, %xmm2 -; SSE-NEXT: paddq %xmm5, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2] -; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: pmuludq %xmm4, %xmm5 +; SSE-NEXT: psllq $32, %xmm5 +; SSE-NEXT: paddq %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] +; SSE-NEXT: paddd %xmm5, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: mul_add_multiuse_v4i64_v4i32: Index: test/CodeGen/X86/x86-interleaved-access.ll =================================================================== --- test/CodeGen/X86/x86-interleaved-access.ll +++ test/CodeGen/X86/x86-interleaved-access.ll @@ -1304,59 +1304,59 @@ ; AVX1-LABEL: interleaved_store_vf32_i8_stride3: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm3[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6 -; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; 
AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm9 +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0 -; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vmovups %ymm2, 64(%rdi) -; AVX1-NEXT: vmovups %ymm1, 32(%rdi) -; AVX1-NEXT: vmovups %ymm0, (%rdi) +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-NEXT: vpshufb %xmm4, %xmm6, %xmm6 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vmovups %ymm0, 64(%rdi) +; AVX1-NEXT: vmovups %ymm2, 32(%rdi) +; AVX1-NEXT: vmovups %ymm1, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; ; AVX2-LABEL: interleaved_store_vf32_i8_stride3: ; AVX2: # %bb.0: ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = 
ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm3 +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] +; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX2-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX2-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi) -; AVX2-NEXT: vmovdqu %ymm2, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm1, 32(%rdi) ; AVX2-NEXT: vmovdqu %ymm3, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1364,21 +1364,21 @@ ; AVX512-LABEL: interleaved_store_vf32_i8_stride3: ; AVX512: # %bb.0: ; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] -; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] -; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] -; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] -; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] -; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] -; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] -; AVX512-NEXT: vpalignr 
{{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm3 +; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] +; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] +; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] +; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] +; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] +; AVX512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] +; AVX512-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] +; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm3 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX512-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7] -; AVX512-NEXT: vpshufb %ymm4, %ymm2, %ymm2 -; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] +; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] +; AVX512-NEXT: vpshufb %ymm4, %ymm1, %ymm1 +; AVX512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] ; AVX512-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm1 +; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqu %ymm0, 64(%rdi) ; AVX512-NEXT: vmovdqu64 %zmm1, (%rdi) ; AVX512-NEXT: vzeroupper @@ -1397,65 +1397,67 @@ ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm6[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7 -; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm7[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm7[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] ; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm6 -; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm6[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm10 +; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm10[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] ; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm11 +; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm11[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] ; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6 -; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = 
xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7 -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm12 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] ; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm11[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm6 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm7 = xmm10[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm15 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] +; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm5 +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm13[5,6,7,8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm10 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4] ; AVX1-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[5,6,7,8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm14 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm5, %xmm7, %xmm7 -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm7, %ymm0 -; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpalignr {{.*#+}} xmm13 = 
xmm5[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm7[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm5 = xmm9[5,6,7,8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm9 = xmm15[5,6,7,8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[5,6,7,8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm6[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr $5, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload +; AVX1-NEXT: # xmm6 = mem[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-NEXT: vpshufb %xmm7, %xmm6, %xmm6 +; AVX1-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm2, %ymm2 -; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm5, %xmm14, %xmm6 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1 -; AVX1-NEXT: vpshufb %xmm5, %xmm9, %xmm6 -; AVX1-NEXT: vpshufb %xmm5, %xmm15, %xmm7 -; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6 -; AVX1-NEXT: vpshufb %xmm5, %xmm11, %xmm7 -; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm4, %ymm4 -; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm3 -; AVX1-NEXT: vpshufb %xmm5, %xmm8, %xmm5 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 -; AVX1-NEXT: vmovups %ymm3, 160(%rdi) +; AVX1-NEXT: vpshufb %xmm7, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm7, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX1-NEXT: vpshufb %xmm7, %xmm4, %xmm1 +; AVX1-NEXT: vpshufb %xmm7, %xmm11, %xmm4 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 +; AVX1-NEXT: vpshufb %xmm7, %xmm5, %xmm4 +; AVX1-NEXT: vpshufb %xmm7, %xmm3, %xmm3 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3 +; AVX1-NEXT: vpshufb %xmm7, %xmm9, %xmm4 +; AVX1-NEXT: vpshufb %xmm7, %xmm10, %xmm5 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4 +; AVX1-NEXT: vpshufb %xmm7, %xmm13, %xmm5 +; AVX1-NEXT: vpshufb %xmm7, %xmm8, %xmm6 +; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm6, %ymm5 +; AVX1-NEXT: vmovups %ymm5, 160(%rdi) ; AVX1-NEXT: vmovups %ymm4, 128(%rdi) -; AVX1-NEXT: vmovups %ymm6, 96(%rdi) +; AVX1-NEXT: vmovups %ymm3, 96(%rdi) ; AVX1-NEXT: vmovups %ymm1, 64(%rdi) -; AVX1-NEXT: vmovups %ymm2, 32(%rdi) -; AVX1-NEXT: vmovups %ymm0, (%rdi) +; AVX1-NEXT: vmovups %ymm0, 32(%rdi) +; AVX1-NEXT: vmovups %ymm2, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1463,38 +1465,38 @@ ; AVX2: # %bb.0: ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20,21] -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] -; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = 
ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm3[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20] -; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm2[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm5, %ymm6 +; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm3[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] +; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm2[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26] +; AVX2-NEXT: vpalignr {{.*#+}} ymm8 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4],ymm6[21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm9[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm9[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm8[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4],ymm1[21,22,23,24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4],ymm0[21,22,23,24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20] +; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm5[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20] +; 
AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[5,6,7,8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4],ymm4[21,22,23,24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] ; AVX2-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3],ymm5[4,5,6,7] -; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] -; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm4, %ymm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm4[4,5,6,7] +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3] +; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm4 ; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3] +; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7] +; AVX2-NEXT: vpshufb %ymm7, %ymm3, %ymm3 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm5[2,3] ; AVX2-NEXT: vpshufb %ymm7, %ymm1, %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, 160(%rdi) -; AVX2-NEXT: vmovdqu %ymm4, 128(%rdi) +; AVX2-NEXT: vmovdqu %ymm3, 128(%rdi) ; AVX2-NEXT: vmovdqu %ymm0, 64(%rdi) -; AVX2-NEXT: vmovdqu %ymm5, 32(%rdi) -; AVX2-NEXT: vmovdqu %ymm2, 96(%rdi) +; AVX2-NEXT: vmovdqu %ymm2, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm4, 96(%rdi) ; AVX2-NEXT: vmovdqu %ymm6, (%rdi) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq