Index: llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1460,6 +1460,23 @@
                                      TLO, Depth + 1))
         return true;
 
+      // Try calling SimplifyDemandedBits, converting demanded elts to the bits
+      // of the large element.
+      // TODO - bigendian once we have test coverage.
+      if (TLO.DAG.getDataLayout().isLittleEndian()) {
+        unsigned SrcEltSizeInBits = SrcVT.getScalarSizeInBits();
+        APInt SrcDemandedBits = APInt::getNullValue(SrcEltSizeInBits);
+        for (unsigned i = 0; i != NumElts; ++i)
+          if (DemandedElts[i]) {
+            unsigned Ofs = (i % Scale) * EltSizeInBits;
+            SrcDemandedBits.setBits(Ofs, Ofs + EltSizeInBits);
+          }
+
+        KnownBits Known;
+        if (SimplifyDemandedBits(Src, SrcDemandedBits, Known, TLO, Depth + 1))
+          return true;
+      }
+
       // If the src element is zero/undef then all the output elements will be -
       // only demanded elements are guaranteed to be correct.
       for (unsigned i = 0; i != NumSrcElts; ++i) {
Index: llvm/trunk/test/CodeGen/X86/known-bits-vector.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/known-bits-vector.ll
+++ llvm/trunk/test/CodeGen/X86/known-bits-vector.ll
@@ -158,20 +158,12 @@
 define <4 x float> @knownbits_mask_or_shuffle_uitofp(<4 x i32> %a0) nounwind {
 ; X32-LABEL: knownbits_mask_or_shuffle_uitofp:
 ; X32: # %bb.0:
-; X32-NEXT: vpor {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X32-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; X32-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X32-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X32-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: knownbits_mask_or_shuffle_uitofp:
 ; X64: # %bb.0:
-; X64-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; X64-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
-; X64-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-NEXT: vcvtdq2ps %xmm0, %xmm0
+; X64-NEXT: vmovaps {{.*#+}} xmm0 = [6.5535E+4,6.5535E+4,6.5535E+4,6.5535E+4]
 ; X64-NEXT: retq
   %1 = and <4 x i32> %a0,
   %2 = or <4 x i32> %1,
Index: llvm/trunk/test/CodeGen/X86/known-signbits-vector.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/known-signbits-vector.ll
+++ llvm/trunk/test/CodeGen/X86/known-signbits-vector.ll
@@ -230,6 +230,7 @@
   ret <4 x double> %3
 }
 
+; TODO: Fix vpshufd+vpsrlq -> vpshufd/vpermilps
 define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind {
 ; X32-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
 ; X32: # %bb.0:
@@ -239,7 +240,8 @@
 ;
 ; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp:
 ; X64: # %bb.0:
-; X64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; X64-NEXT: vpsrlq $32, %xmm0, %xmm0
+; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; X64-NEXT: vcvtdq2pd %xmm0, %xmm0
 ; X64-NEXT: retq
   %1 = ashr <2 x i64> %a0,
@@ -255,20 +257,13 @@
 ; X32-LABEL: signbits_ashr_sext_sextinreg_and_extract_sitofp:
 ; X32: # %bb.0:
 ; X32-NEXT: pushl %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT: vpsrlq $60, %xmm0, %xmm2
 ; X32-NEXT: vpsrlq $61, %xmm0, %xmm0
 ; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
 ; X32-NEXT: vmovdqa {{.*#+}} xmm2 = [4,0,0,0,8,0,0,0]
 ; X32-NEXT: vpxor %xmm2,
%xmm0, %xmm0 ; X32-NEXT: vpsubq %xmm2, %xmm0, %xmm0 -; X32-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1 -; X32-NEXT: sarl $31, %eax -; X32-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; X32-NEXT: vpsllq $20, %xmm1, %xmm1 -; X32-NEXT: vpsrad $20, %xmm1, %xmm2 -; X32-NEXT: vpsrlq $20, %xmm1, %xmm1 -; X32-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; X32-NEXT: vpinsrd $0, {{[0-9]+}}(%esp), %xmm1, %xmm1 ; X32-NEXT: vpand %xmm1, %xmm0, %xmm0 ; X32-NEXT: vmovd %xmm0, %eax ; X32-NEXT: vcvtsi2ssl %eax, %xmm3, %xmm0 Index: llvm/trunk/test/CodeGen/X86/reduce-trunc-shl.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/reduce-trunc-shl.ll +++ llvm/trunk/test/CodeGen/X86/reduce-trunc-shl.ll @@ -72,11 +72,7 @@ ; ; AVX2-LABEL: trunc_shl_17_v8i16_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpslld $17, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] -; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 -; AVX2-NEXT: vzeroupper +; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: retq %shl = shl <8 x i32> %a, %conv = trunc <8 x i32> %shl to <8 x i16> Index: llvm/trunk/test/CodeGen/X86/shrink_vmul.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/shrink_vmul.ll +++ llvm/trunk/test/CodeGen/X86/shrink_vmul.ll @@ -1154,10 +1154,7 @@ ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; X86-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X86-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X86-SSE-NEXT: psllq $32, %xmm2 -; X86-SSE-NEXT: paddq %xmm1, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X86-SSE-NEXT: movq %xmm0, (%esi,%ecx,4) ; X86-SSE-NEXT: popl %esi ; X86-SSE-NEXT: retl @@ -1191,10 +1188,7 @@ ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] ; X64-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 -; X64-SSE-NEXT: pmuludq %xmm0, %xmm2 -; X64-SSE-NEXT: psllq $32, %xmm2 -; X64-SSE-NEXT: paddq %xmm1, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rdx,4) ; X64-SSE-NEXT: retq ; @@ -1952,15 +1946,7 @@ ; X86-SSE-NEXT: pxor %xmm1, %xmm1 ; X86-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = <0,u,65536,u> -; X86-SSE-NEXT: pmuludq %xmm2, %xmm1 -; X86-SSE-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE-NEXT: psrlq $32, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm0, %xmm3 -; X86-SSE-NEXT: paddq %xmm1, %xmm3 -; X86-SSE-NEXT: psllq $32, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm2, %xmm0 -; X86-SSE-NEXT: paddq %xmm3, %xmm0 +; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl @@ -1986,13 +1972,10 @@ ; X64-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] ; X64-SSE-NEXT: movl $65536, %ecx # imm = 0x10000 -; X64-SSE-NEXT: movq %rcx, %xmm2 -; X64-SSE-NEXT: pslldq {{.*#+}} xmm2 = 
zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; X64-SSE-NEXT: pmuludq %xmm2, %xmm0 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psllq $32, %xmm2 -; X64-SSE-NEXT: paddq %xmm0, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; X64-SSE-NEXT: movq %rcx, %xmm1 +; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; @@ -2037,16 +2020,7 @@ ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7] ; X86-SSE-NEXT: psrad $16, %xmm0 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3] -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,u,32768,u> -; X86-SSE-NEXT: pxor %xmm2, %xmm2 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X86-SSE-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE-NEXT: psrlq $32, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm0, %xmm3 -; X86-SSE-NEXT: paddq %xmm2, %xmm3 -; X86-SSE-NEXT: psllq $32, %xmm3 -; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X86-SSE-NEXT: paddq %xmm3, %xmm0 +; X86-SSE-NEXT: pmuludq {{\.LCPI.*}}, %xmm0 ; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE-NEXT: movq %xmm0, (%edx,%eax,4) ; X86-SSE-NEXT: retl @@ -2072,12 +2046,8 @@ ; X64-SSE-NEXT: movl $32768, %ecx # imm = 0x8000 ; X64-SSE-NEXT: movq %rcx, %xmm1 ; X64-SSE-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7] -; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 -; X64-SSE-NEXT: pxor %xmm2, %xmm2 -; X64-SSE-NEXT: pmuludq %xmm1, %xmm2 -; X64-SSE-NEXT: psllq $32, %xmm2 -; X64-SSE-NEXT: paddq %xmm0, %xmm2 -; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] +; X64-SSE-NEXT: pmuludq %xmm0, %xmm1 +; X64-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] ; X64-SSE-NEXT: movq %xmm0, (%rax,%rsi,4) ; X64-SSE-NEXT: retq ; Index: llvm/trunk/test/CodeGen/X86/vector-trunc-math-widen.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-trunc-math-widen.ll +++ llvm/trunk/test/CodeGen/X86/vector-trunc-math-widen.ll @@ -1823,26 +1823,8 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; SSE-LABEL: trunc_mul_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm1, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 ; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: retq ; @@ -2089,94 +2071,14 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { ; SSE-LABEL: trunc_mul_v16i64_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm0, %xmm10 -; SSE-NEXT: paddq %xmm9, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm8, %xmm0 -; SSE-NEXT: paddq 
%xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm1, %xmm10 -; SSE-NEXT: paddq %xmm8, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm9, %xmm1 -; SSE-NEXT: paddq %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm2, %xmm10 -; SSE-NEXT: paddq %xmm9, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm8, %xmm2 -; SSE-NEXT: paddq %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm3, %xmm10 -; SSE-NEXT: paddq %xmm8, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm9, %xmm3 -; SSE-NEXT: paddq %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm4, %xmm10 -; SSE-NEXT: paddq %xmm9, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm8, %xmm4 -; SSE-NEXT: paddq %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm5, %xmm10 -; SSE-NEXT: paddq %xmm8, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm9, %xmm5 -; SSE-NEXT: paddq %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm6, %xmm10 -; SSE-NEXT: paddq %xmm9, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm8, %xmm6 -; SSE-NEXT: paddq %xmm10, %xmm6 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm7, %xmm10 -; SSE-NEXT: paddq %xmm8, %xmm10 -; SSE-NEXT: pmuludq %xmm9, %xmm7 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: paddq %xmm10, %xmm7 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -2601,22 +2503,11 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,3] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: pmuludq %xmm2, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: movl $1, %eax ; SSE-NEXT: movq %rax, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = 
zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: psrlq $32, %xmm0 ; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: retq ; @@ -2773,61 +2664,14 @@ ; SSE-NEXT: movl $1, %eax ; SSE-NEXT: movq %rax, %xmm8 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm0 ; SSE-NEXT: pmuludq %xmm8, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm9, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2,3] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: pmuludq %xmm8, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm9, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [4,5] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm8, %xmm2 -; SSE-NEXT: psllq $32, %xmm2 -; SSE-NEXT: paddq %xmm9, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [6,7] -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm8, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: paddq %xmm9, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [8,9] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm8, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: paddq %xmm9, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [10,11] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm8, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: paddq %xmm9, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [12,13] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm8, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 -; SSE-NEXT: paddq %xmm9, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [14,15] -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm7 -; SSE-NEXT: pmuludq %xmm8, %xmm7 -; SSE-NEXT: psllq $32, %xmm7 -; SSE-NEXT: paddq %xmm9, %xmm7 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm5 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm6 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -5538,17 +5382,10 @@ ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; SSE-NEXT: pmuludq %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pmuludq %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pmuludq %xmm0, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: pmuludq %xmm4, %xmm2 -; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: pmuludq %xmm3, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 ; 
SSE-NEXT: retq @@ -5569,40 +5406,14 @@ define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { ; SSE-LABEL: mul_add_self_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: pmuludq %xmm1, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; SSE-NEXT: pmuludq %xmm0, %xmm5 -; SSE-NEXT: paddq %xmm7, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm0, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE-NEXT: pmuludq %xmm4, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; SSE-NEXT: pmuludq %xmm2, %xmm6 -; SSE-NEXT: paddq %xmm3, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm2, %xmm4 -; SSE-NEXT: paddq %xmm6, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] -; SSE-NEXT: paddd %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: mul_add_self_v4i64_v4i32: @@ -5624,18 +5435,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3] +; SSE-NEXT: pmuludq %xmm2, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pmuludq %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: pmuludq %xmm5, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: pmuludq %xmm4, %xmm2 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: paddq %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] -; SSE-NEXT: paddd %xmm5, %xmm0 +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2] +; SSE-NEXT: paddd %xmm4, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: mul_add_multiuse_v4i64_v4i32: Index: llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll +++ llvm/trunk/test/CodeGen/X86/vector-trunc-math.ll @@ -1823,26 +1823,8 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; SSE-LABEL: trunc_mul_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm1, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 ; SSE-NEXT: pmuludq %xmm3, 
%xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm4, %xmm0 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: retq ; @@ -2089,94 +2071,14 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind { ; SSE-LABEL: trunc_mul_v16i64_v16i8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm0, %xmm10 -; SSE-NEXT: paddq %xmm9, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm8, %xmm0 -; SSE-NEXT: paddq %xmm10, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm1, %xmm10 -; SSE-NEXT: paddq %xmm8, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm9, %xmm1 -; SSE-NEXT: paddq %xmm10, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm2, %xmm10 -; SSE-NEXT: paddq %xmm9, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm8, %xmm2 -; SSE-NEXT: paddq %xmm10, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm3, %xmm10 -; SSE-NEXT: paddq %xmm8, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm9, %xmm3 -; SSE-NEXT: paddq %xmm10, %xmm3 -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm4, %xmm10 -; SSE-NEXT: paddq %xmm9, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm8, %xmm4 -; SSE-NEXT: paddq %xmm10, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm5, %xmm10 -; SSE-NEXT: paddq %xmm8, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm9, %xmm5 -; SSE-NEXT: paddq %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: movdqa %xmm8, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm6, %xmm10 -; SSE-NEXT: paddq %xmm9, %xmm10 -; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm9 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm8, %xmm6 -; SSE-NEXT: paddq %xmm10, %xmm6 -; SSE-NEXT: movdqa %xmm7, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm9, %xmm10 -; SSE-NEXT: psrlq $32, %xmm10 -; SSE-NEXT: pmuludq %xmm7, %xmm10 -; SSE-NEXT: paddq %xmm8, %xmm10 -; SSE-NEXT: pmuludq %xmm9, %xmm7 -; SSE-NEXT: psllq $32, %xmm10 -; SSE-NEXT: 
paddq %xmm10, %xmm7 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm0 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm1 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm2 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm3 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm4 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm5 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm6 +; SSE-NEXT: pmuludq {{[0-9]+}}(%rsp), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -2601,22 +2503,11 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind { ; SSE-LABEL: trunc_mul_const_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [2,3] -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: pmuludq %xmm2, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm3, %xmm1 ; SSE-NEXT: movl $1, %eax ; SSE-NEXT: movq %rax, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: psrlq $32, %xmm0 ; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: retq ; @@ -2773,61 +2664,14 @@ ; SSE-NEXT: movl $1, %eax ; SSE-NEXT: movq %rax, %xmm8 ; SSE-NEXT: pslldq {{.*#+}} xmm8 = zero,zero,zero,zero,zero,zero,zero,zero,xmm8[0,1,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm0 ; SSE-NEXT: pmuludq %xmm8, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm9, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [2,3] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm1 -; SSE-NEXT: pmuludq %xmm8, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm9, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [4,5] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm8, %xmm2 -; SSE-NEXT: psllq $32, %xmm2 -; SSE-NEXT: paddq %xmm9, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [6,7] -; SSE-NEXT: movdqa %xmm3, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm8, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: paddq %xmm9, %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [8,9] -; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm8, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: paddq %xmm9, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [10,11] -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm8, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: paddq %xmm9, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [12,13] -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm8, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 -; SSE-NEXT: paddq %xmm9, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [14,15] -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pmuludq %xmm8, %xmm9 -; SSE-NEXT: psrlq $32, %xmm7 -; SSE-NEXT: pmuludq %xmm8, %xmm7 -; SSE-NEXT: psllq $32, %xmm7 -; SSE-NEXT: paddq %xmm9, %xmm7 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm1 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm2 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm3 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm4 +; 
SSE-NEXT: pmuludq {{.*}}(%rip), %xmm5 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm6 +; SSE-NEXT: pmuludq {{.*}}(%rip), %xmm7 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm8, %xmm7 ; SSE-NEXT: pand %xmm8, %xmm6 @@ -5538,17 +5382,10 @@ ; SSE: # %bb.0: ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; SSE-NEXT: pmuludq %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pmuludq %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: pmuludq %xmm0, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: pmuludq %xmm4, %xmm2 -; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: paddq %xmm2, %xmm0 +; SSE-NEXT: pmuludq %xmm3, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE-NEXT: paddd {{.*}}(%rip), %xmm0 ; SSE-NEXT: retq @@ -5569,40 +5406,14 @@ define <4 x i32> @mul_add_self_v4i64_v4i32(<4 x i32> %a0, <4 x i32> %a1) nounwind { ; SSE-LABEL: mul_add_self_v4i64_v4i32: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1] -; SSE-NEXT: pxor %xmm8, %xmm8 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: pcmpgtd %xmm2, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: pcmpgtd %xmm0, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE-NEXT: pxor %xmm6, %xmm6 -; SSE-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: pcmpgtd %xmm1, %xmm5 -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] -; SSE-NEXT: pmuludq %xmm1, %xmm7 -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; SSE-NEXT: pmuludq %xmm0, %xmm5 -; SSE-NEXT: paddq %xmm7, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm0, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE-NEXT: pmuludq %xmm4, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; SSE-NEXT: pmuludq %xmm2, %xmm6 -; SSE-NEXT: paddq %xmm3, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm2, %xmm4 -; SSE-NEXT: paddq %xmm6, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm4[0,2] -; SSE-NEXT: paddd %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,1,3] +; SSE-NEXT: pmuludq %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: mul_add_self_v4i64_v4i32: @@ -5624,18 +5435,11 @@ ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,1,3,3] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,1,1,3] +; SSE-NEXT: pmuludq %xmm2, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,3,3] -; SSE-NEXT: pmuludq %xmm1, %xmm3 -; SSE-NEXT: pxor %xmm5, %xmm5 -; SSE-NEXT: pmuludq %xmm5, %xmm1 -; SSE-NEXT: psllq $32, %xmm1 -; SSE-NEXT: paddq %xmm3, %xmm1 -; SSE-NEXT: pmuludq %xmm4, %xmm2 -; SSE-NEXT: pmuludq %xmm4, %xmm5 -; SSE-NEXT: 
psllq $32, %xmm5 -; SSE-NEXT: paddq %xmm2, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2] -; SSE-NEXT: paddd %xmm5, %xmm0 +; SSE-NEXT: pmuludq %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm1[0,2] +; SSE-NEXT: paddd %xmm4, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: mul_add_multiuse_v4i64_v4i32:
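
Illustration (not part of the committed patch): the TargetLowering.cpp hunk converts the demanded elements of a bitcast result into a demanded-bits mask for the wider source elements and hands that mask to SimplifyDemandedBits. The standalone C++ sketch below shows only that index math; it uses plain integers instead of llvm::APInt, and the example types (a <4 x i32> result viewed as a bitcast of a <2 x i64> source) and the demanded-element pattern are illustrative assumptions, not taken from the patch. The little-endian layout matches the patch's isLittleEndian() guard; the variable names (NumElts, Scale, EltSizeInBits, SrcDemandedBits, Ofs) mirror the patch.

// Standalone sketch of the demanded-elts -> demanded-bits mapping above.
#include <cstdint>
#include <cstdio>

int main() {
  const unsigned NumElts = 4;        // i32 result elements
  const unsigned NumSrcElts = 2;     // i64 source elements
  const unsigned EltSizeInBits = 32; // result element width
  const unsigned Scale = NumElts / NumSrcElts;

  // Suppose only result elements 0 and 2 are demanded, i.e. the low i32 of
  // each i64 source element (the pattern a truncate lowered via shuffle
  // produces). This demanded set is an assumption for the example.
  const bool DemandedElts[NumElts] = {true, false, true, false};

  // Build one source-element-wide mask shared by every source element,
  // equivalent to the patch's APInt::setBits(Ofs, Ofs + EltSizeInBits).
  uint64_t SrcDemandedBits = 0;
  const uint64_t EltMask = (~0ULL) >> (64 - EltSizeInBits);
  for (unsigned i = 0; i != NumElts; ++i)
    if (DemandedElts[i]) {
      unsigned Ofs = (i % Scale) * EltSizeInBits;
      SrcDemandedBits |= EltMask << Ofs;
    }

  // Prints 0x00000000ffffffff: only the low 32 bits of each i64 source
  // element are demanded.
  printf("SrcDemandedBits = 0x%016llx\n",
         (unsigned long long)SrcDemandedBits);
  return 0;
}

With only the low 32 bits of each 64-bit source element demanded, SimplifyDemandedBits can drop the high cross terms of a 64-bit multiply feeding the bitcast. That is what collapses the psrlq/pmuludq/psllq/paddq expansions in the trunc_mul and shrink_vmul tests above to a single pmuludq per vector, since just the low 32 bits of each 64-bit product reach the truncated result.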