Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -41273,9 +41273,20 @@
     return RHS;
 
   // PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
+  // We need to simplify each operand using the User/OpIdx signature, which
+  // allows the operand node to have multiple uses. This is important since the
+  // PMULDQ/PMULUDQ IR pattern contains an 'and' or 'sign_extend_inreg' that we
+  // would like to bypass even if it is used by multiple PMULDQ/PMULUDQs.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnesValue(64), DCI))
-    return SDValue(N, 0);
+  APInt DemandedMask = APInt::getLowBitsSet(64, 32);
+  for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx) {
+    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                          !DCI.isBeforeLegalizeOps());
+    if (TLI.SimplifyDemandedBits(N, OpIdx, DemandedMask, DCI, TLO)) {
+      DCI.AddToWorklist(N);
+      return SDValue(N, 0);
+    }
+  }
 
   return SDValue();
 }
Index: test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -1823,12 +1823,6 @@
 define <4 x i64> @test_mm256_mul_epi32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_mul_epi32:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpsllq $32, %ymm0, %ymm2
-; CHECK-NEXT: vpsrad $31, %ymm2, %ymm2
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; CHECK-NEXT: vpsllq $32, %ymm1, %ymm2
-; CHECK-NEXT: vpsrad $31, %ymm2, %ymm2
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; CHECK-NEXT: vpmuldq %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: ret{{[l|q]}}
   %A = shl <4 x i64> %a0,
@@ -1843,9 +1837,6 @@
 define <4 x i64> @test_mm256_mul_epu32(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_mul_epu32:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7]
-; CHECK-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2],ymm2[3],ymm1[4],ymm2[5],ymm1[6],ymm2[7]
 ; CHECK-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: ret{{[l|q]}}
   %A = and <4 x i64> %a0,
Index: test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1651,10 +1651,6 @@
 define <8 x i64> @test_mm512_mul_epi32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_mul_epi32:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpsllq $32, %zmm0, %zmm0
-; CHECK-NEXT: vpsraq $32, %zmm0, %zmm0
-; CHECK-NEXT: vpsllq $32, %zmm1, %zmm1
-; CHECK-NEXT: vpsraq $32, %zmm1, %zmm1
 ; CHECK-NEXT: vpmuldq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: ret{{[l|q]}}
   %tmp = shl <8 x i64> %__A,
@@ -1718,11 +1714,6 @@
 define <8 x i64> @test_mm512_mul_epu32(<8 x i64> %__A, <8 x i64> %__B) nounwind {
 ; CHECK-LABEL: test_mm512_mul_epu32:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA
-; CHECK-NEXT: kmovw %eax, %k0
-; CHECK-NEXT: knotw %k0, %k1
-; CHECK-NEXT: vmovdqa32 %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT: vmovdqa32 %zmm1, %zmm1 {%k1} {z}
 ; CHECK-NEXT: vpmuludq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: ret{{[l|q]}}
   %tmp = and <8 x i64> %__A,
@@ -6482,14 +6473,7
@@ ; X86-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-NEXT: vpsrlq $32, %xmm0, %xmm2 -; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; X86-NEXT: vpsrlq $32, %xmm1, %xmm3 -; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; X86-NEXT: vpsllq $32, %xmm2, %xmm2 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 @@ -6516,14 +6500,7 @@ ; X64-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-NEXT: vpsrlq $32, %xmm0, %xmm2 -; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; X64-NEXT: vpsrlq $32, %xmm1, %xmm3 -; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; X64-NEXT: vpsllq $32, %xmm2, %xmm2 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 @@ -6687,14 +6664,7 @@ ; X86-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 ; X86-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; X86-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-NEXT: vpsrlq $32, %xmm0, %xmm2 -; X86-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; X86-NEXT: vpsrlq $32, %xmm1, %xmm3 -; X86-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; X86-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; X86-NEXT: vpsllq $32, %xmm2, %xmm2 ; X86-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; X86-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X86-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X86-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X86-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 @@ -6724,14 +6694,7 @@ ; X64-NEXT: vpmuludq %ymm0, %ymm1, %ymm0 ; X64-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; X64-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X64-NEXT: vpsrlq $32, %xmm0, %xmm2 -; X64-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; X64-NEXT: vpsrlq $32, %xmm1, %xmm3 -; X64-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; X64-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; X64-NEXT: vpsllq $32, %xmm2, %xmm2 ; X64-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; X64-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; X64-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; X64-NEXT: vpsrldq {{.*#+}} xmm2 = xmm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; X64-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 Index: test/CodeGen/X86/pmul.ll =================================================================== --- test/CodeGen/X86/pmul.ll +++ test/CodeGen/X86/pmul.ll @@ -1251,75 +1251,54 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE2-LABEL: mul_v8i64_sext: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm11 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; SSE2-NEXT: psrad $16, %xmm0 -; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] -; 
SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm4, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm0, %xmm6 -; SSE2-NEXT: paddq %xmm3, %xmm6 -; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm5[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pmuludq %xmm4, %xmm0 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm4 -; SSE2-NEXT: psrad $16, %xmm3 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; SSE2-NEXT: psllq $32, %xmm6 -; SSE2-NEXT: paddq %xmm6, %xmm0 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm1, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm3, %xmm6 -; SSE2-NEXT: paddq %xmm4, %xmm6 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pcmpgtd %xmm7, %xmm4 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pcmpgtd %xmm7, %xmm2 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE2-NEXT: pmuludq %xmm3, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; SSE2-NEXT: psllq $32, %xmm6 -; SSE2-NEXT: paddq %xmm6, %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm2, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm7, %xmm6 -; SSE2-NEXT: paddq %xmm4, %xmm6 -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm5[2,3,0,1] -; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7] -; SSE2-NEXT: pmuludq %xmm7, %xmm2 -; SSE2-NEXT: pxor %xmm5, %xmm5 -; SSE2-NEXT: pcmpgtd %xmm4, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pcmpgtd %xmm4, %xmm0 ; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; SSE2-NEXT: psllq $32, %xmm6 -; SSE2-NEXT: paddq %xmm6, %xmm2 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pcmpgtd %xmm3, %xmm6 -; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1] -; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1] -; SSE2-NEXT: pmuludq %xmm3, %xmm5 -; SSE2-NEXT: pmuludq %xmm4, %xmm6 -; SSE2-NEXT: paddq %xmm5, %xmm6 -; SSE2-NEXT: pmuludq %xmm4, %xmm3 -; SSE2-NEXT: psllq $32, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pcmpgtd %xmm6, %xmm3 +; SSE2-NEXT: psrad $16, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[0,2,2,3,4,5,6,7] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpgtd %xmm5, %xmm1 +; SSE2-NEXT: psrad $16, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE2-NEXT: pshufd 
{{.*#+}} xmm9 = xmm11[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,1,1,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm12 = xmm10[2,1,3,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,1,3] +; SSE2-NEXT: pmuludq %xmm10, %xmm4 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] +; SSE2-NEXT: pmuludq %xmm10, %xmm0 +; SSE2-NEXT: psllq $32, %xmm0 +; SSE2-NEXT: paddq %xmm4, %xmm0 +; SSE2-NEXT: pmuludq %xmm12, %xmm5 +; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] +; SSE2-NEXT: pmuludq %xmm12, %xmm1 +; SSE2-NEXT: psllq $32, %xmm1 +; SSE2-NEXT: paddq %xmm5, %xmm1 +; SSE2-NEXT: pmuludq %xmm11, %xmm7 +; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm8[0],xmm2[1],xmm8[1] +; SSE2-NEXT: pmuludq %xmm11, %xmm2 +; SSE2-NEXT: psllq $32, %xmm2 +; SSE2-NEXT: paddq %xmm7, %xmm2 +; SSE2-NEXT: pmuludq %xmm9, %xmm6 +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1] +; SSE2-NEXT: pmuludq %xmm9, %xmm3 +; SSE2-NEXT: psllq $32, %xmm3 ; SSE2-NEXT: paddq %xmm6, %xmm3 ; SSE2-NEXT: retq ; Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -2758,23 +2758,13 @@ declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone define <2 x i64> @test_mm_mul_epu32(<2 x i64> %a0, <2 x i64> %a1) nounwind { -; X86-SSE-LABEL: test_mm_mul_epu32: -; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; X86-SSE-NEXT: # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] -; X86-SSE-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}, kind: FK_Data_4 -; X86-SSE-NEXT: pand %xmm2, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc2] -; X86-SSE-NEXT: pand %xmm2, %xmm1 # encoding: [0x66,0x0f,0xdb,0xca] -; X86-SSE-NEXT: pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1] -; X86-SSE-NEXT: retl # encoding: [0xc3] +; SSE-LABEL: test_mm_mul_epu32: +; SSE: # %bb.0: +; SSE-NEXT: pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1] +; SSE-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; ; AVX1-LABEL: test_mm_mul_epu32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 # encoding: [0xc5,0xe9,0xef,0xd2] -; AVX1-NEXT: vpblendw $204, %xmm2, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x0e,0xc2,0xcc] -; AVX1-NEXT: # xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpblendw $204, %xmm2, %xmm1, %xmm1 # encoding: [0xc4,0xe3,0x71,0x0e,0xca,0xcc] -; AVX1-NEXT: # xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0xf4,0xc1] ; AVX1-NEXT: ret{{[l|q]}} # encoding: [0xc3] ; @@ -2787,16 +2777,6 @@ ; AVX512-NEXT: # xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3] ; AVX512-NEXT: vpmullq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] -; -; X64-SSE-LABEL: test_mm_mul_epu32: -; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4294967295,0,4294967295,0] -; X64-SSE-NEXT: # encoding: [0x66,0x0f,0x6f,0x15,A,A,A,A] -; X64-SSE-NEXT: # fixup A - offset: 4, value: {{\.LCPI.*}}-4, kind: reloc_riprel_4byte -; X64-SSE-NEXT: pand %xmm2, %xmm0 # encoding: [0x66,0x0f,0xdb,0xc2] -; X64-SSE-NEXT: pand %xmm2, %xmm1 # encoding: [0x66,0x0f,0xdb,0xca] -; X64-SSE-NEXT: pmuludq %xmm1, %xmm0 # encoding: [0x66,0x0f,0xf4,0xc1] -; X64-SSE-NEXT: retq # encoding: [0xc3] %A = and <2 x i64> %a0, %B = and <2 x i64> %a1, %res = mul nuw <2 x i64> %A, %B Index: test/CodeGen/X86/sse41-intrinsics-fast-isel.ll 
=================================================================== --- test/CodeGen/X86/sse41-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse41-intrinsics-fast-isel.ll @@ -832,26 +832,11 @@ define <2 x i64> @test_mm_mul_epi32(<2 x i64> %a0, <2 x i64> %a1) { ; SSE-LABEL: test_mm_mul_epi32: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psllq $32, %xmm2 -; SSE-NEXT: psrad $31, %xmm2 -; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: psllq $32, %xmm0 -; SSE-NEXT: psrad $31, %xmm0 -; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] -; SSE-NEXT: pmuldq %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pmuldq %xmm1, %xmm0 ; SSE-NEXT: ret{{[l|q]}} ; ; AVX1-LABEL: test_mm_mul_epi32: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsllq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2 -; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] ; AVX1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: ret{{[l|q]}} ; Index: test/CodeGen/X86/vector-mul.ll =================================================================== --- test/CodeGen/X86/vector-mul.ll +++ test/CodeGen/X86/vector-mul.ll @@ -456,7 +456,7 @@ define <2 x i64> @mul_v2i64_17_65(<2 x i64> %a0) nounwind { ; X86-LABEL: mul_v2i64_17_65: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm1 = [17,0,65,0] +; X86-NEXT: movdqa {{.*#+}} xmm1 = <17,u,65,u> ; X86-NEXT: movdqa %xmm0, %xmm2 ; X86-NEXT: pmuludq %xmm1, %xmm2 ; X86-NEXT: psrlq $32, %xmm0 @@ -797,7 +797,7 @@ define <2 x i64> @mul_v2i64_15_63(<2 x i64> %a0) nounwind { ; X86-LABEL: mul_v2i64_15_63: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm1 = [15,0,63,0] +; X86-NEXT: movdqa {{.*#+}} xmm1 = <15,u,63,u> ; X86-NEXT: movdqa %xmm0, %xmm2 ; X86-NEXT: pmuludq %xmm1, %xmm2 ; X86-NEXT: psrlq $32, %xmm0 @@ -837,7 +837,7 @@ ; X86-NEXT: pmuludq %xmm0, %xmm1 ; X86-NEXT: movdqa %xmm0, %xmm2 ; X86-NEXT: psrlq $32, %xmm2 -; X86-NEXT: movdqa {{.*#+}} xmm3 = [4294967281,4294967295,4294967233,4294967295] +; X86-NEXT: movdqa {{.*#+}} xmm3 = <4294967281,u,4294967233,u> ; X86-NEXT: pmuludq %xmm3, %xmm2 ; X86-NEXT: paddq %xmm1, %xmm2 ; X86-NEXT: psllq $32, %xmm2 @@ -881,7 +881,7 @@ ; X86-NEXT: pmuludq %xmm0, %xmm1 ; X86-NEXT: movdqa %xmm0, %xmm2 ; X86-NEXT: psrlq $32, %xmm2 -; X86-NEXT: movdqa {{.*#+}} xmm3 = [4294967279,4294967295,4294967231,4294967295] +; X86-NEXT: movdqa {{.*#+}} xmm3 = <4294967279,u,4294967231,u> ; X86-NEXT: pmuludq %xmm3, %xmm2 ; X86-NEXT: paddq %xmm1, %xmm2 ; X86-NEXT: psllq $32, %xmm2 @@ -921,7 +921,7 @@ define <2 x i64> @mul_v2i64_0_1(<2 x i64> %a0) nounwind { ; X86-LABEL: mul_v2i64_0_1: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm1 = [0,0,1,0] +; X86-NEXT: movdqa {{.*#+}} xmm1 = <0,u,1,u> ; X86-NEXT: movdqa %xmm0, %xmm2 ; X86-NEXT: pmuludq %xmm1, %xmm2 ; X86-NEXT: psrlq $32, %xmm0 @@ -963,7 +963,7 @@ ; X86: # %bb.0: ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psrlq $32, %xmm1 -; X86-NEXT: movdqa {{.*#+}} xmm2 = [0,0,4294967295,4294967295] +; X86-NEXT: movdqa {{.*#+}} xmm2 = <0,u,4294967295,u> ; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: movdqa %xmm2, %xmm3 ; X86-NEXT: psrlq $32, %xmm3 @@ -1017,7 +1017,7 @@ ; X86: # %bb.0: ; X86-NEXT: movdqa %xmm0, %xmm1 ; X86-NEXT: psrlq $32, %xmm1 -; X86-NEXT: movdqa {{.*#+}} xmm2 = [15,0,4294967233,4294967295] +; X86-NEXT: movdqa {{.*#+}} xmm2 = 
<15,u,4294967233,u> ; X86-NEXT: pmuludq %xmm2, %xmm1 ; X86-NEXT: movdqa %xmm2, %xmm3 ; X86-NEXT: psrlq $32, %xmm3 @@ -1155,7 +1155,7 @@ define <2 x i64> @mul_v2i64_68_132(<2 x i64> %x) nounwind { ; X86-LABEL: mul_v2i64_68_132: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm1 = [68,0,132,0] +; X86-NEXT: movdqa {{.*#+}} xmm1 = <68,u,132,u> ; X86-NEXT: movdqa %xmm0, %xmm2 ; X86-NEXT: pmuludq %xmm1, %xmm2 ; X86-NEXT: psrlq $32, %xmm0 @@ -1191,7 +1191,7 @@ define <2 x i64> @mul_v2i64_60_120(<2 x i64> %x) nounwind { ; X86-LABEL: mul_v2i64_60_120: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm1 = [60,0,124,0] +; X86-NEXT: movdqa {{.*#+}} xmm1 = <60,u,124,u> ; X86-NEXT: movdqa %xmm0, %xmm2 ; X86-NEXT: pmuludq %xmm1, %xmm2 ; X86-NEXT: psrlq $32, %xmm0 Index: test/CodeGen/X86/vector-reduce-mul-widen.ll =================================================================== --- test/CodeGen/X86/vector-reduce-mul-widen.ll +++ test/CodeGen/X86/vector-reduce-mul-widen.ll @@ -93,16 +93,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE-LABEL: test_v4i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 @@ -120,14 +111,7 @@ ; AVX1-LABEL: test_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 @@ -144,14 +128,7 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 @@ -168,14 +145,7 @@ ; AVX512BW-LABEL: test_v4i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmuludq %ymm2, 
%ymm0, %ymm2 @@ -192,14 +162,7 @@ ; AVX512BWVL-LABEL: test_v4i64: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2 ; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 @@ -240,36 +203,9 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE-LABEL: test_v8i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm1, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 ; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 @@ -288,30 +224,9 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1 -; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm3 -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 @@ -327,23 +242,9 @@ ; ; AVX2-LABEL: test_v8i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; 
AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 @@ -360,23 +261,9 @@ ; AVX512BW-LABEL: test_v8i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 @@ -393,23 +280,9 @@ ; AVX512BWVL-LABEL: test_v8i64: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 @@ -453,152 +326,40 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE-LABEL: test_v16i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm2, %xmm9 -; SSE-NEXT: paddq %xmm8, %xmm9 -; SSE-NEXT: psllq $32, %xmm9 ; SSE-NEXT: pmuludq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm0, %xmm6 -; SSE-NEXT: paddq %xmm8, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 ; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm6 -; 
SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm3, %xmm6 -; SSE-NEXT: paddq %xmm4, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm2, %xmm0 ; SSE-NEXT: pmuludq %xmm7, %xmm3 -; SSE-NEXT: paddq %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm1, %xmm6 -; SSE-NEXT: paddq %xmm4, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 ; SSE-NEXT: pmuludq %xmm5, %xmm1 -; SSE-NEXT: paddq %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm1, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 ; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pmuludq %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pmuludq %xmm0, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6 -; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm6 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm7 -; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm7 -; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: 
vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm1 -; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm2 -; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmuludq %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 @@ -614,39 +375,11 @@ ; ; AVX2-LABEL: test_v16i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5 -; AVX2-NEXT: vpmuludq %ymm5, %ymm1, %ymm5 -; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 ; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4 -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 @@ -662,32 +395,11 @@ ; ; AVX512BW-LABEL: test_v16i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, 
%zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 @@ -703,32 +415,11 @@ ; ; AVX512BWVL-LABEL: test_v16i64: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 Index: test/CodeGen/X86/vector-reduce-mul.ll =================================================================== --- test/CodeGen/X86/vector-reduce-mul.ll +++ test/CodeGen/X86/vector-reduce-mul.ll @@ -93,16 +93,7 @@ define i64 @test_v4i64(<4 x i64> %a0) { ; SSE-LABEL: test_v4i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 @@ -120,14 +111,7 @@ ; AVX1-LABEL: test_v4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 
-; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 @@ -144,14 +128,7 @@ ; AVX2-LABEL: test_v4i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 @@ -168,14 +145,7 @@ ; AVX512BW-LABEL: test_v4i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BW-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512BW-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512BW-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512BW-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX512BW-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 @@ -192,14 +162,7 @@ ; AVX512BWVL-LABEL: test_v4i64: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX512BWVL-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX512BWVL-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX512BWVL-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX512BWVL-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BWVL-NEXT: vpsrlq $32, %ymm0, %ymm2 ; AVX512BWVL-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 @@ -240,36 +203,9 @@ define i64 @test_v8i64(<8 x i64> %a0) { ; SSE-LABEL: test_v8i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm1, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 ; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 ; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 ; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; 
SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 @@ -288,30 +224,9 @@ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm4 -; AVX1-NEXT: vpmuludq %xmm2, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm3, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 ; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpmuludq %xmm4, %xmm0, %xmm4 -; AVX1-NEXT: vpaddq %xmm3, %xmm4, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm1 -; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm3 -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm1, %xmm3, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 @@ -327,23 +242,9 @@ ; ; AVX2-LABEL: test_v8i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 @@ -360,23 +261,9 @@ ; AVX512BW-LABEL: test_v8i64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 @@ -393,23 +280,9 @@ ; AVX512BWVL-LABEL: test_v8i64: ; AVX512BWVL: # %bb.0: ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; 
AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 @@ -453,152 +326,40 @@ define i64 @test_v16i64(<16 x i64> %a0) { ; SSE-LABEL: test_v16i64: ; SSE: # %bb.0: -; SSE-NEXT: movdqa %xmm2, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm6, %xmm9 -; SSE-NEXT: psrlq $32, %xmm9 -; SSE-NEXT: pmuludq %xmm2, %xmm9 -; SSE-NEXT: paddq %xmm8, %xmm9 -; SSE-NEXT: psllq $32, %xmm9 ; SSE-NEXT: pmuludq %xmm6, %xmm2 -; SSE-NEXT: paddq %xmm9, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: psrlq $32, %xmm8 -; SSE-NEXT: pmuludq %xmm4, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm0, %xmm6 -; SSE-NEXT: paddq %xmm8, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 ; SSE-NEXT: pmuludq %xmm4, %xmm0 -; SSE-NEXT: paddq %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm7, %xmm6 -; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm3, %xmm6 -; SSE-NEXT: paddq %xmm4, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 +; SSE-NEXT: pmuludq %xmm2, %xmm0 ; SSE-NEXT: pmuludq %xmm7, %xmm3 -; SSE-NEXT: paddq %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm5, %xmm4 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: psrlq $32, %xmm6 -; SSE-NEXT: pmuludq %xmm1, %xmm6 -; SSE-NEXT: paddq %xmm4, %xmm6 -; SSE-NEXT: psllq $32, %xmm6 ; SSE-NEXT: pmuludq %xmm5, %xmm1 -; SSE-NEXT: paddq %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: psrlq $32, %xmm5 -; SSE-NEXT: pmuludq %xmm1, %xmm5 -; SSE-NEXT: paddq %xmm4, %xmm5 -; SSE-NEXT: psllq $32, %xmm5 ; SSE-NEXT: pmuludq %xmm3, %xmm1 -; SSE-NEXT: paddq %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: psrlq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm0, %xmm4 -; SSE-NEXT: paddq %xmm3, %xmm4 -; SSE-NEXT: psllq $32, %xmm4 -; SSE-NEXT: pmuludq %xmm2, %xmm0 -; SSE-NEXT: paddq %xmm4, %xmm0 -; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pmuludq %xmm1, %xmm2 +; SSE-NEXT: pmuludq %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: psrlq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm0, %xmm3 -; SSE-NEXT: paddq %xmm2, %xmm3 -; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; 
SSE-NEXT: pmuludq %xmm1, %xmm2 -; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: psrldq {{.*#+}} xmm3 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pmuludq %xmm0, %xmm3 +; SSE-NEXT: pmuludq %xmm1, %xmm3 ; SSE-NEXT: paddq %xmm2, %xmm3 ; SSE-NEXT: psllq $32, %xmm3 -; SSE-NEXT: pmuludq %xmm1, %xmm0 -; SSE-NEXT: paddq %xmm3, %xmm0 -; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: pmuludq %xmm0, %xmm1 +; SSE-NEXT: paddq %xmm3, %xmm1 +; SSE-NEXT: movq %xmm1, %rax ; SSE-NEXT: retq ; ; AVX1-LABEL: test_v16i64: ; AVX1: # %bb.0: -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 -; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 -; AVX1-NEXT: vpmuludq %xmm5, %xmm1, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsllq $32, %xmm4, %xmm4 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm5 -; AVX1-NEXT: vpaddq %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm5 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsllq $32, %xmm5, %xmm5 -; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm4 +; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm5 +; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm6 -; AVX1-NEXT: vpmuludq %xmm3, %xmm6, %xmm6 -; AVX1-NEXT: vpsrlq $32, %xmm3, %xmm7 -; AVX1-NEXT: vpmuludq %xmm7, %xmm1, %xmm7 -; AVX1-NEXT: vpaddq %xmm6, %xmm7, %xmm6 -; AVX1-NEXT: vpsllq $32, %xmm6, %xmm6 ; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpaddq %xmm6, %xmm1, %xmm1 ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm2, %xmm3, %xmm3 -; AVX1-NEXT: vpsrlq $32, %xmm2, %xmm6 -; AVX1-NEXT: vpmuludq %xmm6, %xmm0, %xmm6 -; AVX1-NEXT: vpaddq %xmm3, %xmm6, %xmm3 -; AVX1-NEXT: vpsllq $32, %xmm3, %xmm3 ; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 -; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm3 -; AVX1-NEXT: vpmuludq %xmm3, %xmm0, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpsrlq $32, %xmm5, %xmm1 -; AVX1-NEXT: vpmuludq %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm4, %xmm2 -; AVX1-NEXT: vpmuludq %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsllq $32, %xmm1, %xmm1 -; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm2 -; AVX1-NEXT: vpaddq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsrlq $32, %xmm1, %xmm2 -; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm3 -; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm3 -; AVX1-NEXT: vpaddq %xmm2, %xmm3, %xmm2 -; AVX1-NEXT: vpsllq $32, %xmm2, %xmm2 -; AVX1-NEXT: vpmuludq %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpmuludq %xmm0, %xmm4, %xmm0 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX1-NEXT: vpsrlq $32, %xmm0, %xmm2 ; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm2 @@ -614,39 +375,11 @@ ; ; AVX2-LABEL: test_v16i64: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm4 -; AVX2-NEXT: vpmuludq %ymm3, %ymm4, %ymm4 -; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm5 -; AVX2-NEXT: vpmuludq %ymm5, %ymm1, 
%ymm5 -; AVX2-NEXT: vpaddq %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vpsllq $32, %ymm4, %ymm4 ; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vpaddq %ymm4, %ymm1, %ymm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm3 -; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm3 -; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4 -; AVX2-NEXT: vpmuludq %ymm4, %ymm0, %ymm4 -; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3 -; AVX2-NEXT: vpsllq $32, %ymm3, %ymm3 ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsrlq $32, %ymm0, %ymm2 -; AVX2-NEXT: vpmuludq %ymm1, %ymm2, %ymm2 -; AVX2-NEXT: vpsrlq $32, %ymm1, %ymm3 -; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm3 -; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 -; AVX2-NEXT: vpsllq $32, %ymm2, %ymm2 ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX2-NEXT: vpsrldq {{.*#+}} ymm2 = ymm0[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[28,29,30,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm2 @@ -662,32 +395,11 @@ ; ; AVX512BW-LABEL: test_v16i64: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BW-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BW-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BW-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2 ; AVX512BW-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 @@ -703,32 +415,11 @@ ; ; AVX512BWVL-LABEL: test_v16i64: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, 
%zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 -; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2 -; AVX512BWVL-NEXT: vpsrlq $32, %zmm1, %zmm3 -; AVX512BWVL-NEXT: vpmuludq %zmm3, %zmm0, %zmm3 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm3, %zmm2 -; AVX512BWVL-NEXT: vpsllq $32, %zmm2, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm0, %zmm0 -; AVX512BWVL-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; AVX512BWVL-NEXT: vpsrlq $32, %zmm0, %zmm2 ; AVX512BWVL-NEXT: vpmuludq %zmm1, %zmm2, %zmm2