diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -12927,7 +12927,12 @@ // If we can't broadcast from a register, check that the input is a load. if (!BroadcastFromReg && !isShuffleFoldableLoad(V)) return SDValue(); - } else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) { + } else if (ISD::isNormalLoad(V.getNode()) && + cast<LoadSDNode>(V)->isSimple()) { + // We do not check for one-use of the vector load because a broadcast load + // is expected to be a win for code size, register pressure, and possibly + // uops even if the original vector load is not eliminated. + // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); @@ -12936,8 +12941,7 @@ : Opcode; } - // If we are broadcasting a load that is only used by the shuffle - // then we can reduce the vector load to the broadcasted scalar load. + // Reduce the vector load and shuffle to a broadcasted scalar load. LoadSDNode *Ld = cast<LoadSDNode>(V); SDValue BaseAddr = Ld->getOperand(1); EVT SVT = BroadcastVT.getScalarType(); diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -377,66 +377,60 @@ ; ; AVX2-LABEL: avg_v48i8: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero +; AVX2-NEXT: vpbroadcastq 8(%rdi), %xmm0 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero -; AVX2-NEXT: vmovdqa (%rsi), %xmm6 -; AVX2-NEXT: vmovdqa 16(%rsi), %xmm7 -; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm5, %ymm3, %ymm3 -;
AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm5, %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm5, %ymm4, %ymm4 -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero,xmm7[4],zero,zero,zero,xmm7[5],zero,zero,zero,xmm7[6],zero,zero,zero,xmm7[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = xmm5[0],zero,zero,zero,xmm5[1],zero,zero,zero,xmm5[2],zero,zero,zero,xmm5[3],zero,zero,zero,xmm5[4],zero,zero,zero,xmm5[5],zero,zero,zero,xmm5[6],zero,zero,zero,xmm5[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm5, %ymm9, %ymm5 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpbroadcastq 24(%rdi), %xmm2 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpbroadcastq 40(%rdi), %xmm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpbroadcastq 8(%rsi), %xmm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm6, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm6, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastq 24(%rsi), %xmm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpbroadcastq 40(%rsi), %xmm6 
+; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpcmpeqd %ymm6, %ymm6, %ymm6 -; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3 ; AVX2-NEXT: vpsubd %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4 ; AVX2-NEXT: vpsubd %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpsubd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX2-NEXT: vpsubd %ymm6, %ymm3, %ymm3 +; AVX2-NEXT: vpsubd %ymm6, %ymm4, %ymm4 +; AVX2-NEXT: vpsubd %ymm6, %ymm5, %ymm5 ; AVX2-NEXT: vpsrld $1, %ymm5, %ymm5 -; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 ; AVX2-NEXT: vpsrld $1, %ymm4, %ymm4 -; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 ; AVX2-NEXT: vpsrld $1, %ymm3, %ymm3 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm3[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0 +; AVX2-NEXT: vpsrld $1, %ymm2, %ymm2 +; AVX2-NEXT: vpsrld $1, %ymm1, %ymm1 +; AVX2-NEXT: vpsrld $1, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm0[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: vpackusdw %ymm6, %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] -; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm1[2,3],ymm4[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-NEXT: vpackusdw %ymm6, %ymm1, %ymm1 -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm3[2,3],ymm2[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2 +; AVX2-NEXT: vpackusdw %ymm6, %ymm2, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm2 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm3 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-NEXT: vpackuswb %ymm0, %ymm4, %ymm0 -; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm5[2,3] -; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX2-NEXT: vpackusdw %ymm1, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vpackuswb %ymm0, %ymm3, %ymm0 +; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm5[2,3],ymm4[2,3] +; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm5, %ymm3 +; AVX2-NEXT: vpackusdw %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vpand %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovdqu %xmm1, (%rax) diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll --- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll @@ -1730,15 +1730,15 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp) { ; CHECK-LABEL: test_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm0 -; 
CHECK-NEXT: vmovdqa 16(%rdi), %xmm1 -; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1] -; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 -; CHECK-NEXT: vpextrd $3, %xmm1, %eax -; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm1 -; CHECK-NEXT: vpextrd $2, %xmm0, %eax -; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm0 +; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm0 +; CHECK-NEXT: vmovdqa (%rdi), %xmm1 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm2 +; CHECK-NEXT: vmovd %xmm1, %eax +; CHECK-NEXT: vpinsrd $1, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vextractps $3, %xmm2, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm0, %xmm0 +; CHECK-NEXT: vpextrd $2, %xmm1, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm0, %xmm0 ; CHECK-NEXT: retq %vec = load <16 x i32>, <16 x i32>* %vp %res = shufflevector <16 x i32> %vec, <16 x i32> undef, <4 x i32> @@ -1747,15 +1747,15 @@ define <4 x i32> @test_masked_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %vec2, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm2 -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm3 -; CHECK-NEXT: vmovd %xmm2, %eax -; CHECK-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,0,1] -; CHECK-NEXT: vpinsrd $1, %eax, %xmm4, %xmm4 -; CHECK-NEXT: vpextrd $3, %xmm3, %eax -; CHECK-NEXT: vpinsrd $2, %eax, %xmm4, %xmm3 -; CHECK-NEXT: vpextrd $2, %xmm2, %eax -; CHECK-NEXT: vpinsrd $3, %eax, %xmm3, %xmm2 +; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm2 +; CHECK-NEXT: vmovdqa (%rdi), %xmm3 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm4 +; CHECK-NEXT: vmovd %xmm3, %eax +; CHECK-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vextractps $3, %xmm4, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm2, %xmm2 +; CHECK-NEXT: vpextrd $2, %xmm3, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm2 ; CHECK-NEXT: vptestnmd %xmm1, %xmm1, %k1 ; CHECK-NEXT: vmovdqa32 %xmm2, %xmm0 {%k1} ; CHECK-NEXT: retq @@ -1769,15 +1769,15 @@ define <4 x i32> @test_masked_z_16xi32_to_4xi32_perm_mem_mask3(<16 x i32>* %vp, <4 x i32> %mask) { ; CHECK-LABEL: test_masked_z_16xi32_to_4xi32_perm_mem_mask3: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa (%rdi), %xmm1 -; CHECK-NEXT: vmovdqa 16(%rdi), %xmm2 -; CHECK-NEXT: vmovd %xmm1, %eax -; CHECK-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,3,0,1] -; CHECK-NEXT: vpinsrd $1, %eax, %xmm3, %xmm3 -; CHECK-NEXT: vpextrd $3, %xmm2, %eax -; CHECK-NEXT: vpinsrd $2, %eax, %xmm3, %xmm2 -; CHECK-NEXT: vpextrd $2, %xmm1, %eax -; CHECK-NEXT: vpinsrd $3, %eax, %xmm2, %xmm1 +; CHECK-NEXT: vpbroadcastd 24(%rdi), %xmm1 +; CHECK-NEXT: vmovdqa (%rdi), %xmm2 +; CHECK-NEXT: vmovaps 16(%rdi), %xmm3 +; CHECK-NEXT: vmovd %xmm2, %eax +; CHECK-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vextractps $3, %xmm3, %eax +; CHECK-NEXT: vpinsrd $2, %eax, %xmm1, %xmm1 +; CHECK-NEXT: vpextrd $2, %xmm2, %eax +; CHECK-NEXT: vpinsrd $3, %eax, %xmm1, %xmm1 ; CHECK-NEXT: vptestnmd %xmm0, %xmm0, %k1 ; CHECK-NEXT: vmovdqa32 %xmm1, %xmm0 {%k1} {z} ; CHECK-NEXT: retq @@ -4205,12 +4205,11 @@ define <4 x double> @test_masked_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %vec2, <4 x double> %mask) { ; CHECK-LABEL: test_masked_8xdouble_to_4xdouble_perm_mem_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 -; CHECK-NEXT: vmovapd {{.*#+}} ymm3 = [6,1,1,1] -; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm3 -; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vcmpeqpd %ymm2, %ymm1, %k1 -; CHECK-NEXT: vmovapd %ymm3, %ymm0 {%k1} +; CHECK-NEXT: vmovapd (%rdi), %ymm2 +; CHECK-NEXT: vperm2f128 {{.*#+}} ymm2 = 
ymm2[2,3],mem[0,1] +; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vcmpeqpd %ymm3, %ymm1, %k1 +; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm2, %ymm0 {%k1} ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> @@ -4222,12 +4221,11 @@ define <4 x double> @test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5(<8 x double>* %vp, <4 x double> %mask) { ; CHECK-LABEL: test_masked_z_8xdouble_to_4xdouble_perm_mem_mask5: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovapd 32(%rdi), %ymm2 -; CHECK-NEXT: vmovapd {{.*#+}} ymm1 = [6,1,1,1] -; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3 -; CHECK-NEXT: vcmpeqpd %ymm3, %ymm0, %k1 -; CHECK-NEXT: vpermi2pd (%rdi), %ymm2, %ymm1 {%k1} {z} -; CHECK-NEXT: vmovapd %ymm1, %ymm0 +; CHECK-NEXT: vmovapd (%rdi), %ymm1 +; CHECK-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],mem[0,1] +; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vcmpeqpd %ymm2, %ymm0, %k1 +; CHECK-NEXT: vshufpd $14, 40(%rdi){1to4}, %ymm1, %ymm0 {%k1} {z} ; CHECK-NEXT: retq %vec = load <8 x double>, <8 x double>* %vp %shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll --- a/llvm/test/CodeGen/X86/extract-concat.ll +++ b/llvm/test/CodeGen/X86/extract-concat.ll @@ -126,11 +126,10 @@ ; ; AVX2-LABEL: load_catcat: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps (%rdi), %ymm3 ; AVX2-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm3[1,1,1,1] -; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm3[2,2,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[3,3,3,3] +; AVX2-NEXT: vbroadcastsd 8(%rdi), %ymm1 +; AVX2-NEXT: vbroadcastsd 16(%rdi), %ymm2 +; AVX2-NEXT: vbroadcastsd 24(%rdi), %ymm3 ; AVX2-NEXT: retq ; ; AVX512F-LABEL: load_catcat: diff --git a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll --- a/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-stores-nt.ll @@ -334,12 +334,12 @@ ; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE4A-NEXT: movups (%ecx), %xmm0 ; X86-SSE4A-NEXT: movups 16(%ecx), %xmm1 +; X86-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X86-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; X86-SSE4A-NEXT: movntsd %xmm2, 8(%eax) ; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) -; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax) +; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax) ; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax) -; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; X86-SSE4A-NEXT: movntsd %xmm1, 24(%eax) ; X86-SSE4A-NEXT: retl ; ; X64-SSE2-LABEL: merge_2_v4f32_align1_ntstore: @@ -362,12 +362,12 @@ ; X64-SSE4A: # %bb.0: ; X64-SSE4A-NEXT: movups (%rdi), %xmm0 ; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1 +; X64-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X64-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; X64-SSE4A-NEXT: movntsd %xmm2, 8(%rsi) ; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) -; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi) +; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi) ; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi) -; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi) ; X64-SSE4A-NEXT: retq ; ; X64-SSE41-LABEL: merge_2_v4f32_align1_ntstore: @@ -447,12 +447,12 @@ ; X86-SSE4A-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE4A-NEXT: movups (%ecx), %xmm0 ; X86-SSE4A-NEXT: movups 
16(%ecx), %xmm1 +; X86-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X86-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; X86-SSE4A-NEXT: movntsd %xmm2, 8(%eax) ; X86-SSE4A-NEXT: movntsd %xmm0, (%eax) -; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; X86-SSE4A-NEXT: movntsd %xmm0, 8(%eax) +; X86-SSE4A-NEXT: movntsd %xmm3, 24(%eax) ; X86-SSE4A-NEXT: movntsd %xmm1, 16(%eax) -; X86-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; X86-SSE4A-NEXT: movntsd %xmm1, 24(%eax) ; X86-SSE4A-NEXT: retl ; ; X64-SSE2-LABEL: merge_2_v4f32_align1: @@ -475,12 +475,12 @@ ; X64-SSE4A: # %bb.0: ; X64-SSE4A-NEXT: movups (%rdi), %xmm0 ; X64-SSE4A-NEXT: movups 16(%rdi), %xmm1 +; X64-SSE4A-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X64-SSE4A-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; X64-SSE4A-NEXT: movntsd %xmm2, 8(%rsi) ; X64-SSE4A-NEXT: movntsd %xmm0, (%rsi) -; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; X64-SSE4A-NEXT: movntsd %xmm0, 8(%rsi) +; X64-SSE4A-NEXT: movntsd %xmm3, 24(%rsi) ; X64-SSE4A-NEXT: movntsd %xmm1, 16(%rsi) -; X64-SSE4A-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1] -; X64-SSE4A-NEXT: movntsd %xmm1, 24(%rsi) ; X64-SSE4A-NEXT: retq ; ; X64-SSE41-LABEL: merge_2_v4f32_align1: diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -1510,35 +1510,32 @@ ; ; AVX1-LABEL: interleave_24i32_in: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovupd (%rsi), %ymm0 -; AVX1-NEXT: vmovups (%rdx), %xmm1 -; AVX1-NEXT: vmovups 16(%rdx), %xmm2 -; AVX1-NEXT: vmovups (%rsi), %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,0],xmm1[2,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX1-NEXT: vpermilps {{.*#+}} xmm3 = mem[0,1,0,1] -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm3, %ymm3 -; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] -; AVX1-NEXT: vmovups 16(%rcx), %xmm3 -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,0],xmm3[3,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,1],xmm4[0,2] -; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,0] -; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,2] -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 -; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = ymm0[1,1,3,3] -; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm3[2,3,2,3] -; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-NEXT: vmovups (%rdx), %xmm0 +; AVX1-NEXT: vmovups 16(%rdx), %xmm1 +; AVX1-NEXT: vmovups (%rsi), %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,0],xmm0[2,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[1,1],xmm3[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm2[0,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,1] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; AVX1-NEXT: vbroadcastsd (%rcx), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] +; AVX1-NEXT: vmovups 16(%rcx), %xmm2 +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[3,0],xmm2[3,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm2[2,1],xmm3[0,2] +; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,0],xmm1[1,0] +; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm2[2,0],xmm1[2,2] +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 +; AVX1-NEXT: vbroadcastsd 24(%rsi), %ymm2 +; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] +; 
AVX1-NEXT: vpermilpd {{.*#+}} ymm2 = mem[1,0,2,2] ; AVX1-NEXT: vpermilpd {{.*#+}} ymm3 = mem[1,1,2,2] -; AVX1-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,2,2] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm0[1],ymm3[2,3],ymm0[4],ymm3[5,6],ymm0[7] +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] ; AVX1-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7] -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] -; AVX1-NEXT: vmovups %ymm0, 32(%rdi) -; AVX1-NEXT: vmovups %ymm2, 64(%rdi) -; AVX1-NEXT: vmovups %ymm1, (%rdi) +; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; AVX1-NEXT: vmovups %ymm2, 32(%rdi) +; AVX1-NEXT: vmovups %ymm1, 64(%rdi) +; AVX1-NEXT: vmovups %ymm0, (%rdi) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -1557,7 +1554,7 @@ ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} ymm5 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6],ymm4[7] -; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3] +; AVX2-SLOW-NEXT: vbroadcastsd 24(%rsi), %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] @@ -1586,7 +1583,7 @@ ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm2[2,1,3,3] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[0,3,3,3] +; AVX2-FAST-NEXT: vbroadcastsd 24(%rsi), %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm5[2],ymm4[3,4],ymm5[5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[1,1,2,2] @@ -1601,34 +1598,32 @@ ; ; XOP-LABEL: interleave_24i32_in: ; XOP: # %bb.0: -; XOP-NEXT: vmovupd (%rsi), %ymm0 +; XOP-NEXT: vmovups (%rsi), %ymm0 ; XOP-NEXT: vmovups (%rcx), %ymm1 -; XOP-NEXT: vmovups (%rdx), %xmm2 -; XOP-NEXT: vmovups 16(%rdx), %xmm3 -; XOP-NEXT: vmovups (%rsi), %xmm4 -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,0],xmm2[2,0] -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[1,1],xmm5[0,2] -; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,0],xmm4[0,0] -; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm2[2,0],xmm4[2,1] -; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 -; XOP-NEXT: vpermilps {{.*#+}} xmm4 = mem[0,1,0,1] -; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm4, %ymm4 -; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm4[2],ymm2[3,4],ymm4[5],ymm2[6,7] -; XOP-NEXT: vmovups 16(%rcx), %xmm4 -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm3[3,0],xmm4[3,0] -; XOP-NEXT: vshufps {{.*#+}} xmm5 = xmm4[2,1],xmm5[0,2] -; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0],xmm3[1,0] -; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,0],xmm3[2,2] -; XOP-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3 -; XOP-NEXT: vpermilpd {{.*#+}} ymm4 = ymm0[1,1,3,3] -; XOP-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm4[2,3,2,3] -; XOP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] ; XOP-NEXT: vpermil2ps {{.*#+}} ymm0 = ymm1[2],ymm0[3],ymm1[2,3],ymm0[4],ymm1[5,4],ymm0[5] -; XOP-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,0,3,3,4,4,7,7] -; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] +; XOP-NEXT: vmovups (%rdx), %xmm1 +; XOP-NEXT: vmovups 16(%rdx), %xmm2 +; XOP-NEXT: vmovups (%rsi), %xmm3 +; XOP-NEXT: vshufps {{.*#+}} xmm4 = 
xmm3[2,0],xmm1[2,0] +; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm1[1,1],xmm4[0,2] +; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0],xmm3[0,0] +; XOP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[2,1] +; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 +; XOP-NEXT: vbroadcastsd (%rcx), %ymm3 +; XOP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] +; XOP-NEXT: vmovups 16(%rcx), %xmm3 +; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm2[3,0],xmm3[3,0] +; XOP-NEXT: vshufps {{.*#+}} xmm4 = xmm3[2,1],xmm4[0,2] +; XOP-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0],xmm2[1,0] +; XOP-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,0],xmm2[2,2] +; XOP-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2 +; XOP-NEXT: vbroadcastsd 24(%rsi), %ymm3 +; XOP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] +; XOP-NEXT: vpermilps {{.*#+}} ymm3 = mem[0,0,3,3,4,4,7,7] +; XOP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7] ; XOP-NEXT: vmovups %ymm0, 32(%rdi) -; XOP-NEXT: vmovups %ymm3, 64(%rdi) -; XOP-NEXT: vmovups %ymm2, (%rdi) +; XOP-NEXT: vmovups %ymm2, 64(%rdi) +; XOP-NEXT: vmovups %ymm1, (%rdi) ; XOP-NEXT: vzeroupper ; XOP-NEXT: retq %s1 = load <8 x i32>, <8 x i32>* %q1, align 4 diff --git a/llvm/test/CodeGen/X86/pr34653.ll b/llvm/test/CodeGen/X86/pr34653.ll --- a/llvm/test/CodeGen/X86/pr34653.ll +++ b/llvm/test/CodeGen/X86/pr34653.ll @@ -16,25 +16,23 @@ ; CHECK-NEXT: leaq {{[0-9]+}}(%rsp), %rdi ; CHECK-NEXT: callq test ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: vmovaps %xmm0, %xmm1 -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm2 -; CHECK-NEXT: vmovaps %xmm2, %xmm3 -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm4 -; CHECK-NEXT: vmovaps %xmm4, %xmm5 -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm6 -; CHECK-NEXT: vmovaps %xmm6, %xmm7 -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm8 -; CHECK-NEXT: vmovaps %xmm8, %xmm9 -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm10 -; CHECK-NEXT: vmovaps %xmm10, %xmm11 -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm12 -; CHECK-NEXT: vmovaps %xmm12, %xmm13 -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm14 -; CHECK-NEXT: vmovaps %xmm14, %xmm15 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 -; CHECK-NEXT: vmovaps %zmm0, %zmm16 -; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm1 +; CHECK-NEXT: vmovaps %xmm1, %xmm2 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm3 +; CHECK-NEXT: vmovaps %xmm3, %xmm4 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm5 +; CHECK-NEXT: vmovaps %xmm5, %xmm6 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm7 +; CHECK-NEXT: vmovaps %xmm7, %xmm8 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm9 +; CHECK-NEXT: vmovaps %xmm9, %xmm10 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm11 +; CHECK-NEXT: vmovaps %xmm11, %xmm12 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm13 +; CHECK-NEXT: vmovaps %xmm13, %xmm14 +; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm15 +; CHECK-NEXT: vmovaps %zmm15, %zmm16 +; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovupd {{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: vmovaps %zmm0, %zmm17 ; CHECK-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -67,19 +65,14 @@ ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm2 = xmm2[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm4 = xmm4[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm6 = xmm6[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm8 = xmm8[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm10 = xmm10[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm12 = xmm12[1,0] -; CHECK-NEXT: vpermilpd {{.*#+}} xmm14 = xmm14[1,0] -; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm5[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm7 = xmm7[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm9 = xmm9[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm11 = xmm11[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm13[1,0] +; CHECK-NEXT: vpermilpd {{.*#+}} xmm15 = xmm15[1,0] ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] @@ -110,8 +103,7 @@ ; CHECK-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] -; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 8-byte Reload -; CHECK-NEXT: # xmm24 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm24 = mem[0],zero ; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm25 # 8-byte Reload ; CHECK-NEXT: # xmm25 = mem[0],zero ; CHECK-NEXT: vmovsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm26 # 8-byte Reload diff --git a/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll b/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll --- a/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-cmp-sub128.ll @@ -60,15 +60,13 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-16, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 -; AVX-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX-32-NEXT: xorl %eax, %eax -; AVX-32-NEXT: vcomiss %xmm4, %xmm5 +; AVX-32-NEXT: vcomiss 12(%ebp), %xmm3 ; AVX-32-NEXT: movl $-1, %ecx ; AVX-32-NEXT: movl $0, %edx ; AVX-32-NEXT: cmoval %ecx, %edx -; AVX-32-NEXT: vcomiss %xmm3, %xmm2 +; AVX-32-NEXT: vcomiss 8(%ebp), %xmm2 ; AVX-32-NEXT: cmoval %ecx, %eax ; AVX-32-NEXT: vmovd %eax, %xmm2 ; AVX-32-NEXT: vpinsrd $1, %edx, %xmm2, %xmm2 @@ -99,17 +97,15 @@ ; AVX512-32-NEXT: movl %esp, %ebp ; AVX512-32-NEXT: andl $-16, %esp ; AVX512-32-NEXT: subl $16, %esp -; AVX512-32-NEXT: vmovaps 8(%ebp), %xmm3 ; AVX512-32-NEXT: movw $-3, %ax ; AVX512-32-NEXT: kmovw %eax, %k0 -; AVX512-32-NEXT: vcomiss %xmm3, %xmm2 +; AVX512-32-NEXT: vcomiss 8(%ebp), %xmm2 ; AVX512-32-NEXT: seta %al ; AVX512-32-NEXT: andl $1, %eax ; AVX512-32-NEXT: kmovw %eax, %k1 ; AVX512-32-NEXT: kandw %k0, %k1, %k0 -; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] ; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX512-32-NEXT: vcomiss %xmm3, %xmm2 +; AVX512-32-NEXT: vcomiss 12(%ebp), %xmm2 ; AVX512-32-NEXT: seta %al ; AVX512-32-NEXT: kmovw %eax, %k1 ; AVX512-32-NEXT: kshiftlw $15, %k1, %k1 @@ -148,17 +144,15 @@ ; AVX512F-32-NEXT: subl $16, %esp ; 
AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3 ; AVX512F-32-NEXT: movw $-3, %ax ; AVX512F-32-NEXT: kmovw %eax, %k0 -; AVX512F-32-NEXT: vcomiss %xmm3, %xmm2 +; AVX512F-32-NEXT: vcomiss 8(%ebp), %xmm2 ; AVX512F-32-NEXT: seta %al ; AVX512F-32-NEXT: andl $1, %eax ; AVX512F-32-NEXT: kmovw %eax, %k1 ; AVX512F-32-NEXT: kandw %k0, %k1, %k0 -; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm3[1,1,3,3] ; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] -; AVX512F-32-NEXT: vcomiss %xmm3, %xmm2 +; AVX512F-32-NEXT: vcomiss 12(%ebp), %xmm2 ; AVX512F-32-NEXT: seta %al ; AVX512F-32-NEXT: kmovw %eax, %k1 ; AVX512F-32-NEXT: kshiftlw $15, %k1, %k1 @@ -257,16 +251,14 @@ ; AVX-32-NEXT: movl %esp, %ebp ; AVX-32-NEXT: andl $-16, %esp ; AVX-32-NEXT: subl $16, %esp -; AVX-32-NEXT: vmovaps 8(%ebp), %xmm3 -; AVX-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] +; AVX-32-NEXT: vmovshdup {{.*#+}} xmm3 = xmm2[1,1,3,3] ; AVX-32-NEXT: xorl %eax, %eax -; AVX-32-NEXT: vucomiss %xmm4, %xmm5 +; AVX-32-NEXT: vucomiss 12(%ebp), %xmm3 ; AVX-32-NEXT: movl $-1, %ecx ; AVX-32-NEXT: movl $-1, %edx ; AVX-32-NEXT: cmovnel %eax, %edx ; AVX-32-NEXT: cmovpl %eax, %edx -; AVX-32-NEXT: vucomiss %xmm3, %xmm2 +; AVX-32-NEXT: vucomiss 8(%ebp), %xmm2 ; AVX-32-NEXT: cmovnel %eax, %ecx ; AVX-32-NEXT: cmovpl %eax, %ecx ; AVX-32-NEXT: vmovd %ecx, %xmm2 @@ -300,28 +292,26 @@ ; AVX512-32-NEXT: movl %esp, %ebp ; AVX512-32-NEXT: andl $-16, %esp ; AVX512-32-NEXT: subl $16, %esp -; AVX512-32-NEXT: vmovaps 8(%ebp), %xmm3 -; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] -; AVX512-32-NEXT: vucomiss %xmm4, %xmm5 +; AVX512-32-NEXT: movw $-3, %ax +; AVX512-32-NEXT: kmovw %eax, %k0 +; AVX512-32-NEXT: vucomiss 8(%ebp), %xmm2 ; AVX512-32-NEXT: setnp %al ; AVX512-32-NEXT: sete %cl ; AVX512-32-NEXT: testb %al, %cl ; AVX512-32-NEXT: setne %al -; AVX512-32-NEXT: kmovw %eax, %k0 -; AVX512-32-NEXT: kshiftlw $15, %k0, %k0 -; AVX512-32-NEXT: kshiftrw $14, %k0, %k0 -; AVX512-32-NEXT: vucomiss %xmm3, %xmm2 +; AVX512-32-NEXT: andl $1, %eax +; AVX512-32-NEXT: kmovw %eax, %k1 +; AVX512-32-NEXT: kandw %k0, %k1, %k0 +; AVX512-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX512-32-NEXT: vucomiss 12(%ebp), %xmm2 ; AVX512-32-NEXT: setnp %al ; AVX512-32-NEXT: sete %cl ; AVX512-32-NEXT: testb %al, %cl ; AVX512-32-NEXT: setne %al -; AVX512-32-NEXT: andl $1, %eax ; AVX512-32-NEXT: kmovw %eax, %k1 -; AVX512-32-NEXT: movw $-3, %ax -; AVX512-32-NEXT: kmovw %eax, %k2 -; AVX512-32-NEXT: kandw %k2, %k1, %k1 -; AVX512-32-NEXT: korw %k0, %k1, %k1 +; AVX512-32-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-32-NEXT: kshiftrw $14, %k1, %k1 +; AVX512-32-NEXT: korw %k1, %k0, %k1 ; AVX512-32-NEXT: vpblendmd %xmm0, %xmm1, %xmm0 {%k1} ; AVX512-32-NEXT: movl %ebp, %esp ; AVX512-32-NEXT: popl %ebp @@ -361,28 +351,26 @@ ; AVX512F-32-NEXT: subl $16, %esp ; AVX512F-32-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-32-NEXT: vmovaps 8(%ebp), %xmm3 -; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm4 = xmm3[1,1,3,3] -; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm5 = xmm2[1,1,3,3] -; AVX512F-32-NEXT: vucomiss %xmm4, %xmm5 +; AVX512F-32-NEXT: movw $-3, %ax +; AVX512F-32-NEXT: kmovw %eax, %k0 +; AVX512F-32-NEXT: vucomiss 8(%ebp), %xmm2 ; AVX512F-32-NEXT: setnp %al ; AVX512F-32-NEXT: sete %cl ; AVX512F-32-NEXT: 
testb %al, %cl ; AVX512F-32-NEXT: setne %al -; AVX512F-32-NEXT: kmovw %eax, %k0 -; AVX512F-32-NEXT: kshiftlw $15, %k0, %k0 -; AVX512F-32-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-32-NEXT: vucomiss %xmm3, %xmm2 +; AVX512F-32-NEXT: andl $1, %eax +; AVX512F-32-NEXT: kmovw %eax, %k1 +; AVX512F-32-NEXT: kandw %k0, %k1, %k0 +; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm2[1,1,3,3] +; AVX512F-32-NEXT: vucomiss 12(%ebp), %xmm2 ; AVX512F-32-NEXT: setnp %al ; AVX512F-32-NEXT: sete %cl ; AVX512F-32-NEXT: testb %al, %cl ; AVX512F-32-NEXT: setne %al -; AVX512F-32-NEXT: andl $1, %eax ; AVX512F-32-NEXT: kmovw %eax, %k1 -; AVX512F-32-NEXT: movw $-3, %ax -; AVX512F-32-NEXT: kmovw %eax, %k2 -; AVX512F-32-NEXT: kandw %k2, %k1, %k1 -; AVX512F-32-NEXT: korw %k0, %k1, %k1 +; AVX512F-32-NEXT: kshiftlw $15, %k1, %k1 +; AVX512F-32-NEXT: kshiftrw $14, %k1, %k1 +; AVX512F-32-NEXT: korw %k1, %k0, %k1 ; AVX512F-32-NEXT: vpblendmd %zmm0, %zmm1, %zmm0 {%k1} ; AVX512F-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-32-NEXT: movl %ebp, %esp diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll @@ -1106,34 +1106,61 @@ } define double @test_v16f64(double %a0, <16 x double> %a1) { -; SSE-LABEL: test_v16f64: -; SSE: # %bb.0: -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: addsd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: addsd %xmm1, %xmm0 -; SSE-NEXT: addsd %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: addsd %xmm2, %xmm0 -; SSE-NEXT: addsd %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] -; SSE-NEXT: addsd %xmm3, %xmm0 -; SSE-NEXT: addsd %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-NEXT: addsd %xmm4, %xmm0 -; SSE-NEXT: addsd %xmm5, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] -; SSE-NEXT: addsd %xmm5, %xmm0 -; SSE-NEXT: addsd %xmm6, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] -; SSE-NEXT: addsd %xmm6, %xmm0 -; SSE-NEXT: addsd %xmm7, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] -; SSE-NEXT: addsd %xmm7, %xmm0 -; SSE-NEXT: addsd %xmm8, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] -; SSE-NEXT: addsd %xmm8, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v16f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: addsd %xmm1, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE2-NEXT: addsd %xmm1, %xmm0 +; SSE2-NEXT: addsd %xmm2, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: addsd %xmm2, %xmm0 +; SSE2-NEXT: addsd %xmm3, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE2-NEXT: addsd %xmm3, %xmm0 +; SSE2-NEXT: addsd %xmm4, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE2-NEXT: addsd %xmm4, %xmm0 +; SSE2-NEXT: addsd %xmm5, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] +; SSE2-NEXT: addsd %xmm5, %xmm0 +; SSE2-NEXT: addsd %xmm6, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] +; SSE2-NEXT: addsd %xmm6, %xmm0 +; SSE2-NEXT: addsd %xmm7, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] +; SSE2-NEXT: addsd %xmm7, %xmm0 +; SSE2-NEXT: addsd %xmm8, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] +; SSE2-NEXT: addsd %xmm8, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16f64: +; SSE41: # %bb.0: +; SSE41-NEXT: addsd %xmm1, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE41-NEXT: addsd %xmm1, %xmm0 +; SSE41-NEXT: addsd 
%xmm2, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE41-NEXT: addsd %xmm2, %xmm0 +; SSE41-NEXT: addsd %xmm3, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE41-NEXT: addsd %xmm3, %xmm0 +; SSE41-NEXT: addsd %xmm4, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE41-NEXT: addsd %xmm4, %xmm0 +; SSE41-NEXT: addsd %xmm5, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] +; SSE41-NEXT: addsd %xmm5, %xmm0 +; SSE41-NEXT: addsd %xmm6, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] +; SSE41-NEXT: addsd %xmm6, %xmm0 +; SSE41-NEXT: addsd %xmm7, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] +; SSE41-NEXT: addsd %xmm7, %xmm0 +; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0 +; SSE41-NEXT: addsd {{[0-9]+}}(%rsp), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f64: ; AVX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll --- a/llvm/test/CodeGen/X86/vector-reduce-fmul.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fmul.ll @@ -1075,34 +1075,61 @@ } define double @test_v16f64(double %a0, <16 x double> %a1) { -; SSE-LABEL: test_v16f64: -; SSE: # %bb.0: -; SSE-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 -; SSE-NEXT: mulsd %xmm1, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] -; SSE-NEXT: mulsd %xmm1, %xmm0 -; SSE-NEXT: mulsd %xmm2, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] -; SSE-NEXT: mulsd %xmm2, %xmm0 -; SSE-NEXT: mulsd %xmm3, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] -; SSE-NEXT: mulsd %xmm3, %xmm0 -; SSE-NEXT: mulsd %xmm4, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] -; SSE-NEXT: mulsd %xmm4, %xmm0 -; SSE-NEXT: mulsd %xmm5, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] -; SSE-NEXT: mulsd %xmm5, %xmm0 -; SSE-NEXT: mulsd %xmm6, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] -; SSE-NEXT: mulsd %xmm6, %xmm0 -; SSE-NEXT: mulsd %xmm7, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] -; SSE-NEXT: mulsd %xmm7, %xmm0 -; SSE-NEXT: mulsd %xmm8, %xmm0 -; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] -; SSE-NEXT: mulsd %xmm8, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: test_v16f64: +; SSE2: # %bb.0: +; SSE2-NEXT: movapd {{[0-9]+}}(%rsp), %xmm8 +; SSE2-NEXT: mulsd %xmm1, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE2-NEXT: mulsd %xmm1, %xmm0 +; SSE2-NEXT: mulsd %xmm2, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE2-NEXT: mulsd %xmm2, %xmm0 +; SSE2-NEXT: mulsd %xmm3, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE2-NEXT: mulsd %xmm3, %xmm0 +; SSE2-NEXT: mulsd %xmm4, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE2-NEXT: mulsd %xmm4, %xmm0 +; SSE2-NEXT: mulsd %xmm5, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] +; SSE2-NEXT: mulsd %xmm5, %xmm0 +; SSE2-NEXT: mulsd %xmm6, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] +; SSE2-NEXT: mulsd %xmm6, %xmm0 +; SSE2-NEXT: mulsd %xmm7, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] +; SSE2-NEXT: mulsd %xmm7, %xmm0 +; SSE2-NEXT: mulsd %xmm8, %xmm0 +; SSE2-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1,1] +; SSE2-NEXT: mulsd %xmm8, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: test_v16f64: +; SSE41: # %bb.0: +; SSE41-NEXT: mulsd %xmm1, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1,1] +; SSE41-NEXT: mulsd %xmm1, %xmm0 +; SSE41-NEXT: mulsd %xmm2, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1,1] +; SSE41-NEXT: mulsd %xmm2, %xmm0 +; SSE41-NEXT: mulsd %xmm3, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1,1] +; SSE41-NEXT: mulsd 
%xmm3, %xmm0 +; SSE41-NEXT: mulsd %xmm4, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1,1] +; SSE41-NEXT: mulsd %xmm4, %xmm0 +; SSE41-NEXT: mulsd %xmm5, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1,1] +; SSE41-NEXT: mulsd %xmm5, %xmm0 +; SSE41-NEXT: mulsd %xmm6, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1,1] +; SSE41-NEXT: mulsd %xmm6, %xmm0 +; SSE41-NEXT: mulsd %xmm7, %xmm0 +; SSE41-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1,1] +; SSE41-NEXT: mulsd %xmm7, %xmm0 +; SSE41-NEXT: mulsd {{[0-9]+}}(%rsp), %xmm0 +; SSE41-NEXT: mulsd {{[0-9]+}}(%rsp), %xmm0 +; SSE41-NEXT: retq ; ; AVX-LABEL: test_v16f64: ; AVX: # %bb.0: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3008,11 +3008,10 @@ ; AVX: # %bb.0: ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; AVX-NEXT: vmovaps %xmm0, (%rax) -; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3] -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovss %xmm0, (%rax) ; AVX-NEXT: retq store <4 x float> , <4 x float>* undef, align 16
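Illustrative sketch (not part of the patch; the function and value names below are hypothetical): the case the relaxed check newly covers is a splat shuffle of a loaded vector where the load has additional uses, which the old one-use requirement in MayFoldLoad rejected. Assuming an AVX2 target, IR along these lines exercises it:

define <4 x i32> @splat_and_reuse(<4 x i32>* %p) {
  %v = load <4 x i32>, <4 x i32>* %p, align 16
  ; splat of element 0 of the loaded vector
  %s = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
  ; second use of %v, so the vector load itself cannot be eliminated
  %r = add <4 x i32> %s, %v
  ret <4 x i32> %r
}

With this change the splat should select to a vpbroadcastd from memory alongside the surviving vmovdqa of %v, rather than being materialized with a register shuffle; per the new comment in the lowering code, the broadcast load is expected to be a win even though the original vector load remains.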