diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -18443,11 +18443,12 @@
   bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
 
   // Try to collapse shuffles into using a vector type with fewer elements but
-  // wider element types. We cap this to not form integers or floating point
-  // elements wider than 64 bits, but it might be interesting to form i128
-  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
+  // wider element types. We cap this to not form floating point elements wider
+  // than 64 bits, but allow forming i128 integers to handle flipping the low
+  // and high halves of AVX 256-bit vectors.
   SmallVector<int, 16> WidenedMask;
-  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
+  if (VT.getScalarSizeInBits() < (VT.isFloatingPoint() ? 64 : 128) &&
+      !Is1BitVector &&
       canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
     // Shuffle mask widening should not interfere with a broadcast opportunity
     // by obfuscating the operands with bitcasts.
diff --git a/llvm/test/CodeGen/X86/pr34592.ll b/llvm/test/CodeGen/X86/pr34592.ll
--- a/llvm/test/CodeGen/X86/pr34592.ll
+++ b/llvm/test/CodeGen/X86/pr34592.ll
@@ -13,36 +13,35 @@
 ; CHECK-NEXT:    subq $32, %rsp
 ; CHECK-NEXT:    vmovaps %ymm4, %ymm10
 ; CHECK-NEXT:    vmovaps %ymm3, %ymm9
-; CHECK-NEXT:    vmovaps %ymm1, %ymm8
+; CHECK-NEXT:    vmovaps %ymm1, %ymm4
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm3
 ; CHECK-NEXT:    vmovaps 240(%rbp), %ymm1
-; CHECK-NEXT:    vmovaps 208(%rbp), %ymm4
+; CHECK-NEXT:    vmovaps 208(%rbp), %ymm8
 ; CHECK-NEXT:    vmovaps 176(%rbp), %ymm0
 ; CHECK-NEXT:    vmovaps 144(%rbp), %ymm0
 ; CHECK-NEXT:    vmovaps 112(%rbp), %ymm11
 ; CHECK-NEXT:    vmovaps 80(%rbp), %ymm11
 ; CHECK-NEXT:    vmovaps 48(%rbp), %ymm11
 ; CHECK-NEXT:    vmovaps 16(%rbp), %ymm11
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm2[6,7]
-; CHECK-NEXT:    vmovaps %xmm4, %xmm6
-; CHECK-NEXT:    # implicit-def: $ymm2
-; CHECK-NEXT:    vinserti128 $1, %xmm6, %ymm2, %ymm2
-; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm3[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,0]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6,7]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm2[6,7]
+; CHECK-NEXT:    vmovaps %xmm8, %xmm4
 ; CHECK-NEXT:    vextracti128 $1, %ymm7, %xmm2
-; CHECK-NEXT:    vmovq {{.*#+}} xmm6 = xmm2[0],zero
+; CHECK-NEXT:    vmovq {{.*#+}} xmm3 = xmm2[0],zero
 ; CHECK-NEXT:    # implicit-def: $ymm2
-; CHECK-NEXT:    vmovaps %xmm6, %xmm2
-; CHECK-NEXT:    # kill: def $xmm3 killed $xmm3 killed $ymm3
+; CHECK-NEXT:    vmovaps %xmm3, %xmm2
+; CHECK-NEXT:    vmovaps %xmm6, %xmm3
 ; CHECK-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
 ; CHECK-NEXT:    vmovaps %xmm7, %xmm3
-; CHECK-NEXT:    vpslldq {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    vpslldq {{.*#+}} xmm9 = zero,zero,zero,zero,zero,zero,zero,zero,xmm3[0,1,2,3,4,5,6,7]
 ; CHECK-NEXT:    # implicit-def: $ymm3
-; CHECK-NEXT:    vmovaps %xmm6, %xmm3
-; CHECK-NEXT:    vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
-; CHECK-NEXT:    vpermq {{.*#+}} ymm4 = ymm4[0,1,0,3]
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
+; CHECK-NEXT:    vmovaps %xmm9, %xmm3
+; CHECK-NEXT:    vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm8 = ymm8[0,1,0,3]
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm8[4,5,6,7]
+; CHECK-NEXT:    vpalignr {{.*#+}} ymm0 = ymm6[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
+; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[2,3,2,0]
+; CHECK-NEXT:    vpbroadcastq %xmm4, %ymm4
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm4[4,5],ymm0[6,7]
 ; CHECK-NEXT:    vpblendd {{.*#+}} ymm1 = ymm7[0,1],ymm1[2,3],ymm7[4,5,6,7]
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[2,1,1,3]
 ; CHECK-NEXT:    vpshufd {{.*#+}} ymm4 = ymm5[0,1,0,1,4,5,4,5]
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1010,8 +1010,8 @@
 ;
 ; AVX2-LABEL: shuffle_v4i64_0142:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,1,2,2]
+; AVX2-NEXT:    vbroadcastsd %xmm1, %ymm1
 ; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7]
 ; AVX2-NEXT:    retq
 ;
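For reference, a minimal IR sketch of the "flip the low and high halves" case that the updated comment mentions; the function name and exact codegen below are illustrative only and not taken from the patch. With the widened i128 path, a v4i64 shuffle with mask <2,3,0,1> can be treated as a v2i128 shuffle with mask <1,0>, which AVX2 can lower as a single lane-swapping operation (e.g. vpermq or vperm2i128).

; Illustrative example only (not from the patch): swap the two 128-bit
; halves of a 256-bit integer vector. Once i128 widening is allowed, the
; v4i64 mask <2,3,0,1> is seen as the v2i128 mask <1,0>.
define <4 x i64> @flip_halves_v4i64(<4 x i64> %a) {
  %swapped = shufflevector <4 x i64> %a, <4 x i64> undef,
                           <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  ret <4 x i64> %swapped
}

The same mechanism appears to be what improves the tests above: in shuffle_v4i64_0142 the low 64-bit element of %xmm1 is now broadcast with vbroadcastsd instead of being inserted with vinsertf128, and pr34592.ll picks up a vpbroadcastq, consistent with the comment that mask widening should not obscure broadcast opportunities.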