Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -15623,6 +15623,12 @@
   SmallVector<int, 16> WidenedMask;
   if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
       canWidenShuffleElements(ZeroableMask, WidenedMask)) {
+    // Shuffle mask widening should not interfere with a broadcast opportunity
+    // by obfuscating the splat mask and operands with bitcasts.
+    if (SDValue Broadcast =
+            lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
+      return Broadcast;
+
     MVT NewEltVT = VT.isFloatingPoint()
                        ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
                        : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
Index: test/CodeGen/X86/haddsub-undef.ll
===================================================================
--- test/CodeGen/X86/haddsub-undef.ll
+++ test/CodeGen/X86/haddsub-undef.ll
@@ -696,7 +696,7 @@
 ;
 ; AVX2-SLOW-LABEL: add_ps_007_2:
 ; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT: vbroadcastss %xmm0, %xmm1
 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
 ; AVX2-SLOW-NEXT: retq
@@ -826,7 +826,7 @@
 ;
 ; AVX2-SLOW-LABEL: add_ps_018:
 ; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm1 = xmm0[0,0]
+; AVX2-SLOW-NEXT: vbroadcastss %xmm0, %xmm1
 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm1, %xmm0
 ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,2,3]
Index: test/CodeGen/X86/insert-into-constant-vector.ll
===================================================================
--- test/CodeGen/X86/insert-into-constant-vector.ll
+++ test/CodeGen/X86/insert-into-constant-vector.ll
@@ -352,13 +352,11 @@
 ; X64SSE4-NEXT: movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0]
 ; X64SSE4-NEXT: retq
 ;
-; X32AVX1-LABEL: elt6_v8f32:
-; X32AVX1: # %bb.0:
-; X32AVX1-NEXT: vmovss {{.*#+}} xmm0 
= mem[0],zero,zero,zero
-; X32AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; X32AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; X32AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
-; X32AVX1-NEXT: retl
+; X32AVX-LABEL: elt6_v8f32:
+; X32AVX: # %bb.0:
+; X32AVX-NEXT: vbroadcastss {{[0-9]+}}(%esp), %ymm0
+; X32AVX-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
+; X32AVX-NEXT: retl
 ;
 ; X64AVX1-LABEL: elt6_v8f32:
 ; X64AVX1: # %bb.0:
@@ -367,29 +365,15 @@
 ; X64AVX1-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
 ; X64AVX1-NEXT: retq
 ;
-; X32AVX2-LABEL: elt6_v8f32:
-; X32AVX2: # %bb.0:
-; X32AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
-; X32AVX2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
-; X32AVX2-NEXT: retl
-;
 ; X64AVX2-LABEL: elt6_v8f32:
 ; X64AVX2: # %bb.0:
-; X64AVX2-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64AVX2-NEXT: vbroadcastss %xmm0, %ymm0
 ; X64AVX2-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
 ; X64AVX2-NEXT: retq
 ;
-; X32AVX512F-LABEL: elt6_v8f32:
-; X32AVX512F: # %bb.0:
-; X32AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32AVX512F-NEXT: vbroadcastsd %xmm0, %ymm0
-; X32AVX512F-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
-; X32AVX512F-NEXT: retl
-;
 ; X64AVX512F-LABEL: elt6_v8f32:
 ; X64AVX512F: # %bb.0:
-; X64AVX512F-NEXT: vbroadcastsd %xmm0, %ymm0
+; X64AVX512F-NEXT: vbroadcastss %xmm0, %ymm0
 ; X64AVX512F-NEXT: vblendps {{.*#+}} ymm0 = mem[0,1,2,3,4,5],ymm0[6],mem[7]
 ; X64AVX512F-NEXT: retq
 %ins = insertelement <8 x float> <float 42.0, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0>, float %x, i32 6
Index: test/CodeGen/X86/shuffle-of-splat-multiuses.ll
===================================================================
--- test/CodeGen/X86/shuffle-of-splat-multiuses.ll
+++ test/CodeGen/X86/shuffle-of-splat-multiuses.ll
@@ -94,10 +94,8 @@
 define <4 x i32> @undef_splatmask5(<4 x i32> %v, <4 x i32>* %p) nounwind {
 ; AVX2-LABEL: undef_splatmask5:
 ; 
AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastd %xmm0, %xmm1
-; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa %xmm0, (%rdi)
-; AVX2-NEXT: vmovdqa %xmm1, %xmm0
+; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
+; AVX2-NEXT: vmovaps %xmm0, (%rdi)
 ; AVX2-NEXT: retq
 %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> 
 %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> 
Index: test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -403,9 +403,9 @@
 ;
 ; AVX2OR512VL-LABEL: shuffle_v4i32_0142:
 ; AVX2OR512VL: # %bb.0:
-; AVX2OR512VL-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
+; AVX2OR512VL-NEXT: vbroadcastss %xmm1, %xmm1
+; AVX2OR512VL-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,2]
+; AVX2OR512VL-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
 ; AVX2OR512VL-NEXT: retq
 %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 2>
 ret <4 x i32> %shuffle
Index: test/CodeGen/X86/vector-shuffle-combining.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-combining.ll
+++ test/CodeGen/X86/vector-shuffle-combining.ll
@@ -802,7 +802,7 @@
 ;
 ; AVX2-LABEL: combine_nested_undef_test12:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
 ; AVX2-NEXT: retq
 %1 = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> 
 %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> 