Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -7122,9 +7122,9 @@
     }
   }
 
-  // We need a splat of a single value to use broadcast, and it doesn't
-  // make any sense if the value is only in one element of the vector.
-  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumUndefElts = UndefElements.count();
+  if (!Ld || (NumElts - NumUndefElts) <= 1) {
     APInt SplatValue, Undef;
     unsigned SplatBitSize;
     bool HasUndef;
@@ -7200,7 +7200,17 @@
         }
       }
     }
-    return SDValue();
+
+    // If we are moving a scalar into a vector (Ld must be set and all elements
+    // but 1 are undef) and that operation is not obviously supported by
+    // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
+    // That's better than general shuffling and may eliminate a load to GPR and
+    // move from scalar to vector register.
+    if (!Ld || NumElts - NumUndefElts != 1)
+      return SDValue();
+    unsigned ScalarSize = Ld.getValueSizeInBits();
+    if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
+      return SDValue();
   }
 
   bool ConstSplatVal =
Index: llvm/trunk/test/CodeGen/X86/avx-basic.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx-basic.ll
+++ llvm/trunk/test/CodeGen/X86/avx-basic.ll
@@ -76,8 +76,7 @@
 define <8 x i32> @VMOVZQI2PQI([0 x float]* nocapture %aFOO) nounwind {
 ; CHECK-LABEL: VMOVZQI2PQI:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; CHECK-NEXT:    vbroadcastss (%rdi), %ymm0
 ; CHECK-NEXT:    retq
   %ptrcast.i33.i = bitcast [0 x float]* %aFOO to i32*
   %val.i34.i = load i32, i32* %ptrcast.i33.i, align 4
Index: llvm/trunk/test/CodeGen/X86/insert-loaded-scalar.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/insert-loaded-scalar.ll
+++ llvm/trunk/test/CodeGen/X86/insert-loaded-scalar.ll
@@ -10,11 +10,16 @@
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load8_ins_elt0_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movzbl (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load8_ins_elt0_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzbl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load8_ins_elt0_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
+; AVX2-NEXT:    retq
   %x = load i8, i8* %p
   %ins = insertelement <16 x i8> undef, i8 %x, i32 0
   ret <16 x i8> %ins
@@ -27,11 +32,16 @@
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load16_ins_elt0_v8i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movzwl (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load16_ins_elt0_v8i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzwl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load16_ins_elt0_v8i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
+; AVX2-NEXT:    retq
   %x = load i16, i16* %p
   %ins = insertelement <8 x i16> undef, i16 %x, i32 0
   ret <8 x i16> %ins
@@ -105,12 +115,17 @@
 ; SSE-NEXT:    pslld $24, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load8_ins_eltc_v16i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movzbl (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    vpslld $24, %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load8_ins_eltc_v16i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzbl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load8_ins_eltc_v16i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
+; AVX2-NEXT:    retq
   %x = load i8, i8* %p
   %ins = insertelement <16 x i8> undef, i8 %x, i32 3
   ret <16 x i8> %ins
@@ -147,17 +162,10 @@
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load32_ins_eltc_v4i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load32_ins_eltc_v4i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: load32_ins_eltc_v4i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss (%rdi), %xmm0
+; AVX-NEXT:    retq
   %x = load i32, i32* %p
   %ins = insertelement <4 x i32> undef, i32 %x, i32 2
   ret <4 x i32> %ins
@@ -223,11 +231,16 @@
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load8_ins_elt0_v32i8:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movzbl (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load8_ins_elt0_v32i8:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzbl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load8_ins_elt0_v32i8:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
+; AVX2-NEXT:    retq
   %x = load i8, i8* %p
   %ins = insertelement <32 x i8> undef, i8 %x, i32 0
   ret <32 x i8> %ins
@@ -240,11 +253,16 @@
 ; SSE-NEXT:    movd %eax, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: load16_ins_elt0_v16i16:
-; AVX:       # %bb.0:
-; AVX-NEXT:    movzwl (%rdi), %eax
-; AVX-NEXT:    vmovd %eax, %xmm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load16_ins_elt0_v16i16:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    movzwl (%rdi), %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load16_ins_elt0_v16i16:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
+; AVX2-NEXT:    retq
   %x = load i16, i16* %p
   %ins = insertelement <16 x i16> undef, i16 %x, i32 0
   ret <16 x i16> %ins
@@ -328,10 +346,7 @@
 ;
 ; AVX2-LABEL: load8_ins_eltc_v32i8:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movzbl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpsllq $40, %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
 ; AVX2-NEXT:    retq
   %x = load i8, i8* %p
   %ins = insertelement <32 x i8> undef, i8 %x, i32 21
@@ -356,10 +371,7 @@
 ;
 ; AVX2-LABEL: load16_ins_eltc_v16i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movzwl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
 ; AVX2-NEXT:    retq
   %x = load i16, i16* %p
   %ins = insertelement <16 x i16> undef, i16 %x, i32 11
@@ -373,18 +385,10 @@
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,2,0]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load32_ins_eltc_v8i32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,2,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load32_ins_eltc_v8i32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastss (%rdi), %xmm0
-; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: load32_ins_eltc_v8i32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
+; AVX-NEXT:    retq
   %x = load i32, i32* %p
   %ins = insertelement <8 x i32> undef, i32 %x, i32 7
   ret <8 x i32> %ins
@@ -397,17 +401,10 @@
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load64_ins_eltc_v4i64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,1,0,1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load64_ins_eltc_v4i64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: load64_ins_eltc_v4i64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT:    retq
   %x = load i64, i64* %p
   %ins = insertelement <4 x i64> undef, i64 %x, i32 3
   ret <4 x i64> %ins
@@ -420,18 +417,10 @@
 ; SSE-NEXT:    movsldup {{.*#+}} xmm1 = xmm0[0,0,2,2]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load32_ins_eltc_v8f32:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load32_ins_eltc_v8f32:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastss (%rdi), %xmm0
-; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: load32_ins_eltc_v8f32:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastss (%rdi), %ymm0
+; AVX-NEXT:    retq
   %x = load float, float* %p
   %ins = insertelement <8 x float> undef, float %x, i32 5
   ret <8 x float> %ins
@@ -443,16 +432,10 @@
 ; SSE-NEXT:    movddup {{.*#+}} xmm1 = mem[0,0]
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: load64_ins_eltc_v4f64:
-; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = mem[0,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: load64_ins_eltc_v4f64:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
-; AVX2-NEXT:    retq
+; AVX-LABEL: load64_ins_eltc_v4f64:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vbroadcastsd (%rdi), %ymm0
+; AVX-NEXT:    retq
   %x = load double, double* %p
   %ins = insertelement <4 x double> undef, double %x, i32 3
   ret <4 x double> %ins
Index: llvm/trunk/test/CodeGen/X86/sse3-avx-addsub-2.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ llvm/trunk/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -274,13 +274,21 @@
 ; SSE-NEXT:    movddup {{.*#+}} xmm0 = xmm0[0,0]
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test11:
-; AVX:       # %bb.0:
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
-; AVX-NEXT:    retq
+; AVX1-LABEL: test11:
+; AVX1:       # %bb.0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; AVX1-NEXT:    retq
+;
+; AVX512-LABEL: test11:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
+; AVX512-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %1 = extractelement <4 x float> %A, i32 2
   %2 = extractelement <4 x float> %B, i32 2
   %sub = fsub float %1, %2
Index: llvm/trunk/test/CodeGen/X86/sse41.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse41.ll
+++ llvm/trunk/test/CodeGen/X86/sse41.ll
@@ -97,8 +97,9 @@
 ; X86-AVX512:       ## %bb.0: ## %entry
 ; X86-AVX512-NEXT:    movl L_g16$non_lazy_ptr, %eax ## encoding: [0xa1,A,A,A,A]
 ; X86-AVX512-NEXT:    ## fixup A - offset: 1, value: L_g16$non_lazy_ptr, kind: FK_Data_4
-; X86-AVX512-NEXT:    vpmovzxbq (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
-; X86-AVX512-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X86-AVX512-NEXT:    vpbroadcastw (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x00]
+; X86-AVX512-NEXT:    vpmovzxbq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0]
+; X86-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; X86-AVX512-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: pmovzxbq_1:
@@ -121,8 +122,9 @@
 ; X64-AVX512:       ## %bb.0: ## %entry
 ; X64-AVX512-NEXT:    movq _g16@{{.*}}(%rip), %rax ## encoding: [0x48,0x8b,0x05,A,A,A,A]
 ; X64-AVX512-NEXT:    ## fixup A - offset: 3, value: _g16@GOTPCREL-4, kind: reloc_riprel_4byte_movq_load
-; X64-AVX512-NEXT:    vpmovzxbq (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0x00]
-; X64-AVX512-NEXT:    ## xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
+; X64-AVX512-NEXT:    vpbroadcastw (%rax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0x00]
+; X64-AVX512-NEXT:    vpmovzxbq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x32,0xc0]
+; X64-AVX512-NEXT:    ## xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
 ; X64-AVX512-NEXT:    retq ## encoding: [0xc3]
 entry:
   %0 = load i16, i16* @g16, align 2 ; <i16> [#uses=1]
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -2619,8 +2619,9 @@
 ;
 ; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT:    vbroadcastss (%rdi), %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i32:
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -4597,8 +4597,9 @@
 define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(i32* %ptr) #0 {
 ; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i32:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT:    vbroadcastss (%rdi), %xmm0
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,3,2,3,4,5,6,7]
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
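
Note on the new lowering logic: the patch's fall-through in lowerBuildVectorAsBroadcast
reduces to a small predicate, sketched standalone below so the intent is easy to check.
The helper name and the plain-integer/bool parameters are illustrative only; in the
patch these checks are inline and operate on SDValue/BitVector.

    // Keep trying to form a broadcast for a build_vector that inserts one
    // loaded scalar into an otherwise-undef vector, unless the insert is
    // already cheap via vmovd/vmovq/vmovss/vmovsd (a 32- or 64-bit scalar
    // moved into element 0).
    bool shouldKeepTryingBroadcast(bool HasLoadedScalar, unsigned NumElts,
                                   unsigned NumUndefElts, bool Elt0IsUndef,
                                   unsigned ScalarSizeInBits) {
      // Exactly one element may be defined, and it must come from a load.
      if (!HasLoadedScalar || NumElts - NumUndefElts != 1)
        return false;
      // A 32/64-bit scalar going into element 0 already lowers to a single
      // vmovd/vmovq/vmovss/vmovsd, so bail out of the broadcast path there.
      return Elt0IsUndef || (ScalarSizeInBits != 32 && ScalarSizeInBits != 64);
    }

For example, load8_ins_eltc_v16i8 above inserts a loaded i8 into element 3: the scalar
is 8 bits wide, so the predicate holds and AVX2 emits a single
vpbroadcastb (%rdi), %xmm0 instead of a GPR load, vmovd, and shift. Element-0 inserts
of i8/i16 also flip to a broadcast (the scalar size is not 32 or 64), which is what
changes load8_ins_elt0_v16i8 and friends.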