Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -5394,8 +5394,18 @@
     IsUnary = true;
     break;
   case X86ISD::VBROADCAST: {
-    // We only decode broadcasts of same-sized vectors at the moment.
-    if (N->getOperand(0).getValueType() == VT) {
+    SDValue N0 = N->getOperand(0);
+    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
+    // add the pre-extracted value to the Ops vector.
+    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        N0.getOperand(0).getValueType() == VT &&
+        N0.getConstantOperandVal(1) == 0)
+      Ops.push_back(N0.getOperand(0));
+
+    // We only decode broadcasts of same-sized vectors, unless the broadcast
+    // came from an extract from the original width. If we found one, we
+    // pushed it onto the Ops vector above.
+    if (N0.getValueType() == VT || !Ops.empty()) {
       DecodeVectorBroadcast(VT, Mask);
       IsUnary = true;
       break;
@@ -9729,6 +9739,12 @@
     BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
   }
 
+  // We only support broadcasting from 128-bit vectors to minimize the
+  // number of patterns we need to deal with in isel. So extract down to
+  // 128-bits.
+  if (SrcVT.getSizeInBits() > 128)
+    V = extract128BitVector(V, 0, DAG, DL);
+
   return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
 }
Index: llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
+++ llvm/trunk/test/CodeGen/X86/masked_gather_scatter.ll
@@ -714,8 +714,7 @@
 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
 ; KNL_64-LABEL: test14:
 ; KNL_64:       # BB#0:
-; KNL_64-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
-; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_64-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
 ; KNL_64-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; KNL_64-NEXT:    vmovd %esi, %xmm1
 ; KNL_64-NEXT:    vpbroadcastd %xmm1, %ymm1
@@ -731,8 +730,7 @@
 ;
 ; KNL_32-LABEL: test14:
 ; KNL_32:       # BB#0:
-; KNL_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
-; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; KNL_32-NEXT:    vpbroadcastd %xmm0, %zmm0
 ; KNL_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
 ; KNL_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
@@ -742,8 +740,7 @@
 ;
 ; SKX-LABEL: test14:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
-; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
 ; SKX-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; SKX-NEXT:    vpbroadcastd %esi, %ymm1
 ; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
@@ -758,8 +755,7 @@
 ;
 ; SKX_32-LABEL: test14:
 ; SKX_32:       # BB#0:
-; SKX_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
-; SKX_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; SKX_32-NEXT:    vpbroadcastd %xmm0, %zmm0
 ; SKX_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
 ; SKX_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -126,7 +126,6 @@
 ;
 ; KNL64-LABEL: expand3:
 ; KNL64:       # BB#0:
-; KNL64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; KNL64-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; KNL64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; KNL64-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
@@ -142,7 +141,6 @@
 ;
 ; KNL32-LABEL: expand3:
 ; KNL32:       # BB#0:
-; KNL32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; KNL32-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; KNL32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; KNL32-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
Index: llvm/trunk/test/CodeGen/X86/widened-broadcast.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/widened-broadcast.ll
+++ llvm/trunk/test/CodeGen/X86/widened-broadcast.ll
@@ -51,14 +51,12 @@
 ;
 ; AVX2-LABEL: load_splat_8f32_4f32_01010101:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_8f32_4f32_01010101:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <4 x float>, <4 x float>* %ptr
@@ -131,14 +129,12 @@
 ;
 ; AVX2-LABEL: load_splat_8i32_4i32_01010101:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_8i32_4i32_01010101:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <4 x i32>, <4 x i32>* %ptr
@@ -242,14 +238,12 @@
 ;
 ; AVX2-LABEL: load_splat_16i16_8i16_0101010101010101:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_16i16_8i16_0101010101010101:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
@@ -272,14 +266,12 @@
 ;
 ; AVX2-LABEL: load_splat_16i16_8i16_0123012301230123:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_16i16_8i16_0123012301230123:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
@@ -442,14 +434,12 @@
 ;
 ; AVX2-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX512-NEXT:    vpbroadcastw (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
@@ -472,14 +462,12 @@
 ;
 ; AVX2-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
@@ -502,14 +490,12 @@
 ;
 ; AVX2-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
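
For readers skimming the patch: the X86ISelLowering.cpp hunks encode two small observations. A VBROADCAST's shuffle mask is simply "element 0 repeated", and when the broadcast reads from an EXTRACT_SUBVECTOR at index 0, element 0 of the subvector is element 0 of the full-width source, so the same mask stays valid against the pre-extracted operand. The toy program below is a minimal standalone sketch of that decode step; ToyNode, decodeBroadcastMask, and every other name in it are invented for illustration and are not LLVM's SelectionDAG API.

// Toy model of the decode change, not LLVM code: a broadcast splats element 0
// of its source, so its shuffle mask is all zeros. When the source is an
// extract of the low subvector of a full-width value, element 0 is unchanged,
// so the same mask can be decoded against the wider, pre-extracted operand.
#include <cstdio>
#include <vector>

enum ToyOpcode { Broadcast, ExtractSubvector, Other };

struct ToyNode {
  ToyOpcode Opcode;
  const ToyNode *Operand;  // source vector, if any
  unsigned SubvectorIndex; // meaningful for ExtractSubvector only
  unsigned NumElts;        // element count of this node's result type
};

// Mirrors the patched logic: succeed for same-width sources, or for a source
// recovered by peeking through ExtractSubvector at index 0 of a full-width
// vector. On success, Mask holds the all-zeros splat mask and Src is the
// operand the mask applies to.
static bool decodeBroadcastMask(const ToyNode &N, unsigned DstNumElts,
                                std::vector<int> &Mask, const ToyNode *&Src) {
  if (N.Opcode != Broadcast)
    return false;
  Src = N.Operand;
  if (Src->Opcode == ExtractSubvector && Src->SubvectorIndex == 0 &&
      Src->Operand->NumElts == DstNumElts)
    Src = Src->Operand; // use the pre-extracted, full-width value
  if (Src->NumElts != DstNumElts)
    return false; // still only same-width sources are decodable
  Mask.assign(DstNumElts, 0); // splat of element 0
  return true;
}

int main() {
  ToyNode Wide = {Other, nullptr, 0, 8};         // full-width source
  ToyNode Sub = {ExtractSubvector, &Wide, 0, 4}; // its low half
  ToyNode Bcast = {Broadcast, &Sub, 0, 8};       // splat back to full width
  std::vector<int> Mask;
  const ToyNode *Src = nullptr;
  if (decodeBroadcastMask(Bcast, 8, Mask, Src))
    std::printf("decoded %zu-element splat mask against the wide source\n",
                Mask.size());
  return 0;
}

The second C++ hunk is the complementary lowering half: since any wider source can now feed a broadcast, extracting the low 128 bits first keeps isel to a single source width, which is what removes the vmovaps/vinserti32x4 round-trips visible in the updated tests.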