Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7287,6 +7287,23 @@
   // Check if this is a broadcast of a scalar. We special case lowering
   // for scalars so that we can more effectively fold with loads.
+  // First, look through bitcast: if the original value has a larger element
+  // type than the shuffle, the broadcast element is in essence truncated.
+  // Make that explicit to ease folding.
+  if (V.getOpcode() == ISD::BITCAST && VT.isInteger()) {
+    EVT EltVT = VT.getVectorElementType();
+    SDValue V0 = V.getOperand(0);
+    EVT V0VT = V0.getValueType();
+
+    if (V0VT.isInteger() && V0VT.getVectorElementType().bitsGT(EltVT) &&
+        (V0.getOpcode() == ISD::BUILD_VECTOR ||
+         (V0.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
+      V = DAG.getNode(ISD::TRUNCATE, DL, EltVT, V0.getOperand(BroadcastIdx));
+      BroadcastIdx = 0;
+    }
+  }
+
+  // Also check the simpler case, where we can directly reuse the scalar.
   if (V.getOpcode() == ISD::BUILD_VECTOR ||
       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
     V = V.getOperand(BroadcastIdx);
Index: lib/Target/X86/X86InstrSSE.td
===================================================================
--- lib/Target/X86/X86InstrSSE.td
+++ lib/Target/X86/X86InstrSSE.td
@@ -8337,6 +8337,13 @@
   def : Pat<(v4f64 (X86VBroadcast (v2f64 VR128:$src))),
             (VBROADCASTSDYrr VR128:$src)>;
 
+  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
+  // This means we'll encounter truncated i32 loads; match that here.
+  def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+            (VPBROADCASTWrm addr:$src)>;
+  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
+            (VPBROADCASTWYrm addr:$src)>;
+
   // Provide aliases for broadcast from the same register class that
   // automatically does the extract.
   def : Pat<(v32i8 (X86VBroadcast (v32i8 VR256:$src))),
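For illustration, here is the 128-bit word case from the test below as a standalone reproducer. The first two instructions are taken verbatim from the test's context lines; the function signature and the bitcast/shufflevector tail are assumptions reconstructed from the CHECK lines. With the patch applied, the whole sequence should fold to a single broadcast load:

; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx2 < %s | FileCheck %s
; CHECK: vpbroadcastw (%rdi), %xmm0
define <8 x i16> @test_128_vpbroadcastw_i32(i32* %arg) {
  ; Broadcast the low 16 bits of a loaded i32. The insertelement/bitcast
  ; pair is where the new bitcast look-through kicks in: the lowering
  ; rewrites it as an explicit i16 truncate of the loaded value, which
  ; the new VPBROADCASTWrm pattern can then match.
  %tmp = load i32, i32* %arg, align 4
  %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
  %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16>
  %tmp3 = shufflevector <8 x i16> %tmp2, <8 x i16> undef, <8 x i32> zeroinitializer
  ret <8 x i16> %tmp3
}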
Index: test/CodeGen/X86/vector-shuffle-pbroadcast-load.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-pbroadcast-load.ll
+++ test/CodeGen/X86/vector-shuffle-pbroadcast-load.ll
@@ -67,8 +67,7 @@
 ;
 ; AVX2-LABEL: test_256_vpbroadcastw_i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
 ; AVX2-NEXT:    retq
   %tmp = load i32, i32* %arg, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -147,8 +146,7 @@
 ;
 ; AVX2-LABEL: test_256_vpbroadcastb_i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
 ; AVX2-NEXT:    retq
   %tmp = load i32, i32* %arg, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -189,9 +187,7 @@
 ;
 ; AVX2-LABEL: test_256_vpbroadcastb:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    movsbl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
 ; AVX2-NEXT:    retq
   %tmp = load i8, i8* %arg, align 1
   %tmp1 = sext i8 %tmp to i32
@@ -264,8 +260,7 @@
 ;
 ; AVX2-LABEL: test_128_vpbroadcastw_i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastw (%rdi), %xmm0
 ; AVX2-NEXT:    retq
   %tmp = load i32, i32* %arg, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -338,8 +333,7 @@
 ;
 ; AVX2-LABEL: test_128_vpbroadcastb_i32:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
 ; AVX2-NEXT:    retq
   %tmp = load i32, i32* %arg, align 4
   %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0
@@ -377,9 +371,7 @@
 ;
 ; AVX2-LABEL: test_128_vpbroadcastb:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    movsbl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
 ; AVX2-NEXT:    retq
   %tmp = load i8, i8* %arg, align 1
   %tmp1 = sext i8 %tmp to i32
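Note that the byte-broadcast improvements in test_256_vpbroadcastb and test_128_vpbroadcastb need no new .td pattern: truncating (sext i8 x to i32) back to i8 yields x itself, so once the lowering makes the truncate explicit, the broadcast presumably folds into the plain i8 load via the existing VPBROADCASTB load patterns. A standalone sketch of the 128-bit case (again, only the first two instructions appear verbatim in the context lines above; the rest is a reconstruction):

define <16 x i8> @test_128_vpbroadcastb(i8* %arg) {
  ; The sign extension is dead once the broadcast is known to keep only
  ; the low 8 bits of each element, so per the updated CHECK lines this
  ; should now compile to:
  ;   vpbroadcastb (%rdi), %xmm0
  %tmp = load i8, i8* %arg, align 1
  %tmp1 = sext i8 %tmp to i32
  %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0
  %tmp3 = bitcast <4 x i32> %tmp2 to <16 x i8>
  %tmp4 = shufflevector <16 x i8> %tmp3, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %tmp4
}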