diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -44278,6 +44278,21 @@
 
     for (unsigned i = 0; i != Scale; ++i)
       ShuffleMask.append(EltSizeInBits, i);
+    Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
+  } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
+             (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
+    // If we have register broadcast instructions, use the scalar size as the
+    // element type for the shuffle. Then cast to the wider element type. The
+    // widened bits won't be used, and this might allow the use of a broadcast
+    // load.
+    assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
+    unsigned Scale = EltSizeInBits / NumElts;
+    EVT BroadcastVT =
+        EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
+    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
+    ShuffleMask.append(NumElts * Scale, 0);
+    Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
+    Vec = DAG.getBitcast(VT, Vec);
   } else {
     // For smaller scalar integers, we can simply any-extend it to the vector
     // element size (we don't care about the upper bits) and broadcast it to all
@@ -44285,8 +44300,8 @@
     SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
     Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
     ShuffleMask.append(NumElts, 0);
+    Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
   }
-  Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
 
   // Now, mask the relevant bit in each element.
   SmallVector<SDValue, 32> Bits;
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -114,7 +114,7 @@
 ; AVX2-LABEL: ext_i8_8i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
@@ -273,7 +273,7 @@
 ; AVX2-LABEL: ext_i8_8i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
@@ -456,9 +456,8 @@
 ;
 ; AVX2-LABEL: ext_i8_8i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT:    vmovq %rdi, %xmm0
-; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm1
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,2,4,8]
 ; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpcmpeqq %ymm0, %ymm2, %ymm0
@@ -525,7 +524,7 @@
 ; AVX2-LABEL: ext_i16_16i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm1
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm2, %ymm0
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -141,7 +141,7 @@
 ; AVX2-LABEL: ext_i8_8i16:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
@@ -344,7 +344,7 @@
 ; AVX2-LABEL: ext_i8_8i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
@@ -583,9 +583,8 @@
 ;
 ; AVX2-LABEL: ext_i8_8i64:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    # kill: def $edi killed $edi def $rdi
-; AVX2-NEXT:    vmovq %rdi, %xmm0
-; AVX2-NEXT:    vpbroadcastq %xmm0, %ymm1
+; AVX2-NEXT:    vmovd %edi, %xmm0
+; AVX2-NEXT:    vpbroadcastb %xmm0, %ymm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,2,4,8]
 ; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpcmpeqq %ymm0, %ymm2, %ymm0
@@ -670,7 +669,7 @@
 ; AVX2-LABEL: ext_i16_16i32:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm1
+; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm1
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm0, %ymm1, %ymm2
 ; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm2, %ymm0
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll
@@ -115,7 +115,7 @@
 ; AVX2-LABEL: bitcast_i8_8i1:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vmovd %edi, %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -2039,9 +2039,7 @@
 ;
 ; AVX2-LABEL: load_sext_8i1_to_8i16:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    movzwl (%rdi), %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %xmm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %xmm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
@@ -2261,8 +2259,7 @@
 ;
 ; AVX2-LABEL: load_sext_8i1_to_8i32:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpbroadcastd %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastb (%rdi), %ymm0
 ; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vpcmpeqd %ymm1, %ymm0, %ymm0
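
Note: as a minimal IR reproducer for the new else-if path, modeled on the
ext_i8_8i16 test updated above (a sketch; the exact in-tree test body may
differ slightly):

  define <8 x i16> @ext_i8_8i16(i8 %a0) {
    %1 = bitcast i8 %a0 to <8 x i1>
    %2 = sext <8 x i1> %1 to <8 x i16>
    ret <8 x i16> %2
  }

Here VT = v8i16 and SclVT = i8, so NumElts = 8 and EltSizeInBits = 16. The
new code takes Scale = 16 / 8 = 2 and BroadcastVT = v16i8, broadcasts the
byte across all 16 lanes (a single vpbroadcastb), then bitcasts to v8i16.
The duplicate copy of the byte in the high half of each i16 lane is never
observed, because the subsequent vpand/vpcmpeqw sequence tests exactly one
bit per lane. Broadcasting at the scalar's own width also lets the broadcast
fold a load, as in the load_sext_8i1_to_8i16 change above
(vpbroadcastb (%rdi), %xmm0).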