Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -10364,11 +10364,30 @@
 /// one of the inputs being zeroable.
 static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
                                      SDValue V2, ArrayRef<int> Mask,
-                                     const APInt &Zeroable, SelectionDAG &DAG) {
-  assert(!VT.isFloatingPoint() && "Floating point types are not supported");
+                                     const APInt &Zeroable,
+                                     const X86Subtarget &Subtarget,
+                                     SelectionDAG &DAG) {
+  MVT MaskVT = VT;
   MVT EltVT = VT.getVectorElementType();
-  SDValue Zero = DAG.getConstant(0, DL, EltVT);
-  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
+  SDValue Zero, AllOnes;
+  // Use f64 if i64 isn't legal.
+  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+    EltVT = MVT::f64;
+    MaskVT = MVT::getVectorVT(EltVT, Mask.size());
+  }
+
+  MVT LogicVT = VT;
+  if (EltVT == MVT::f32 || EltVT == MVT::f64) {
+    Zero = DAG.getConstantFP(0.0, DL, EltVT);
+    AllOnes = DAG.getConstantFP(
+        APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
+    LogicVT = MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32,
+                               Mask.size());
+  } else {
+    Zero = DAG.getConstant(0, DL, EltVT);
+    AllOnes = DAG.getAllOnesConstant(DL, EltVT);
+  }
+
   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
   SDValue V;
   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
@@ -10386,8 +10405,11 @@
   if (!V)
     return SDValue(); // No non-zeroable elements!
 
-  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
-  return DAG.getNode(ISD::AND, DL, VT, V, VMask);
+  SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps);
+  VMask = DAG.getBitcast(LogicVT, VMask);
+  V = DAG.getBitcast(LogicVT, V);
+  SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask);
+  return DAG.getBitcast(VT, And);
 }
 
 /// Try to emit a blend instruction for a shuffle using bit math.
@@ -10552,7 +10574,7 @@
 
   // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
   if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
-                                             DAG))
+                                             Subtarget, DAG))
     return Masked;
 
   if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
@@ -10610,6 +10632,13 @@
   case MVT::v16i32:
   case MVT::v32i16:
   case MVT::v64i8: {
+    // Attempt to lower to a bitmask if we can.
+    if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+                                               Subtarget, DAG))
+      return Masked;
+
+    // Otherwise load an immediate into a GPR, cast to k-register, and use a
+    // masked move.
     MVT IntegerType =
         MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
     SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
@@ -12766,7 +12795,7 @@
     return Blend;
 
   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
-                                             Zeroable, DAG))
+                                             Zeroable, Subtarget, DAG))
    return Masked;
 
   // Use dedicated unpack instructions for masks that match their pattern.
@@ -13467,7 +13496,7 @@
     return Blend;
 
   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
-                                             Zeroable, DAG))
+                                             Zeroable, Subtarget, DAG))
     return Masked;
 
   // Use dedicated unpack instructions for masks that match their pattern.
@@ -13735,7 +13764,7 @@
   }
 
   if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
-                                             Zeroable, DAG))
+                                             Zeroable, Subtarget, DAG))
     return Masked;
 
   // Use dedicated unpack instructions for masks that match their pattern.
@@ -15571,7 +15600,7 @@
   // No floating point type available, if we can't use the bit operations
   // for masking/blending then decompose into 128-bit vectors.
   if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
-                                        DAG))
+                                        Subtarget, DAG))
     return V;
   if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
     return V;
Index: test/CodeGen/X86/avx512-mask-op.ll
===================================================================
--- test/CodeGen/X86/avx512-mask-op.ll
+++ test/CodeGen/X86/avx512-mask-op.ll
@@ -1860,16 +1860,12 @@
 ;
 ; SKX-LABEL: test_build_vec_v32i1:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    movl $1497715861, %eax ## imm = 0x59455495
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    vandps {{.*}}(%rip), %zmm0, %zmm0
 ; SKX-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_build_vec_v32i1:
 ; AVX512BW:       ## %bb.0:
-; AVX512BW-NEXT:    movl $1497715861, %eax ## imm = 0x59455495
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_build_vec_v32i1:
@@ -1880,9 +1876,7 @@
 ;
 ; X86-LABEL: test_build_vec_v32i1:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl $1497715861, %eax ## imm = 0x59455495
-; X86-NEXT:    kmovd %eax, %k1
-; X86-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; X86-NEXT:    vandps LCPI40_0, %zmm0, %zmm0
 ; X86-NEXT:    retl
   %ret = select <32 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 0, i1 1, i1 0, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 1, i1 0>, <32 x i16> %x, <32 x i16> zeroinitializer
   ret <32 x i16> %ret
Index: test/CodeGen/X86/merge-consecutive-loads-512.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -128,29 +128,17 @@
 }
 
 define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
-; AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movb $32, %al
-; AVX512F-NEXT:    kmovw %eax, %k0
-; AVX512F-NEXT:    knotw %k0, %k1
-; AVX512F-NEXT:    vmovupd 8(%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: merge_8f64_f64_1u3u5zu8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    movb $32, %al
-; AVX512BW-NEXT:    kmovd %eax, %k0
-; AVX512BW-NEXT:    knotw %k0, %k1
-; AVX512BW-NEXT:    vmovupd 8(%rdi), %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
+; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0
+; ALL-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
+; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
 ; X32-AVX512F:       # %bb.0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    movb $32, %cl
-; X32-AVX512F-NEXT:    kmovw %ecx, %k0
-; X32-AVX512F-NEXT:    knotw %k0, %k1
-; X32-AVX512F-NEXT:    vmovupd 8(%eax), %zmm0 {%k1} {z}
+; X32-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0
+; X32-AVX512F-NEXT:    vpandq {{\.LCPI.*}}, %zmm0, %zmm0
 ; X32-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
   %ptr2 = getelementptr inbounds double, double* %ptr, i64 3
@@ -219,29 +207,17 @@
 }
 
 define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
-; AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movb $32, %al
-; AVX512F-NEXT:    kmovw %eax, %k0
-; AVX512F-NEXT:    knotw %k0, %k1
-; AVX512F-NEXT:    vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: merge_8i64_i64_1u3u5zu8:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    movb $32, %al
-; AVX512BW-NEXT:    kmovd %eax, %k0
-; AVX512BW-NEXT:    knotw %k0, %k1
-; AVX512BW-NEXT:    vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
+; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0
+; ALL-NEXT:    vpandq {{.*}}(%rip), %zmm0, %zmm0
+; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
 ; X32-AVX512F:       # %bb.0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    movb $32, %cl
-; X32-AVX512F-NEXT:    kmovw %ecx, %k0
-; X32-AVX512F-NEXT:    knotw %k0, %k1
-; X32-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0 {%k1} {z}
+; X32-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0
+; X32-AVX512F-NEXT:    vpandq {{\.LCPI.*}}, %zmm0, %zmm0
 ; X32-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
   %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
@@ -450,29 +426,17 @@
 }
 
 define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
-; AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
-; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movw $8240, %ax # imm = 0x2030
-; AVX512F-NEXT:    kmovw %eax, %k0
-; AVX512F-NEXT:    knotw %k0, %k1
-; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
-; AVX512F-NEXT:    retq
-;
-; AVX512BW-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
-; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    movw $8240, %ax # imm = 0x2030
-; AVX512BW-NEXT:    kmovd %eax, %k0
-; AVX512BW-NEXT:    knotw %k0, %k1
-; AVX512BW-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    retq
+; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
+; ALL:       # %bb.0:
+; ALL-NEXT:    vmovdqu64 (%rdi), %zmm0
+; ALL-NEXT:    vpandd {{.*}}(%rip), %zmm0, %zmm0
+; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
 ; X32-AVX512F:       # %bb.0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    movw $8240, %cx # imm = 0x2030
-; X32-AVX512F-NEXT:    kmovw %ecx, %k0
-; X32-AVX512F-NEXT:    knotw %k0, %k1
-; X32-AVX512F-NEXT:    vmovdqu32 (%eax), %zmm0 {%k1} {z}
+; X32-AVX512F-NEXT:    vmovdqu64 (%eax), %zmm0
+; X32-AVX512F-NEXT:    vpandd {{\.LCPI.*}}, %zmm0, %zmm0
 ; X32-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
   %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
Index: test/CodeGen/X86/vector-shuffle-512-v32.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -203,9 +203,9 @@
 ;
 ; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
 ; SKX:       ## %bb.0:
-; SKX-NEXT:    movl $1, %eax
-; SKX-NEXT:    kmovd %eax, %k1
-; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; SKX-NEXT:    movl $65535, %eax ## imm = 0xFFFF
+; SKX-NEXT:    vmovd %eax, %xmm1
+; SKX-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; SKX-NEXT:    retq
   %shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> 
   ret <32 x i16> %shuffle