Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -7684,6 +7684,10 @@
   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
 }

+static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
+                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
+                           const SDLoc &dl);
+
 /// \brief Try to emit a blend instruction for a shuffle.
 ///
 /// This doesn't do any checks for the availability of instructions for blending
@@ -7842,7 +7846,20 @@
         VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
                         DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
   }
-
+  case MVT::v16f32:
+  case MVT::v8i64:
+  case MVT::v16i32:
+  case MVT::v32i16:
+  case MVT::v64i8: {
+    MVT IntegerType =
+        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+    unsigned NumElts = VT.getVectorNumElements();
+    SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
+                                Subtarget, DAG, DL);
+    SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
+    return DAG.getNode(ISD::VSELECT, DL, VT, VMask, V1, V2);
+  }
   default:
     llvm_unreachable("Not a supported integer vector type!");
   }
@@ -12206,6 +12223,7 @@

 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
 static SDValue lowerV16F32VectorShuffle(SDLoc DL, ArrayRef<int> Mask,
+                                        const SmallBitVector &Zeroable,
                                         SDValue V1, SDValue V2,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
@@ -12234,6 +12252,10 @@
           lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
     return Unpck;

+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
   // Otherwise, fall back to a SHUFPS sequence.
   return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
 }
@@ -12295,6 +12317,10 @@
           lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
     return Unpck;

+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
 }

@@ -12341,6 +12367,10 @@
                                                   Mask, Subtarget, DAG))
       return Rotate;

+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
   // Try to use byte rotation instructions.
   if (Subtarget.hasBWI())
     if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
@@ -12394,6 +12424,11 @@
     }
   }

+  if (Subtarget.hasBWI())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
+                                                  Zeroable, Subtarget, DAG))
+      return Blend;
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
 }

@@ -12434,6 +12469,11 @@
           DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
     return PSHUFB;

+  if (Subtarget.hasBWI())
+    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
+                                                  Zeroable, Subtarget, DAG))
+      return Blend;
+
   // VBMI can use VPERMV/VPERMV3 byte shuffles.
   if (Subtarget.hasVBMI())
     return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
@@ -12478,7 +12518,7 @@
   case MVT::v8f64:
     return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
   case MVT::v16f32:
-    return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
+    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v8i64:
     return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
   case MVT::v16i32:
Index: test/CodeGen/X86/merge-consecutive-loads-512.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -225,19 +225,17 @@
 define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm1
-; ALL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; ALL-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,u,2,u,4,13,u,7>
-; ALL-NEXT:    vpermi2q %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    movb $32, %al
+; ALL-NEXT:    kmovw %eax, %k1
+; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
 ; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
 ; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm1
-; X32-AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; X32-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
-; X32-AVX512F-NEXT:    vpermi2q %zmm2, %zmm1, %zmm0
+; X32-AVX512F-NEXT:    movl 4(%esp), %eax
+; X32-AVX512F-NEXT:    movb $32, %cl
+; X32-AVX512F-NEXT:    kmovw %ecx, %k1
+; X32-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0 {%k1} {z}
 ; X32-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
   %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
@@ -448,19 +446,17 @@
 define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqu32 (%rdi), %zmm1
-; ALL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; ALL-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    movw $8240, %ax # imm = 0x2030
+; ALL-NEXT:    kmovw %eax, %k1
+; ALL-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
 ; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
 ; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovdqu32 (%eax), %zmm1
-; X32-AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; X32-AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; X32-AVX512F-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
+; X32-AVX512F-NEXT:    movl 4(%esp), %eax
+; X32-AVX512F-NEXT:    movw $8240, %cx # imm = 0x2030
+; X32-AVX512F-NEXT:    kmovw %ecx, %k1
+; X32-AVX512F-NEXT:    vmovdqu32 (%eax), %zmm0 {%k1} {z}
 ; X32-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
   %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
Index: test/CodeGen/X86/sse3-avx-addsub.ll
===================================================================
--- test/CodeGen/X86/sse3-avx-addsub.ll
+++ test/CodeGen/X86/sse3-avx-addsub.ll
@@ -119,10 +119,11 @@
 ;
 ; AVX512-LABEL: test5:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm2
-; AVX512-NEXT:    vsubps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm2[1,3],zmm0[4,6],zmm2[5,7],zmm0[8,10],zmm2[9,11],zmm0[12,14],zmm2[13,15]
-; AVX512-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
+; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vsubps %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512-NEXT:    vmovaps %zmm2, %zmm0
 ; AVX512-NEXT:    retq
   %add = fadd <16 x float> %A, %B
   %sub = fsub <16 x float> %A, %B
Index: test/CodeGen/X86/vector-shuffle-512-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -252,8 +252,9 @@
 define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a, <16 x i32> %b) {
 ; ALL-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm2 = <0,1,2,19,u,u,u,u,u,u,u,u,u,u,u,u>
-; ALL-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    movw $8, %ax
+; ALL-NEXT:    kmovw %eax, %k1
+; ALL-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; ALL-NEXT:    retq
   %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <16 x i32> %c
Index: test/CodeGen/X86/vector-shuffle-512-v32.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -110,10 +110,10 @@
 define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) {
 ; ALL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqu16 {{.*#+}} zmm2 = [32,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
 ; ALL-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT:    vpermt2w %zmm0, %zmm2, %zmm1
-; ALL-NEXT:    vmovdqa64 %zmm1, %zmm0
+; ALL-NEXT:    movl $1, %eax
+; ALL-NEXT:    kmovd %eax, %k1
+; ALL-NEXT:    vpblendmw %zmm1, %zmm0, %zmm0 {%k1}
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> <i32 0, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
   ret <32 x i16> %shuffle
Index: test/CodeGen/X86/vector-shuffle-512-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1179,16 +1179,16 @@
 ;
 ; AVX512F-LABEL: shuffle_v8i64_81a3c5e7:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,9,2,11,4,13,6,15]
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    retq
+; AVX512F-NEXT:    movb $-86, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
+; AVX512F-NEXT:    retq
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_81a3c5e7:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,2,0,11,0,4,0,13,0,6,0,15,0]
-; AVX512F-32-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    movb $-86, %al
+; AVX512F-32-NEXT:    kmovw %eax, %k1
+; AVX512F-32-NEXT:    vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
   ret <8 x i64> %shuffle
Index: test/CodeGen/X86/vector-shuffle-to-blend-avx512.ll.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-to-blend-avx512.ll.ll
+++ test/CodeGen/X86/vector-shuffle-to-blend-avx512.ll.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=KNL
+
+define <16 x i32> @test_mm512_mask_blend_epi32(<16 x i32> %A, <16 x i32> %W){
+; SKX-LABEL: test_mm512_mask_blend_epi32:
+; SKX:       # BB#0: # %entry
+; SKX-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    vpblendmd %zmm1, %zmm0, %zmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test_mm512_mask_blend_epi32:
+; KNL:       # BB#0: # %entry
+; KNL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpblendmd %zmm1, %zmm0, %zmm0 {%k1}
+; KNL-NEXT:    retq
+entry:
+  %0 = shufflevector <16 x i32> %A, <16 x i32> %W, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  ret <16 x i32> %0
+}
+
+define <8 x i64> @test_mm512_mask_blend_epi64(<8 x i64> %A, <8 x i64> %W){
+; SKX-LABEL: test_mm512_mask_blend_epi64:
+; SKX:       # BB#0: # %entry
+; SKX-NEXT:    movb $-86, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test_mm512_mask_blend_epi64:
+; KNL:       # BB#0: # %entry
+; KNL-NEXT:    movb $-86, %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vpblendmq %zmm1, %zmm0, %zmm0 {%k1}
+; KNL-NEXT:    retq
+entry:
+  %0 = shufflevector <8 x i64> %A, <8 x i64> %W, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  ret <8 x i64> %0
+}
+
+define <16 x float> @test_mm512_mask_blend_ps(<16 x float> %A, <16 x float> %W){
+; SKX-LABEL: test_mm512_mask_blend_ps:
+; SKX:       # BB#0: # %entry
+; SKX-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    vblendmps %zmm1, %zmm0, %zmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test_mm512_mask_blend_ps:
+; KNL:       # BB#0: # %entry
+; KNL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vblendmps %zmm1, %zmm0, %zmm0 {%k1}
+; KNL-NEXT:    retq
+entry:
+  %0 = shufflevector <16 x float> %A, <16 x float> %W, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  ret <16 x float> %0
+}