Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -8308,6 +8308,11 @@
   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
 }
 
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+                                    SDValue PreservedSrc,
+                                    const X86Subtarget &Subtarget,
+                                    SelectionDAG &DAG);
+
 /// \brief Try to emit a blend instruction for a shuffle.
 ///
 /// This doesn't do any checks for the availability of instructions for blending
@@ -8466,7 +8471,17 @@
         VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
                         DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
   }
-
+  case MVT::v16f32:
+  case MVT::v8f64:
+  case MVT::v8i64:
+  case MVT::v16i32:
+  case MVT::v32i16:
+  case MVT::v64i8: {
+    MVT IntegerType =
+        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+    return getVectorMaskingNode(V1, MaskNode, V2, Subtarget, DAG);
+  }
   default:
     llvm_unreachable("Not a supported integer vector type!");
   }
@@ -12892,6 +12907,10 @@
                                              V2, DAG, Subtarget))
     return V;
 
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
 }
 
@@ -12926,6 +12945,10 @@
           lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
     return Unpck;
 
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
   // Otherwise, fall back to a SHUFPS sequence.
   return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
 }
@@ -12995,6 +13018,10 @@
                                              V2, DAG, Subtarget))
     return V;
 
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
 }
 
@@ -13062,7 +13089,10 @@
   if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
                                              V1, V2, DAG, Subtarget))
     return V;
-
+
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
 }
 
@@ -13109,6 +13139,10 @@
           DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
     }
   }
+
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
   return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
 }
 
@@ -13159,7 +13193,11 @@
   if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
     return V;
-
+
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
 }
Index: test/CodeGen/X86/merge-consecutive-loads-512.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -138,19 +138,17 @@
 define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovupd 8(%rdi), %zmm1
-; ALL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; ALL-NEXT:    vmovapd {{.*#+}} zmm0 = <0,u,2,u,4,13,u,7>
-; ALL-NEXT:    vpermi2pd %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    movb $32, %al
+; ALL-NEXT:    kmovw %eax, %k1
+; ALL-NEXT:    vmovupd 8(%rdi), %zmm0 {%k1} {z}
 ; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
 ; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovupd 8(%eax), %zmm1
-; X32-AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; X32-AVX512F-NEXT:    vmovapd {{.*#+}} zmm0 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
-; X32-AVX512F-NEXT:    vpermi2pd %zmm2, %zmm1, %zmm0
+; X32-AVX512F-NEXT:    movl 4(%esp), %eax
+; X32-AVX512F-NEXT:    movb $32, %cl
+; X32-AVX512F-NEXT:    kmovw %ecx, %k1
+; X32-AVX512F-NEXT:    vmovupd 8(%eax), %zmm0 {%k1} {z}
 ; X32-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
   %ptr2 = getelementptr inbounds double, double* %ptr, i64 3
@@ -225,19 +223,17 @@
 define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm1
-; ALL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; ALL-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,u,2,u,4,13,u,7>
-; ALL-NEXT:    vpermi2q %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    movb $32, %al
+; ALL-NEXT:    kmovw %eax, %k1
+; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
 ; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
 ; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm1
-; X32-AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; X32-AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
-; X32-AVX512F-NEXT:    vpermi2q %zmm2, %zmm1, %zmm0
+; X32-AVX512F-NEXT:    movl 4(%esp), %eax
+; X32-AVX512F-NEXT:    movb $32, %cl
+; X32-AVX512F-NEXT:    kmovw %ecx, %k1
+; X32-AVX512F-NEXT:    vmovdqu64 8(%eax), %zmm0 {%k1} {z}
 ; X32-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
   %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
@@ -448,19 +444,17 @@
 define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqu32 (%rdi), %zmm1
-; ALL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; ALL-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
+; ALL-NEXT:    movw $8240, %ax # imm = 0x2030
+; ALL-NEXT:    kmovw %eax, %k1
+; ALL-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
 ; ALL-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
 ; X32-AVX512F:       # BB#0:
-; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vmovdqu32 (%eax), %zmm1
-; X32-AVX512F-NEXT:    vpxord %zmm2, %zmm2, %zmm2
-; X32-AVX512F-NEXT:    vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; X32-AVX512F-NEXT:    vpermi2d %zmm2, %zmm1, %zmm0
+; X32-AVX512F-NEXT:    movl 4(%esp), %eax
+; X32-AVX512F-NEXT:    movw $8240, %cx # imm = 0x2030
+; X32-AVX512F-NEXT:    kmovw %ecx, %k1
+; X32-AVX512F-NEXT:    vmovdqu32 (%eax), %zmm0 {%k1} {z}
 ; X32-AVX512F-NEXT:    retl
   %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
   %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
Index: test/CodeGen/X86/sse3-avx-addsub.ll
===================================================================
--- test/CodeGen/X86/sse3-avx-addsub.ll
+++ test/CodeGen/X86/sse3-avx-addsub.ll
@@ -119,10 +119,11 @@
 ;
 ; AVX512-LABEL: test5:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm2
-; AVX512-NEXT:    vsubps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT:    vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm2[1,3],zmm0[4,6],zmm2[5,7],zmm0[8,10],zmm2[9,11],zmm0[12,14],zmm2[13,15]
-; AVX512-NEXT:    vpermilps {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
+; AVX512-NEXT:    vaddps %zmm1, %zmm0, %zmm2
+; AVX512-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; AVX512-NEXT:    kmovw %eax, %k1
+; AVX512-NEXT:    vsubps %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512-NEXT:    vmovaps %zmm2, %zmm0
 ; AVX512-NEXT:    retq
   %add = fadd <16 x float> %A, %B
   %sub = fsub <16 x float> %A, %B
Index: test/CodeGen/X86/vector-shuffle-512-v16.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -252,8 +252,10 @@
 define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a, <16 x i32> %b) {
 ; ALL-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqa32 {{.*#+}} zmm2 = <0,1,2,19,u,u,u,u,u,u,u,u,u,u,u,u>
-; ALL-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
+; ALL-NEXT:    movw $8, %ax
+; ALL-NEXT:    kmovw %eax, %k1
+; ALL-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
+; ALL-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; ALL-NEXT:    retq
   %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32>
   ret <16 x i32> %c
Index: test/CodeGen/X86/vector-shuffle-512-v32.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -110,10 +110,10 @@
 define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) {
 ; ALL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
 ; ALL:       # BB#0:
-; ALL-NEXT:    vmovdqu16 {{.*#+}} zmm2 = [32,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-; ALL-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT:    vpermt2w %zmm0, %zmm2, %zmm1
-; ALL-NEXT:    vmovdqa64 %zmm1, %zmm0
+; ALL-NEXT:    movl $1, %eax
+; ALL-NEXT:    kmovd %eax, %k0
+; ALL-NEXT:    knotd %k0, %k1
+; ALL-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
 ; ALL-NEXT:    retq
   %shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32>
   ret <32 x i16> %shuffle
Index: test/CodeGen/X86/vector-shuffle-512-v8.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1179,16 +1179,17 @@
 ;
 ; AVX512F-LABEL: shuffle_v8i64_81a3c5e7:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,9,2,11,4,13,6,15]
-; AVX512F-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT:    retq
+; AVX512F-NEXT:    movb $-86, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    retq
+;
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_81a3c5e7:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,2,0,11,0,4,0,13,0,6,0,15,0]
-; AVX512F-32-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
-; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    movb $-86, %al
+; AVX512F-32-NEXT:    kmovw %eax, %k1
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512F-32-NEXT:    retl
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32>
   ret <8 x i64> %shuffle
Index: test/CodeGen/X86/vector-shuffle-to-blend-avx512.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-to-blend-avx512.ll
+++ test/CodeGen/X86/vector-shuffle-to-blend-avx512.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx | FileCheck %s --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl | FileCheck %s --check-prefix=KNL
+
+define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){
+; SKX-LABEL: test_mm512_mask_blend_epi8:
+; SKX:       # BB#0: # %entry
+; SKX-NEXT:    movl $2863311530, %eax # imm = 0xAAAAAAAA
+; SKX-NEXT:    kmovq %rax, %k1
+; SKX-NEXT:    vmovdqu8 %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test_mm512_mask_blend_epi8:
+; KNL:       # BB#0: # %entry
+; KNL-NEXT:    vpbroadcastw {{.*}}(%rip), %ymm4
+; KNL-NEXT:    vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; KNL-NEXT:    retq
+entry:
+  %0 = shufflevector <64 x i8> %A, <64 x i8> %W, <64 x i32>
+  ret <64 x i8> %0
+}
+
+define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){
+; SKX-LABEL: test_mm512_mask_blend_epi16:
+; SKX:       # BB#0: # %entry
+; SKX-NEXT:    movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vmovdqu16 %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test_mm512_mask_blend_epi16:
+; KNL:       # BB#0: # %entry
+; KNL-NEXT:    vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
+; KNL-NEXT:    vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7],ymm3[8],ymm1[9],ymm3[10],ymm1[11],ymm3[12],ymm1[13],ymm3[14],ymm1[15]
+; KNL-NEXT:    retq
+entry:
+  %0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32>
+  ret <32 x i16> %0
+}
+
+define <16 x i32> @test_mm512_mask_blend_epi32(<16 x i32> %A, <16 x i32> %W){
+; SKX-LABEL: test_mm512_mask_blend_epi32:
+; SKX:       # BB#0: # %entry
+; SKX-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test_mm512_mask_blend_epi32:
+; KNL:       # BB#0: # %entry
+; KNL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; KNL-NEXT:    retq
+entry:
+  %0 = shufflevector <16 x i32> %A, <16 x i32> %W, <16 x i32>
+  ret <16 x i32> %0
+}
+
+define <8 x i64> @test_mm512_mask_blend_epi64(<8 x i64> %A, <8 x i64> %W){
+; SKX-LABEL: test_mm512_mask_blend_epi64:
+; SKX:       # BB#0: # %entry
+; SKX-NEXT:    movb $-86, %al
+; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test_mm512_mask_blend_epi64:
+; KNL:       # BB#0: # %entry
+; KNL-NEXT:    movb $-86, %al
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; KNL-NEXT:    retq
+entry:
+  %0 = shufflevector <8 x i64> %A, <8 x i64> %W, <8 x i32>
+  ret <8 x i64> %0
+}
+
+define <16 x float> @test_mm512_mask_blend_ps(<16 x float> %A, <16 x float> %W){
+; SKX-LABEL: test_mm512_mask_blend_ps:
+; SKX:       # BB#0: # %entry
+; SKX-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    vmovaps %zmm1, %zmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test_mm512_mask_blend_ps:
+; KNL:       # BB#0: # %entry
+; KNL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vmovaps %zmm1, %zmm0 {%k1}
+; KNL-NEXT:    retq
+entry:
+  %0 = shufflevector <16 x float> %A, <16 x float> %W, <16 x i32>
+  ret <16 x float> %0
+}
+
+define <8 x double> @test_mm512_mask_blend_pd(<8 x double> %A, <8 x double> %W){
+; SKX-LABEL: test_mm512_mask_blend_pd:
+; SKX:       # BB#0: # %entry
+; SKX-NEXT:    vshufpd {{.*#+}} zmm0 = zmm1[0],zmm0[1],zmm1[2],zmm0[3],zmm1[4],zmm0[5],zmm1[6],zmm0[7]
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test_mm512_mask_blend_pd:
+; KNL:       # BB#0: # %entry
+; KNL-NEXT:    vshufpd {{.*#+}} zmm0 = zmm1[0],zmm0[1],zmm1[2],zmm0[3],zmm1[4],zmm0[5],zmm1[6],zmm0[7]
+; KNL-NEXT:    retq
+entry:
+  %0 = shufflevector <8 x double> %A, <8 x double> %W, <8 x i32>
+  ret <8 x double> %0
+}