Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -8307,6 +8307,11 @@
   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
 }
 
+static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
+                                    SDValue PreservedSrc,
+                                    const X86Subtarget &Subtarget,
+                                    SelectionDAG &DAG);
+
 /// \brief Try to emit a blend instruction for a shuffle.
 ///
 /// This doesn't do any checks for the availability of instructions for blending
@@ -8427,6 +8432,13 @@
   case MVT::v32i8: {
     assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
            "256-bit byte-blends require AVX2 support!");
+
+    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
+      MVT IntegerType =
+          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+      return getVectorMaskingNode(V1, MaskNode, V2, Subtarget, DAG);
+    }
 
     // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
     if (SDValue Masked =
@@ -8465,7 +8477,17 @@
         VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
                         DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
   }
-
+  case MVT::v16f32:
+  case MVT::v8f64:
+  case MVT::v8i64:
+  case MVT::v16i32:
+  case MVT::v32i16:
+  case MVT::v64i8: {
+    MVT IntegerType =
+        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
+    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
+    return getVectorMaskingNode(V1, MaskNode, V2, Subtarget, DAG);
+  }
   default:
     llvm_unreachable("Not a supported integer vector type!");
   }
@@ -12891,6 +12913,10 @@
                                              V2, DAG, Subtarget))
     return V;
 
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
 }
@@ -12925,6 +12951,10 @@
           lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
     return Unpck;
 
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
   // Otherwise, fall back to a SHUFPS sequence.
   return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
 }
@@ -12994,6 +13024,10 @@
                                              V2, DAG, Subtarget))
     return V;
 
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
 }
@@ -13062,6 +13096,9 @@
                                              V1, V2, DAG, Subtarget))
     return V;
 
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
   return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
 }
@@ -13109,6 +13146,10 @@
     }
   }
 
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
   return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
 }
@@ -13159,6 +13200,10 @@
           DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
     return V;
 
+  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
+                                                Zeroable, Subtarget, DAG))
+    return Blend;
+
   // FIXME: Implement direct support for this type!
   return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
 }
Index: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
@@ -540,8 +540,9 @@
 ; SKX-NEXT: vpmovm2b %k1, %zmm0
 ; SKX-NEXT: vpsllq $40, %xmm0, %xmm0
 ; SKX-NEXT: vpmovm2b %k0, %zmm1
-; SKX-NEXT: vmovdqu {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SKX-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; SKX-NEXT: movl $32, %eax
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1
 ; SKX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; SKX-NEXT: vpmovb2m %zmm0, %k0
@@ -607,8 +608,9 @@
 ; SKX-NEXT: vpmovm2b %k1, %zmm0
 ; SKX-NEXT: vpsllq $40, %xmm0, %xmm0
 ; SKX-NEXT: vpmovm2b %k0, %zmm1
-; SKX-NEXT: vmovdqu {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; SKX-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; SKX-NEXT: movl $32, %eax
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
 ; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1
 ; SKX-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; SKX-NEXT: vpmovb2m %zmm0, %k0
Index: llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-512.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ llvm/trunk/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -138,19 +138,17 @@
 define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
 ; ALL: # BB#0:
-; ALL-NEXT: vmovupd 8(%rdi), %zmm1
-; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; ALL-NEXT: vmovapd {{.*#+}} zmm0 = <0,u,2,u,4,13,u,7>
-; ALL-NEXT: vpermi2pd %zmm2, %zmm1, %zmm0
+; ALL-NEXT: movb $32, %al
+; ALL-NEXT: kmovw %eax, %k1
+; ALL-NEXT: vmovupd 8(%rdi), %zmm0 {%k1} {z}
 ; ALL-NEXT: retq
 ;
 ; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
 ; X32-AVX512F: # BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm1
-; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; X32-AVX512F-NEXT: vmovapd {{.*#+}} zmm0 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
-; X32-AVX512F-NEXT: vpermi2pd %zmm2, %zmm1, %zmm0
+; X32-AVX512F-NEXT: movl 4(%esp), %eax
+; X32-AVX512F-NEXT: movb $32, %cl
+; X32-AVX512F-NEXT: kmovw %ecx, %k1
+; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm0 {%k1} {z}
 ; X32-AVX512F-NEXT: retl
   %ptr0 = getelementptr inbounds double, double* %ptr, i64 1
   %ptr2 = getelementptr inbounds double, double* %ptr, i64 3
@@ -225,19 +223,17 @@
 define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
 ; ALL: # BB#0:
-; ALL-NEXT: vmovdqu64 8(%rdi), %zmm1
-; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; ALL-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,u,2,u,4,13,u,7>
-; ALL-NEXT: vpermi2q %zmm2, %zmm1, %zmm0
+; ALL-NEXT: movb $32, %al
+; ALL-NEXT: kmovw %eax, %k1
+; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
 ; ALL-NEXT: retq
 ;
 ; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
 ; X32-AVX512F: # BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm1
-; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; X32-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,0,u,u,2,0,u,u,4,0,13,0,u,u,7,0>
-; X32-AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm0
+; X32-AVX512F-NEXT: movl 4(%esp), %eax
+; X32-AVX512F-NEXT: movb $32, %cl
+; X32-AVX512F-NEXT: kmovw %ecx, %k1
+; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0 {%k1} {z}
 ; X32-AVX512F-NEXT: retl
   %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1
   %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3
@@ -448,19 +444,17 @@
 define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
 ; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
 ; ALL: # BB#0:
-; ALL-NEXT: vmovdqu32 (%rdi), %zmm1
-; ALL-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; ALL-NEXT: vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; ALL-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
+; ALL-NEXT: movw $8240, %ax # imm = 0x2030
+; ALL-NEXT: kmovw %eax, %k1
+; ALL-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z}
 ; ALL-NEXT: retq
 ;
 ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
 ; X32-AVX512F: # BB#0:
-; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm1
-; X32-AVX512F-NEXT: vpxord %zmm2, %zmm2, %zmm2
-; X32-AVX512F-NEXT: vmovdqa32 {{.*#+}} zmm0 = <0,u,u,3,20,21,u,u,u,u,u,u,12,29,14,15>
-; X32-AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
+; X32-AVX512F-NEXT: movl 4(%esp), %eax
+; X32-AVX512F-NEXT: movw $8240, %cx # imm = 0x2030
+; X32-AVX512F-NEXT: kmovw %ecx, %k1
+; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm0 {%k1} {z}
 ; X32-AVX512F-NEXT: retl
   %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0
   %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3
Index: llvm/trunk/test/CodeGen/X86/sse3-avx-addsub.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/sse3-avx-addsub.ll
+++ llvm/trunk/test/CodeGen/X86/sse3-avx-addsub.ll
@@ -119,10 +119,11 @@
 ;
 ; AVX512-LABEL: test5:
 ; AVX512: # BB#0:
-; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm2
-; AVX512-NEXT: vsubps %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vshufps {{.*#+}} zmm0 = zmm0[0,2],zmm2[1,3],zmm0[4,6],zmm2[5,7],zmm0[8,10],zmm2[9,11],zmm0[12,14],zmm2[13,15]
-; AVX512-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[0,2,1,3,4,6,5,7,8,10,9,11,12,14,13,15]
+; AVX512-NEXT: vaddps %zmm1, %zmm0, %zmm2
+; AVX512-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; AVX512-NEXT: kmovw %eax, %k1
+; AVX512-NEXT: vsubps %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512-NEXT: vmovaps %zmm2, %zmm0
 ; AVX512-NEXT: retq
   %add = fadd <16 x float> %A, %B
   %sub = fsub <16 x float> %A, %B
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -423,8 +423,10 @@
 ;
 ; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; AVX512VL-NEXT: kmovw %eax, %k1
+; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> 
   ret <16 x i8> %shuffle
@@ -462,8 +464,10 @@
 ;
 ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: movw $-30584, %ax # imm = 0x8888
+; AVX512VL-NEXT: kmovw %eax, %k1
+; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> 
   ret <16 x i8> %shuffle
@@ -520,8 +524,10 @@
 ;
 ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT: movw $-28528, %ax # imm = 0x9090
+; AVX512VL-NEXT: kmovw %eax, %k1
+; AVX512VL-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
+; AVX512VL-NEXT: vmovdqa %xmm1, %xmm0
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> 
   ret <16 x i8> %shuffle
@@ -560,8 +566,9 @@
 ;
 ; AVX512VL-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} xmm2 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
-; AVX512VL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: movw $-21264, %ax # imm = 0xACF0
+; AVX512VL-NEXT: kmovw %eax, %k1
+; AVX512VL-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> 
   ret <16 x i8> %shuffle
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -717,8 +717,10 @@
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movw $-32768, %ax # imm = 0x8000
+; AVX512VL-NEXT: kmovw %eax, %k1
+; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> 
   ret <16 x i16> %shuffle
@@ -741,8 +743,10 @@
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movw $1, %ax
+; AVX512VL-NEXT: kmovw %eax, %k1
+; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> 
   ret <16 x i16> %shuffle
@@ -765,8 +769,10 @@
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0,0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movw $21930, %ax # imm = 0x55AA
+; AVX512VL-NEXT: kmovw %eax, %k1
+; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> 
   ret <16 x i16> %shuffle
@@ -789,8 +795,10 @@
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,255,255,0,0,255,255,0,0,255,255,0,0,255,255,255,255,0,0,255,255,0,0,255,255,0,0,255,255,0,0]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movw $-21931, %ax # imm = 0xAA55
+; AVX512VL-NEXT: kmovw %eax, %k1
+; AVX512VL-NEXT: vmovdqu16 %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> 
   ret <16 x i16> %shuffle
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -317,11 +317,11 @@
 ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX512VL: # BB#0:
 ; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2
-; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512VL-NEXT: vpbroadcastb %xmm0, %xmm0
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX512VL-NEXT: movl $32767, %eax # imm = 0x7FFF
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> 
   ret <32 x i8> %shuffle
@@ -349,8 +349,9 @@
 ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_17_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX512VL: # BB#0:
 ; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: movl $1, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> 
@@ -379,8 +380,9 @@
 ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX512VL: # BB#0:
 ; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: movw $1, %ax
+; AVX512VL-NEXT: kmovw %eax, %k1
+; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> 
@@ -409,8 +411,9 @@
 ; AVX512VL-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00:
 ; AVX512VL: # BB#0:
 ; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = <0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: movw $1, %ax
+; AVX512VL-NEXT: kmovw %eax, %k1
+; AVX512VL-NEXT: vmovdqu16 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> 
@@ -1031,8 +1034,10 @@
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_00_33_02_35_04_37_06_39_08_41_10_43_12_45_14_47_16_49_18_51_20_53_22_55_24_57_26_59_28_61_30_63:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> 
   ret <32 x i8> %shuffle
@@ -1055,8 +1060,9 @@
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_32_01_34_03_36_05_38_07_40_09_42_11_44_13_46_15_48_17_50_19_52_21_54_23_56_25_58_27_60_29_62_31:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> 
   ret <32 x i8> %shuffle
@@ -1070,7 +1076,10 @@
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_zz_01_zz_03_zz_05_zz_07_zz_09_zz_11_zz_13_zz_15_zz_17_zz_19_zz_21_zz_23_zz_25_zz_27_zz_29_zz_31:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VL-NEXT: kmovd %eax, %k0
+; AVX512VL-NEXT: knotd %k0, %k1
+; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm0 {%k1} {z}
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> 
   ret <32 x i8> %shuffle
@@ -1140,8 +1149,10 @@
 ; AVX512VL-NEXT: vpshufb %ymm2, %ymm1, %ymm1
 ; AVX512VL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
 ; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovdqu8 %ymm0, %ymm1 {%k1}
+; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> 
   ret <32 x i8> %shuffle
@@ -1384,10 +1395,11 @@
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_00_32_01_33_02_34_03_35_04_36_05_37_06_38_07_39_24_56_25_57_26_58_27_59_28_60_29_61_30_62_31_63:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u]
 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31]
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[0,u,1,u,2,u,3,u,4,u,5,u,6,u,7,u,24,u,25,u,26,u,27,u,28,u,29,u,30,u,31,u]
+; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> 
   ret <32 x i8> %shuffle
@@ -1413,10 +1425,11 @@
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_08_40_09_41_10_42_11_43_12_44_13_45_14_46_15_47_16_48_17_49_18_50_19_51_20_52_21_53_22_54_23_55:
 ; AVX512VL: # BB#0:
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u]
 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23]
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 {%k1} = ymm0[8,u,9,u,10,u,11,u,12,u,13,u,14,u,15,u,16,u,17,u,18,u,19,u,20,u,21,u,22,u,23,u]
+; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> 
   ret <32 x i8> %shuffle
@@ -1670,15 +1683,16 @@
 ; AVX512VL: # BB#0:
 ; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,12,u,u,u,u,u,u,u,0,3,u,u,u,u,u,u,21,16,u,26,u,u,20,18,20,23]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm3 = <255,255,u,u,255,255,0,255,u,u,u,255,255,u,0,0,u,u,255,u,255,255,0,0,255,0,255,u,0,0,0,0>
-; AVX512VL-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1]
-; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
+; AVX512VL-NEXT: movl $-222248896, %eax # imm = 0xF2C0C040
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm2 {%k1} = ymm1[10,13,u,u,3,3,u,8,u,u,u,12,1,u,u,u,u,u,20,u,17,22,u,u,16,u,27,u,u,u,u,u]
+; AVX512VL-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
+; AVX512VL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,6,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,23,u,u,u,u]
 ; AVX512VL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,12,13,u,u,u,u,u,u,u,u,u,12,u,u,20,19,u,19,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4,5],ymm2[6],ymm0[7]
-; AVX512VL-NEXT: vmovdqu {{.*#+}} ymm2 = [255,255,0,0,255,255,255,255,0,0,0,255,255,0,255,255,0,0,255,0,255,255,255,255,255,255,255,0,255,255,255,255]
-; AVX512VL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5],ymm1[6],ymm0[7]
+; AVX512VL-NEXT: movl $134948620, %eax # imm = 0x80B270C
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovdqu8 %ymm2, %ymm0 {%k1}
 ; AVX512VL-NEXT: retq
   %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> 
   ret <32 x i8> %shuffle
@@ -1931,7 +1945,6 @@
 ; AVX2OR512VL: # BB#0:
 ; AVX2OR512VL-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
 ; AVX2OR512VL-NEXT: retq
-
   %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> 
   ret <32 x i8> %shuffle
 }
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -252,8 +252,10 @@
 define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a, <16 x i32> %b) {
 ; ALL-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:
 ; ALL: # BB#0:
-; ALL-NEXT: vmovdqa32 {{.*#+}} zmm2 = <0,1,2,19,u,u,u,u,u,u,u,u,u,u,u,u>
-; ALL-NEXT: vpermt2d %zmm1, %zmm2, %zmm0
+; ALL-NEXT: movw $8, %ax
+; ALL-NEXT: kmovw %eax, %k1
+; ALL-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1}
+; ALL-NEXT: vmovdqa64 %zmm1, %zmm0
 ; ALL-NEXT: retq
   %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> 
   ret <16 x i32> %c
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v32.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v32.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v32.ll
@@ -110,10 +110,10 @@
 define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) {
 ; ALL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz:
 ; ALL: # BB#0:
-; ALL-NEXT: vmovdqu16 {{.*#+}} zmm2 = [32,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-; ALL-NEXT: vpxord %zmm1, %zmm1, %zmm1
-; ALL-NEXT: vpermt2w %zmm0, %zmm2, %zmm1
-; ALL-NEXT: vmovdqa64 %zmm1, %zmm0
+; ALL-NEXT: movl $1, %eax
+; ALL-NEXT: kmovd %eax, %k0
+; ALL-NEXT: knotd %k0, %k1
+; ALL-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z}
 ; ALL-NEXT: retq
   %shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> 
   ret <32 x i16> %shuffle
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -1179,16 +1179,17 @@
 ;
 ; AVX512F-LABEL: shuffle_v8i64_81a3c5e7:
 ; AVX512F: # BB#0:
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,2,11,4,13,6,15]
-; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
-; AVX512F-NEXT: retq
+; AVX512F-NEXT: movb $-86, %al
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: retq
+;
 ;
 ; AVX512F-32-LABEL: shuffle_v8i64_81a3c5e7:
 ; AVX512F-32: # BB#0:
-; AVX512F-32-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,9,0,2,0,11,0,4,0,13,0,6,0,15,0]
-; AVX512F-32-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
-; AVX512F-32-NEXT: vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT: movb $-86, %al
+; AVX512F-32-NEXT: kmovw %eax, %k1
+; AVX512F-32-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
 ; AVX512F-32-NEXT: retl
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> 
   ret <8 x i64> %shuffle
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -130,7 +130,7 @@
 ; SKX-LABEL: expand6:
 ; SKX: # BB#0:
 ; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; SKX-NEXT: vinsertf{{.*}}$1, %xmm0, %ymm1, %ymm0
+; SKX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; SKX-NEXT: retq
 ;
 ; KNL-LABEL: expand6:
@@ -331,3 +331,157 @@
   %res = shufflevector <4 x float> %addV, <4 x float> %a, <8 x i32> 
   ret <8 x float> %res
 }
+
+
+; Shuffle to blend test
+
+define <64 x i8> @test_mm512_mask_blend_epi8(<64 x i8> %A, <64 x i8> %W){
+; SKX-LABEL: test_mm512_mask_blend_epi8:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: movl $2863311530, %eax # imm = 0xAAAAAAAA
+; SKX-NEXT: kmovq %rax, %k1
+; SKX-NEXT: vmovdqu8 %zmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test_mm512_mask_blend_epi8:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vpbroadcastw {{.*}}(%rip), %ymm4
+; KNL-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
+; KNL-NEXT: vpblendvb %ymm4, %ymm3, %ymm1, %ymm1
+; KNL-NEXT: retq
+entry:
+  %0 = shufflevector <64 x i8> %A, <64 x i8> %W, <64 x i32> 
+  ret <64 x i8> %0
+}
+
+define <32 x i16> @test_mm512_mask_blend_epi16(<32 x i16> %A, <32 x i16> %W){
+; SKX-LABEL: test_mm512_mask_blend_epi16:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test_mm512_mask_blend_epi16:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7],ymm2[8],ymm0[9],ymm2[10],ymm0[11],ymm2[12],ymm0[13],ymm2[14],ymm0[15]
+; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2],ymm1[3],ymm3[4],ymm1[5],ymm3[6],ymm1[7],ymm3[8],ymm1[9],ymm3[10],ymm1[11],ymm3[12],ymm1[13],ymm3[14],ymm1[15]
+; KNL-NEXT: retq
+entry:
+  %0 = shufflevector <32 x i16> %A, <32 x i16> %W, <32 x i32> 
+  ret <32 x i16> %0
+}
+
+define <16 x i32> @test_mm512_mask_blend_epi32(<16 x i32> %A, <16 x i32> %W){
+; SKX-LABEL: test_mm512_mask_blend_epi32:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test_mm512_mask_blend_epi32:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+  %0 = shufflevector <16 x i32> %A, <16 x i32> %W, <16 x i32> 
+  ret <16 x i32> %0
+}
+
+define <8 x i64> @test_mm512_mask_blend_epi64(<8 x i64> %A, <8 x i64> %W){
+; SKX-LABEL: test_mm512_mask_blend_epi64:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: movb $-86, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test_mm512_mask_blend_epi64:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: movb $-86, %al
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+  %0 = shufflevector <8 x i64> %A, <8 x i64> %W, <8 x i32> 
+  ret <8 x i64> %0
+}
+
+define <16 x float> @test_mm512_mask_blend_ps(<16 x float> %A, <16 x float> %W){
+; SKX-LABEL: test_mm512_mask_blend_ps:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test_mm512_mask_blend_ps:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vmovaps %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+  %0 = shufflevector <16 x float> %A, <16 x float> %W, <16 x i32> 
+  ret <16 x float> %0
+}
+
+define <8 x double> @test_mm512_mask_blend_pd(<8 x double> %A, <8 x double> %W){
+; SKX-LABEL: test_mm512_mask_blend_pd:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: movb $-88, %al
+; SKX-NEXT: kmovb %eax, %k1
+; SKX-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test_mm512_mask_blend_pd:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: movb $-88, %al
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: vmovapd %zmm1, %zmm0 {%k1}
+; KNL-NEXT: retq
+entry:
+  %0 = shufflevector <8 x double> %A, <8 x double> %W, <8 x i32> 
+  ret <8 x double> %0
+}
+
+
+define <32 x i8> @test_mm256_mask_blend_epi8(<32 x i8> %A, <32 x i8> %W){
+; SKX-LABEL: test_mm256_mask_blend_epi8:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: movl $-1431655766, %eax # imm = 0xAAAAAAAA
+; SKX-NEXT: kmovd %eax, %k1
+; SKX-NEXT: vmovdqu8 %ymm1, %ymm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test_mm256_mask_blend_epi8:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; KNL-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; KNL-NEXT: retq
+entry:
+  %0 = shufflevector <32 x i8> %A, <32 x i8> %W, <32 x i32> 
+  ret <32 x i8> %0
+}
+
+define <16 x i8> @test_mm_mask_blend_epi8(<16 x i8> %A, <16 x i8> %W){
+; SKX-LABEL: test_mm_mask_blend_epi8:
+; SKX: # BB#0: # %entry
+; SKX-NEXT: movw $-21846, %ax # imm = 0xAAAA
+; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
+; SKX-NEXT: retq
+;
+; KNL-LABEL: test_mm_mask_blend_epi8:
+; KNL: # BB#0: # %entry
+; KNL-NEXT: vmovdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; KNL-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
+; KNL-NEXT: retq
+entry:
+  %0 = shufflevector <16 x i8> %A, <16 x i8> %W, <16 x i32> 
+  ret <16 x i8> %0
+}
+
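
The patch can be summarized as follows: when AVX-512 mask registers are usable (BWI and VLX for the byte/word cases), a shuffle that picks each lane from one of two sources is lowered to a masked move (vmovdqu8/16/32/64, vmovaps/vmovapd) or a masked form of the producing instruction, with the blend pattern materialized as an immediate loaded into a k-register. Below is a minimal standalone sketch of the mask arithmetic involved; the helper names are hypothetical and not part of the commit.

#include <algorithm>
#include <cstdint>
#include <vector>

// Bit i of the immediate is set when result element i comes from the second
// source; this mirrors DAG.getConstant(BlendMask, DL, IntegerType) above.
uint64_t blendMaskImmediate(const std::vector<int> &ShuffleMask) {
  uint64_t Imm = 0;
  for (size_t i = 0, e = ShuffleMask.size(); i != e; ++i)
    if (ShuffleMask[i] >= (int)e) // indices e..2e-1 select the second source
      Imm |= uint64_t(1) << i;
  return Imm;
}

// The immediate's integer type is at least 8 bits wide, matching
// MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)).
unsigned maskTypeBits(unsigned NumElts) { return std::max(NumElts, 8u); }

For example, shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31 takes every odd lane from the second operand, so the immediate is 0xAAAA, which is exactly the movw $-21846 feeding the kmovw/vmovdqu8 pair in the updated checks.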