Index: llvm/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelLowering.cpp +++ llvm/lib/Target/X86/X86ISelLowering.cpp @@ -10364,11 +10364,30 @@ /// one of the inputs being zeroable. static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef Mask, - const APInt &Zeroable, SelectionDAG &DAG) { - assert(!VT.isFloatingPoint() && "Floating point types are not supported"); + const APInt &Zeroable, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { + MVT MaskVT = VT; MVT EltVT = VT.getVectorElementType(); - SDValue Zero = DAG.getConstant(0, DL, EltVT); - SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); + SDValue Zero, AllOnes; + // Use f64 if i64 isn't legal. + if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { + EltVT = MVT::f64; + MaskVT = MVT::getVectorVT(EltVT, Mask.size()); + } + + MVT LogicVT = VT; + if (EltVT == MVT::f32 || EltVT == MVT::f64) { + Zero = DAG.getConstantFP(0.0, DL, EltVT); + AllOnes = DAG.getConstantFP(APInt::getAllOnesValue(64).bitsToDouble(), DL, + EltVT); + LogicVT = MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, + Mask.size()); + } else { + Zero = DAG.getConstant(0, DL, EltVT); + AllOnes = DAG.getAllOnesConstant(DL, EltVT); + } + SmallVector VMaskOps(Mask.size(), Zero); SDValue V; for (int i = 0, Size = Mask.size(); i < Size; ++i) { @@ -10386,8 +10405,11 @@ if (!V) return SDValue(); // No non-zeroable elements! - SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps); - return DAG.getNode(ISD::AND, DL, VT, V, VMask); + SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps); + VMask = DAG.getBitcast(LogicVT, VMask); + V = DAG.getBitcast(LogicVT, V); + SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask); + return DAG.getBitcast(VT, And); } /// Try to emit a blend instruction for a shuffle using bit math. @@ -10552,7 +10574,7 @@ // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. 
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, - DAG)) + Subtarget, DAG)) return Masked; if (Subtarget.hasBWI() && Subtarget.hasVLX()) { @@ -10610,6 +10632,16 @@ case MVT::v16i32: case MVT::v32i16: case MVT::v64i8: { + // Attempt to lower to a bitmask if we can. Only if not optimizing for size. + bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); + if (!OptForSize) { + if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, + Subtarget, DAG)) + return Masked; + } + + // Otherwise load an immediate into a GPR, cast to k-register, and use a + // masked move. MVT IntegerType = MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8)); SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); @@ -12766,7 +12798,7 @@ return Blend; if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, - Zeroable, DAG)) + Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. @@ -13467,7 +13499,7 @@ return Blend; if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask, - Zeroable, DAG)) + Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. @@ -13735,7 +13767,7 @@ } if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask, - Zeroable, DAG)) + Zeroable, Subtarget, DAG)) return Masked; // Use dedicated unpack instructions for masks that match their pattern. @@ -15571,7 +15603,7 @@ // No floating point type available, if we can't use the bit operations // for masking/blending then decompose into 128-bit vectors. 
if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, - DAG)) + Subtarget, DAG)) return V; if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) return V; Index: llvm/test/CodeGen/X86/avx512-mask-op.ll =================================================================== --- llvm/test/CodeGen/X86/avx512-mask-op.ll +++ llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1860,25 +1860,56 @@ ; ; SKX-LABEL: test_build_vec_v32i1: ; SKX: ## %bb.0: +; SKX-NEXT: vandps {{.*}}(%rip), %zmm0, %zmm0 +; SKX-NEXT: retq +; +; AVX512BW-LABEL: test_build_vec_v32i1: +; AVX512BW: ## %bb.0: +; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: test_build_vec_v32i1: +; AVX512DQ: ## %bb.0: +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; AVX512DQ-NEXT: retq +; +; X86-LABEL: test_build_vec_v32i1: +; X86: ## %bb.0: +; X86-NEXT: vandps LCPI40_0, %zmm0, %zmm0 +; X86-NEXT: retl + %ret = select <32 x i1> , <32 x i16> %x, <32 x i16> zeroinitializer + ret <32 x i16> %ret +} + +define <32 x i16> @test_build_vec_v32i1_optsize(<32 x i16> %x) optsize { +; KNL-LABEL: test_build_vec_v32i1_optsize: +; KNL: ## %bb.0: +; KNL-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; KNL-NEXT: vandps {{.*}}(%rip), %ymm1, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: test_build_vec_v32i1_optsize: +; SKX: ## %bb.0: ; SKX-NEXT: movl $1497715861, %eax ## imm = 0x59455495 ; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} ; SKX-NEXT: retq ; -; AVX512BW-LABEL: test_build_vec_v32i1: +; AVX512BW-LABEL: test_build_vec_v32i1_optsize: ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: movl $1497715861, %eax ## imm = 0x59455495 ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} ; AVX512BW-NEXT: retq ; -; AVX512DQ-LABEL: test_build_vec_v32i1: +; AVX512DQ-LABEL: test_build_vec_v32i1_optsize: ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 ; AVX512DQ-NEXT: 
vandps {{.*}}(%rip), %ymm1, %ymm1 ; AVX512DQ-NEXT: retq ; -; X86-LABEL: test_build_vec_v32i1: +; X86-LABEL: test_build_vec_v32i1_optsize: ; X86: ## %bb.0: ; X86-NEXT: movl $1497715861, %eax ## imm = 0x59455495 ; X86-NEXT: kmovd %eax, %k1 @@ -1928,12 +1959,12 @@ ; KNL-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB42_2 +; KNL-NEXT: je LBB43_2 ; KNL-NEXT: ## %bb.1: ## %L1 ; KNL-NEXT: vmovapd %zmm0, (%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB42_2: ## %L2 +; KNL-NEXT: LBB43_2: ## %L2 ; KNL-NEXT: vmovapd %zmm0, 8(%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq @@ -1945,12 +1976,12 @@ ; SKX-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} ; SKX-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} ; SKX-NEXT: kortestb %k0, %k0 -; SKX-NEXT: je LBB42_2 +; SKX-NEXT: je LBB43_2 ; SKX-NEXT: ## %bb.1: ## %L1 ; SKX-NEXT: vmovapd %zmm0, (%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB42_2: ## %L2 +; SKX-NEXT: LBB43_2: ## %L2 ; SKX-NEXT: vmovapd %zmm0, 8(%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -1963,12 +1994,12 @@ ; AVX512BW-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al -; AVX512BW-NEXT: je LBB42_2 +; AVX512BW-NEXT: je LBB43_2 ; AVX512BW-NEXT: ## %bb.1: ## %L1 ; AVX512BW-NEXT: vmovapd %zmm0, (%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB42_2: ## %L2 +; AVX512BW-NEXT: LBB43_2: ## %L2 ; AVX512BW-NEXT: vmovapd %zmm0, 8(%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq @@ -1980,12 +2011,12 @@ ; AVX512DQ-NEXT: vmovupd 8(%rdi), %zmm1 {%k1} {z} ; AVX512DQ-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} ; AVX512DQ-NEXT: kortestb %k0, %k0 -; AVX512DQ-NEXT: je LBB42_2 +; AVX512DQ-NEXT: je LBB43_2 ; AVX512DQ-NEXT: ## %bb.1: ## %L1 ; AVX512DQ-NEXT: vmovapd %zmm0, (%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB42_2: ## %L2 +; AVX512DQ-NEXT: LBB43_2: ## %L2 ; AVX512DQ-NEXT: vmovapd %zmm0, 8(%rdi) 
; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq @@ -1998,12 +2029,12 @@ ; X86-NEXT: vmovupd 8(%eax), %zmm1 {%k1} {z} ; X86-NEXT: vcmpltpd %zmm1, %zmm0, %k0 {%k1} ; X86-NEXT: kortestb %k0, %k0 -; X86-NEXT: je LBB42_2 +; X86-NEXT: je LBB43_2 ; X86-NEXT: ## %bb.1: ## %L1 ; X86-NEXT: vmovapd %zmm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB42_2: ## %L2 +; X86-NEXT: LBB43_2: ## %L2 ; X86-NEXT: vmovapd %zmm0, 8(%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -2052,13 +2083,13 @@ ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: orl %eax, %ecx -; KNL-NEXT: je LBB43_2 +; KNL-NEXT: je LBB44_2 ; KNL-NEXT: ## %bb.1: ## %L1 ; KNL-NEXT: vmovaps %zmm0, (%rdi) ; KNL-NEXT: vmovaps %zmm1, 64(%rdi) ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB43_2: ## %L2 +; KNL-NEXT: LBB44_2: ## %L2 ; KNL-NEXT: vmovaps %zmm0, 4(%rdi) ; KNL-NEXT: vmovaps %zmm1, 68(%rdi) ; KNL-NEXT: vzeroupper @@ -2077,13 +2108,13 @@ ; SKX-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; SKX-NEXT: kunpckwd %k1, %k2, %k1 ; SKX-NEXT: kortestd %k1, %k0 -; SKX-NEXT: je LBB43_2 +; SKX-NEXT: je LBB44_2 ; SKX-NEXT: ## %bb.1: ## %L1 ; SKX-NEXT: vmovaps %zmm0, (%rdi) ; SKX-NEXT: vmovaps %zmm1, 64(%rdi) ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB43_2: ## %L2 +; SKX-NEXT: LBB44_2: ## %L2 ; SKX-NEXT: vmovaps %zmm0, 4(%rdi) ; SKX-NEXT: vmovaps %zmm1, 68(%rdi) ; SKX-NEXT: vzeroupper @@ -2102,13 +2133,13 @@ ; AVX512BW-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; AVX512BW-NEXT: kunpckwd %k1, %k2, %k1 ; AVX512BW-NEXT: kortestd %k1, %k0 -; AVX512BW-NEXT: je LBB43_2 +; AVX512BW-NEXT: je LBB44_2 ; AVX512BW-NEXT: ## %bb.1: ## %L1 ; AVX512BW-NEXT: vmovaps %zmm0, (%rdi) ; AVX512BW-NEXT: vmovaps %zmm1, 64(%rdi) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB43_2: ## %L2 +; AVX512BW-NEXT: LBB44_2: ## %L2 ; AVX512BW-NEXT: vmovaps %zmm0, 4(%rdi) ; AVX512BW-NEXT: vmovaps %zmm1, 68(%rdi) ; AVX512BW-NEXT: vzeroupper @@ -2130,13 +2161,13 @@ ; AVX512DQ-NEXT: kmovw %k0, %ecx ; 
AVX512DQ-NEXT: shll $16, %ecx ; AVX512DQ-NEXT: orl %eax, %ecx -; AVX512DQ-NEXT: je LBB43_2 +; AVX512DQ-NEXT: je LBB44_2 ; AVX512DQ-NEXT: ## %bb.1: ## %L1 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rdi) ; AVX512DQ-NEXT: vmovaps %zmm1, 64(%rdi) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB43_2: ## %L2 +; AVX512DQ-NEXT: LBB44_2: ## %L2 ; AVX512DQ-NEXT: vmovaps %zmm0, 4(%rdi) ; AVX512DQ-NEXT: vmovaps %zmm1, 68(%rdi) ; AVX512DQ-NEXT: vzeroupper @@ -2156,13 +2187,13 @@ ; X86-NEXT: vcmpltps %zmm2, %zmm1, %k2 ; X86-NEXT: kunpckwd %k1, %k2, %k1 ; X86-NEXT: kortestd %k1, %k0 -; X86-NEXT: je LBB43_2 +; X86-NEXT: je LBB44_2 ; X86-NEXT: ## %bb.1: ## %L1 ; X86-NEXT: vmovaps %zmm0, (%eax) ; X86-NEXT: vmovaps %zmm1, 64(%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB43_2: ## %L2 +; X86-NEXT: LBB44_2: ## %L2 ; X86-NEXT: vmovaps %zmm0, 4(%eax) ; X86-NEXT: vmovaps %zmm1, 68(%eax) ; X86-NEXT: vzeroupper @@ -3175,12 +3206,12 @@ ; KNL-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testw %ax, %ax -; KNL-NEXT: jle LBB64_1 +; KNL-NEXT: jle LBB65_1 ; KNL-NEXT: ## %bb.2: ## %bb.2 ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB64_1: ## %bb.1 +; KNL-NEXT: LBB65_1: ## %bb.1 ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -3194,12 +3225,12 @@ ; SKX-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: testw %ax, %ax -; SKX-NEXT: jle LBB64_1 +; SKX-NEXT: jle LBB65_1 ; SKX-NEXT: ## %bb.2: ## %bb.2 ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB64_1: ## %bb.1 +; SKX-NEXT: LBB65_1: ## %bb.1 ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -3213,12 +3244,12 @@ ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testw %ax, %ax -; AVX512BW-NEXT: jle LBB64_1 +; AVX512BW-NEXT: jle LBB65_1 ; AVX512BW-NEXT: ## %bb.2: ## %bb.2 ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; 
AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB64_1: ## %bb.1 +; AVX512BW-NEXT: LBB65_1: ## %bb.1 ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -3232,12 +3263,12 @@ ; AVX512DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; AVX512DQ-NEXT: kmovw %k0, %eax ; AVX512DQ-NEXT: testw %ax, %ax -; AVX512DQ-NEXT: jle LBB64_1 +; AVX512DQ-NEXT: jle LBB65_1 ; AVX512DQ-NEXT: ## %bb.2: ## %bb.2 ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB64_1: ## %bb.1 +; AVX512DQ-NEXT: LBB65_1: ## %bb.1 ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -3251,12 +3282,12 @@ ; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; X86-NEXT: kmovd %k0, %eax ; X86-NEXT: testw %ax, %ax -; X86-NEXT: jle LBB64_1 +; X86-NEXT: jle LBB65_1 ; X86-NEXT: ## %bb.2: ## %bb.2 ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB64_1: ## %bb.1 +; X86-NEXT: LBB65_1: ## %bb.1 ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -3284,11 +3315,11 @@ ; CHECK-NEXT: vpord %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; CHECK-NEXT: kortestw %k0, %k0 -; CHECK-NEXT: jb LBB65_2 +; CHECK-NEXT: jb LBB66_2 ; CHECK-NEXT: ## %bb.1: ## %bb.1 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq _foo -; CHECK-NEXT: LBB65_2: ## %bb.2 +; CHECK-NEXT: LBB66_2: ## %bb.2 ; CHECK-NEXT: popq %rax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -3300,11 +3331,11 @@ ; X86-NEXT: vpord %zmm1, %zmm0, %zmm0 ; X86-NEXT: vptestnmd %zmm0, %zmm0, %k0 ; X86-NEXT: kortestw %k0, %k0 -; X86-NEXT: jb LBB65_2 +; X86-NEXT: jb LBB66_2 ; X86-NEXT: ## %bb.1: ## %bb.1 ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo -; X86-NEXT: LBB65_2: ## %bb.2 +; X86-NEXT: LBB66_2: ## %bb.2 ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl @@ -3492,12 +3523,12 @@ ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB71_1 +; KNL-NEXT: je LBB72_1 ; 
KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB71_1: ## %bar +; KNL-NEXT: LBB72_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -3514,12 +3545,12 @@ ; SKX-NEXT: vptestnmd %ymm3, %ymm3, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 ; SKX-NEXT: ktestb %k1, %k0 -; SKX-NEXT: je LBB71_1 +; SKX-NEXT: je LBB72_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB71_1: ## %bar +; SKX-NEXT: LBB72_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -3542,12 +3573,12 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al -; AVX512BW-NEXT: je LBB71_1 +; AVX512BW-NEXT: je LBB72_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB71_1: ## %bar +; AVX512BW-NEXT: LBB72_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -3568,12 +3599,12 @@ ; AVX512DQ-NEXT: korb %k1, %k0, %k0 ; AVX512DQ-NEXT: korb %k3, %k2, %k1 ; AVX512DQ-NEXT: ktestb %k1, %k0 -; AVX512DQ-NEXT: je LBB71_1 +; AVX512DQ-NEXT: je LBB72_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB71_1: ## %bar +; AVX512DQ-NEXT: LBB72_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -3590,12 +3621,12 @@ ; X86-NEXT: vptestnmd %ymm3, %ymm3, %k2 ; X86-NEXT: korb %k2, %k1, %k1 ; X86-NEXT: ktestb %k1, %k0 -; X86-NEXT: je LBB71_1 +; X86-NEXT: je LBB72_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB71_1: ## %bar +; X86-NEXT: LBB72_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -3633,12 +3664,12 @@ ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax 
; KNL-NEXT: testb %al, %al -; KNL-NEXT: je LBB72_1 +; KNL-NEXT: je LBB73_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB72_1: ## %bar +; KNL-NEXT: LBB73_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -3655,12 +3686,12 @@ ; SKX-NEXT: vptestnmq %zmm3, %zmm3, %k2 ; SKX-NEXT: korb %k2, %k1, %k1 ; SKX-NEXT: ktestb %k1, %k0 -; SKX-NEXT: je LBB72_1 +; SKX-NEXT: je LBB73_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB72_1: ## %bar +; SKX-NEXT: LBB73_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -3679,12 +3710,12 @@ ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kmovd %k0, %eax ; AVX512BW-NEXT: testb %al, %al -; AVX512BW-NEXT: je LBB72_1 +; AVX512BW-NEXT: je LBB73_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB72_1: ## %bar +; AVX512BW-NEXT: LBB73_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -3701,12 +3732,12 @@ ; AVX512DQ-NEXT: vptestnmq %zmm3, %zmm3, %k2 ; AVX512DQ-NEXT: korb %k2, %k1, %k1 ; AVX512DQ-NEXT: ktestb %k1, %k0 -; AVX512DQ-NEXT: je LBB72_1 +; AVX512DQ-NEXT: je LBB73_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB72_1: ## %bar +; AVX512DQ-NEXT: LBB73_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -3723,12 +3754,12 @@ ; X86-NEXT: vptestnmq %zmm3, %zmm3, %k2 ; X86-NEXT: korb %k2, %k1, %k1 ; X86-NEXT: ktestb %k1, %k0 -; X86-NEXT: je LBB72_1 +; X86-NEXT: je LBB73_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB72_1: ## %bar +; X86-NEXT: LBB73_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, 
%esp @@ -3765,12 +3796,12 @@ ; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kortestw %k0, %k0 -; KNL-NEXT: je LBB73_1 +; KNL-NEXT: je LBB74_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB73_1: ## %bar +; KNL-NEXT: LBB74_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -3787,12 +3818,12 @@ ; SKX-NEXT: vptestnmd %zmm3, %zmm3, %k2 ; SKX-NEXT: korw %k2, %k1, %k1 ; SKX-NEXT: ktestw %k1, %k0 -; SKX-NEXT: je LBB73_1 +; SKX-NEXT: je LBB74_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB73_1: ## %bar +; SKX-NEXT: LBB74_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -3810,12 +3841,12 @@ ; AVX512BW-NEXT: korw %k2, %k1, %k1 ; AVX512BW-NEXT: kandw %k1, %k0, %k0 ; AVX512BW-NEXT: kortestw %k0, %k0 -; AVX512BW-NEXT: je LBB73_1 +; AVX512BW-NEXT: je LBB74_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB73_1: ## %bar +; AVX512BW-NEXT: LBB74_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -3832,12 +3863,12 @@ ; AVX512DQ-NEXT: vptestnmd %zmm3, %zmm3, %k2 ; AVX512DQ-NEXT: korw %k2, %k1, %k1 ; AVX512DQ-NEXT: ktestw %k1, %k0 -; AVX512DQ-NEXT: je LBB73_1 +; AVX512DQ-NEXT: je LBB74_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB73_1: ## %bar +; AVX512DQ-NEXT: LBB74_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -3854,12 +3885,12 @@ ; X86-NEXT: vptestnmd %zmm3, %zmm3, %k2 ; X86-NEXT: korw %k2, %k1, %k1 ; X86-NEXT: ktestw %k1, %k0 -; X86-NEXT: je LBB73_1 +; X86-NEXT: je LBB74_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB73_1: ## %bar 
+; X86-NEXT: LBB74_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -3911,12 +3942,12 @@ ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: orl %eax, %ecx -; KNL-NEXT: je LBB74_1 +; KNL-NEXT: je LBB75_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB74_1: ## %bar +; KNL-NEXT: LBB75_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -3933,12 +3964,12 @@ ; SKX-NEXT: vptestnmw %zmm3, %zmm3, %k2 ; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: ktestd %k1, %k0 -; SKX-NEXT: je LBB74_1 +; SKX-NEXT: je LBB75_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB74_1: ## %bar +; SKX-NEXT: LBB75_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -3955,12 +3986,12 @@ ; AVX512BW-NEXT: vptestnmw %zmm3, %zmm3, %k2 ; AVX512BW-NEXT: kord %k2, %k1, %k1 ; AVX512BW-NEXT: ktestd %k1, %k0 -; AVX512BW-NEXT: je LBB74_1 +; AVX512BW-NEXT: je LBB75_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB74_1: ## %bar +; AVX512BW-NEXT: LBB75_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -3993,12 +4024,12 @@ ; AVX512DQ-NEXT: kmovw %k0, %ecx ; AVX512DQ-NEXT: shll $16, %ecx ; AVX512DQ-NEXT: orl %eax, %ecx -; AVX512DQ-NEXT: je LBB74_1 +; AVX512DQ-NEXT: je LBB75_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB74_1: ## %bar +; AVX512DQ-NEXT: LBB75_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -4015,12 +4046,12 @@ ; X86-NEXT: vptestnmw %zmm3, %zmm3, %k2 ; X86-NEXT: kord %k2, %k1, %k1 ; X86-NEXT: ktestd %k1, %k0 -; X86-NEXT: je LBB74_1 +; X86-NEXT: je LBB75_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl 
$12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB74_1: ## %bar +; X86-NEXT: LBB75_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp @@ -4096,12 +4127,12 @@ ; KNL-NEXT: orl %eax, %edx ; KNL-NEXT: shlq $32, %rdx ; KNL-NEXT: orq %rcx, %rdx -; KNL-NEXT: je LBB75_1 +; KNL-NEXT: je LBB76_1 ; KNL-NEXT: ## %bb.2: ## %exit ; KNL-NEXT: popq %rax ; KNL-NEXT: vzeroupper ; KNL-NEXT: retq -; KNL-NEXT: LBB75_1: ## %bar +; KNL-NEXT: LBB76_1: ## %bar ; KNL-NEXT: vzeroupper ; KNL-NEXT: callq _foo ; KNL-NEXT: popq %rax @@ -4118,12 +4149,12 @@ ; SKX-NEXT: vptestnmb %zmm3, %zmm3, %k2 ; SKX-NEXT: korq %k2, %k1, %k1 ; SKX-NEXT: ktestq %k1, %k0 -; SKX-NEXT: je LBB75_1 +; SKX-NEXT: je LBB76_1 ; SKX-NEXT: ## %bb.2: ## %exit ; SKX-NEXT: popq %rax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq -; SKX-NEXT: LBB75_1: ## %bar +; SKX-NEXT: LBB76_1: ## %bar ; SKX-NEXT: vzeroupper ; SKX-NEXT: callq _foo ; SKX-NEXT: popq %rax @@ -4140,12 +4171,12 @@ ; AVX512BW-NEXT: vptestnmb %zmm3, %zmm3, %k2 ; AVX512BW-NEXT: korq %k2, %k1, %k1 ; AVX512BW-NEXT: ktestq %k1, %k0 -; AVX512BW-NEXT: je LBB75_1 +; AVX512BW-NEXT: je LBB76_1 ; AVX512BW-NEXT: ## %bb.2: ## %exit ; AVX512BW-NEXT: popq %rax ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq -; AVX512BW-NEXT: LBB75_1: ## %bar +; AVX512BW-NEXT: LBB76_1: ## %bar ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: callq _foo ; AVX512BW-NEXT: popq %rax @@ -4202,12 +4233,12 @@ ; AVX512DQ-NEXT: orl %eax, %edx ; AVX512DQ-NEXT: shlq $32, %rdx ; AVX512DQ-NEXT: orq %rcx, %rdx -; AVX512DQ-NEXT: je LBB75_1 +; AVX512DQ-NEXT: je LBB76_1 ; AVX512DQ-NEXT: ## %bb.2: ## %exit ; AVX512DQ-NEXT: popq %rax ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq -; AVX512DQ-NEXT: LBB75_1: ## %bar +; AVX512DQ-NEXT: LBB76_1: ## %bar ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: callq _foo ; AVX512DQ-NEXT: popq %rax @@ -4226,12 +4257,12 @@ ; X86-NEXT: kandq %k1, %k0, %k0 ; X86-NEXT: kshiftrq $32, %k0, %k1 ; X86-NEXT: kortestd %k1, %k0 -; X86-NEXT: je LBB75_1 
+; X86-NEXT: je LBB76_1 ; X86-NEXT: ## %bb.2: ## %exit ; X86-NEXT: addl $12, %esp ; X86-NEXT: vzeroupper ; X86-NEXT: retl -; X86-NEXT: LBB75_1: ## %bar +; X86-NEXT: LBB76_1: ## %bar ; X86-NEXT: vzeroupper ; X86-NEXT: calll _foo ; X86-NEXT: addl $12, %esp Index: llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll =================================================================== --- llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll +++ llvm/test/CodeGen/X86/merge-consecutive-loads-512.ll @@ -128,29 +128,17 @@ } define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp { -; AVX512F-LABEL: merge_8f64_f64_1u3u5zu8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: movb $32, %al -; AVX512F-NEXT: kmovw %eax, %k0 -; AVX512F-NEXT: knotw %k0, %k1 -; AVX512F-NEXT: vmovupd 8(%rdi), %zmm0 {%k1} {z} -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: merge_8f64_f64_1u3u5zu8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movb $32, %al -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: knotw %k0, %k1 -; AVX512BW-NEXT: vmovupd 8(%rdi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq +; ALL-LABEL: merge_8f64_f64_1u3u5zu8: +; ALL: # %bb.0: +; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0 +; ALL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; ALL-NEXT: retq ; ; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8: ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: movb $32, %cl -; X32-AVX512F-NEXT: kmovw %ecx, %k0 -; X32-AVX512F-NEXT: knotw %k0, %k1 -; X32-AVX512F-NEXT: vmovupd 8(%eax), %zmm0 {%k1} {z} +; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0 +; X32-AVX512F-NEXT: vpandq {{\.LCPI.*}}, %zmm0, %zmm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds double, double* %ptr, i64 1 %ptr2 = getelementptr inbounds double, double* %ptr, i64 3 @@ -219,29 +207,17 @@ } define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp { -; AVX512F-LABEL: merge_8i64_i64_1u3u5zu8: -; AVX512F: # %bb.0: -; AVX512F-NEXT: movb $32, %al -; AVX512F-NEXT: kmovw 
%eax, %k0 -; AVX512F-NEXT: knotw %k0, %k1 -; AVX512F-NEXT: vmovdqu64 8(%rdi), %zmm0 {%k1} {z} -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: merge_8i64_i64_1u3u5zu8: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movb $32, %al -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: knotw %k0, %k1 -; AVX512BW-NEXT: vmovdqu64 8(%rdi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq +; ALL-LABEL: merge_8i64_i64_1u3u5zu8: +; ALL: # %bb.0: +; ALL-NEXT: vmovdqu64 8(%rdi), %zmm0 +; ALL-NEXT: vpandq {{.*}}(%rip), %zmm0, %zmm0 +; ALL-NEXT: retq ; ; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8: ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: movb $32, %cl -; X32-AVX512F-NEXT: kmovw %ecx, %k0 -; X32-AVX512F-NEXT: knotw %k0, %k1 -; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0 {%k1} {z} +; X32-AVX512F-NEXT: vmovdqu64 8(%eax), %zmm0 +; X32-AVX512F-NEXT: vpandq {{\.LCPI.*}}, %zmm0, %zmm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i64, i64* %ptr, i64 1 %ptr2 = getelementptr inbounds i64, i64* %ptr, i64 3 @@ -450,29 +426,17 @@ } define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp { -; AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF: -; AVX512F: # %bb.0: -; AVX512F-NEXT: movw $8240, %ax # imm = 0x2030 -; AVX512F-NEXT: kmovw %eax, %k0 -; AVX512F-NEXT: knotw %k0, %k1 -; AVX512F-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: movw $8240, %ax # imm = 0x2030 -; AVX512BW-NEXT: kmovd %eax, %k0 -; AVX512BW-NEXT: knotw %k0, %k1 -; AVX512BW-NEXT: vmovdqu32 (%rdi), %zmm0 {%k1} {z} -; AVX512BW-NEXT: retq +; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF: +; ALL: # %bb.0: +; ALL-NEXT: vmovdqu64 (%rdi), %zmm0 +; ALL-NEXT: vpandd {{.*}}(%rip), %zmm0, %zmm0 +; ALL-NEXT: retq ; ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF: ; X32-AVX512F: # %bb.0: ; X32-AVX512F-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-AVX512F-NEXT: 
movw $8240, %cx # imm = 0x2030 -; X32-AVX512F-NEXT: kmovw %ecx, %k0 -; X32-AVX512F-NEXT: knotw %k0, %k1 -; X32-AVX512F-NEXT: vmovdqu32 (%eax), %zmm0 {%k1} {z} +; X32-AVX512F-NEXT: vmovdqu64 (%eax), %zmm0 +; X32-AVX512F-NEXT: vpandd {{\.LCPI.*}}, %zmm0, %zmm0 ; X32-AVX512F-NEXT: retl %ptr0 = getelementptr inbounds i32, i32* %ptr, i64 0 %ptr3 = getelementptr inbounds i32, i32* %ptr, i64 3 Index: llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll =================================================================== --- llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -203,9 +203,9 @@ ; ; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; SKX: ## %bb.0: -; SKX-NEXT: movl $1, %eax -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: movl $65535, %eax ## imm = 0xFFFF +; SKX-NEXT: vmovd %eax, %xmm1 +; SKX-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %shuffle