diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -20029,6 +20029,9 @@ if (!Subtarget.hasAVX512()) return false; + if (!V.getValueType().isSimple()) + return false; + MVT VT = V.getSimpleValueType().getScalarType(); if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI()) return false; @@ -46724,6 +46727,37 @@ return DAG.getBitcast(VT, Res); } +static SDValue commuteSelect(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + if (!Subtarget.hasAVX512()) + return SDValue(); + if (N->getOpcode() != ISD::VSELECT) + return SDValue(); + + SDLoc DL(N); + SDValue Cond = N->getOperand(0); + SDValue LHS = N->getOperand(1); + SDValue RHS = N->getOperand(2); + + if (canCombineAsMaskOperation(LHS, Subtarget)) + return SDValue(); + + if (!canCombineAsMaskOperation(RHS, Subtarget)) + return SDValue(); + + if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse()) + return SDValue(); + + // Commute LHS and RHS to create opportunity to select mask instruction. + // (vselect M, L, R) -> (vselect ~M, R, L) + ISD::CondCode NewCC = + ISD::getSetCCInverse(cast<CondCodeSDNode>(Cond.getOperand(2))->get(), + Cond.getOperand(0).getValueType()); + Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(), Cond.getOperand(0), + Cond.getOperand(1), NewCC); + return DAG.getSelect(DL, LHS.getValueType(), Cond, RHS, LHS); +} + /// Do target-specific dag combines on SELECT and VSELECT nodes. static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -46738,6 +46772,13 @@ if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS)) return V; + // When avx512 is available the lhs operand of select instruction can be + // folded with mask instruction, while the rhs operand can't. Commute the + // lhs and rhs of the select instruction to create the opportunity of + // folding.
+ if (SDValue V = commuteSelect(N, DAG, Subtarget)) + return V; + EVT VT = LHS.getValueType(); EVT CondVT = Cond.getValueType(); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); diff --git a/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll b/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll --- a/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll +++ b/llvm/test/CodeGen/X86/avx512f-vec-test-testn.ll @@ -220,9 +220,9 @@ ; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vpsubq %xmm0, %xmm1, %xmm1 -; CHECK-NEXT: vptestnmq %zmm0, %zmm0, %k1 -; CHECK-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vmovdqa %xmm1, %xmm0 +; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1 +; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1} +; CHECK-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: ret{{[l|q]}} %1 = sub <2 x i64> zeroinitializer, %a diff --git a/llvm/test/CodeGen/X86/combine-rotates.ll b/llvm/test/CodeGen/X86/combine-rotates.ll --- a/llvm/test/CodeGen/X86/combine-rotates.ll +++ b/llvm/test/CodeGen/X86/combine-rotates.ll @@ -163,10 +163,8 @@ ; ; AVX512-LABEL: combine_vec_rot_select_zero: ; AVX512: # %bb.0: -; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vptestnmd %xmm1, %xmm1, %k1 -; AVX512-NEXT: vmovdqa32 %xmm0, %xmm2 {%k1} -; AVX512-NEXT: vmovdqa %xmm2, %xmm0 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k1 +; AVX512-NEXT: vprolvd %xmm1, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq %3 = and <4 x i32> %1, %4 = shl <4 x i32> %0, %3 diff --git a/llvm/test/CodeGen/X86/paddus.ll b/llvm/test/CodeGen/X86/paddus.ll --- a/llvm/test/CodeGen/X86/paddus.ll +++ b/llvm/test/CodeGen/X86/paddus.ll @@ -450,10 +450,9 @@ ; ; AVX512-LABEL: test13: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512-NEXT: vpsubb %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vpcmpeqb %zmm2, %zmm0, %k1 -; AVX512-NEXT: vmovdqu8 %zmm2, %zmm1 {%k1} +; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; 
AVX512-NEXT: vpcmpneqb %zmm1, %zmm0, %k1 +; AVX512-NEXT: vpsubb %zmm1, %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %1 = add <64 x i8> %x, @@ -1203,10 +1202,9 @@ ; ; AVX512-LABEL: test31: ; AVX512: # %bb.0: -; AVX512-NEXT: vpternlogd $255, %zmm2, %zmm2, %zmm2 -; AVX512-NEXT: vpsubw %zmm2, %zmm0, %zmm1 -; AVX512-NEXT: vpcmpeqw %zmm2, %zmm0, %k1 -; AVX512-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} +; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; AVX512-NEXT: vpcmpneqw %zmm1, %zmm0, %k1 +; AVX512-NEXT: vpsubw %zmm1, %zmm0, %zmm1 {%k1} ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %1 = add <32 x i16> %x, diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -1120,11 +1120,11 @@ ; ; AVX512-LABEL: unsigned_sat_variable_v4i32_using_cmp_notval: ; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpcmpnleud %xmm1, %xmm0, %k1 -; AVX512-NEXT: vmovdqa32 %xmm3, %xmm2 {%k1} +; AVX512-NEXT: vmovdqa %xmm1, %xmm3 +; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm3 +; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpleud %xmm3, %xmm0, %k1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm2 {%k1} ; AVX512-NEXT: vmovdqa %xmm2, %xmm0 ; AVX512-NEXT: retq %noty = xor <4 x i32> %y, @@ -1343,11 +1343,11 @@ ; ; AVX512-LABEL: unsigned_sat_variable_v2i64_using_cmp_notval: ; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3 -; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpcmpnleuq %xmm1, %xmm0, %k1 -; AVX512-NEXT: vmovdqa64 %xmm3, %xmm2 {%k1} +; AVX512-NEXT: vmovdqa %xmm1, %xmm3 +; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm3 +; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 +; AVX512-NEXT: vpcmpleuq %xmm3, %xmm0, %k1 +; AVX512-NEXT: vpaddq 
%xmm1, %xmm0, %xmm2 {%k1} ; AVX512-NEXT: vmovdqa %xmm2, %xmm0 ; AVX512-NEXT: retq %noty = xor <2 x i64> %y, diff --git a/llvm/test/CodeGen/X86/vector-bo-select-avx512.ll b/llvm/test/CodeGen/X86/vector-bo-select-avx512.ll --- a/llvm/test/CodeGen/X86/vector-bo-select-avx512.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select-avx512.ll @@ -6,10 +6,8 @@ ; AVX512-LABEL: select_sub: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 -; AVX512-NEXT: vpsubq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 +; AVX512-NEXT: vpsubq %zmm2, %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq entry: %arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1 @@ -25,10 +23,8 @@ ; AVX512-LABEL: select_add: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 -; AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 +; AVX512-NEXT: vpaddq %zmm2, %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq entry: %arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1 @@ -44,10 +40,8 @@ ; AVX512-LABEL: select_and: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 -; AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 +; AVX512-NEXT: vpandq %zmm2, %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq entry: %arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1 @@ -63,10 +57,8 @@ ; AVX512-LABEL: select_xor: ; AVX512: # %bb.0: # %entry ; 
AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 -; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 +; AVX512-NEXT: vpxorq %zmm2, %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq entry: %arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1 @@ -82,10 +74,8 @@ ; AVX512-LABEL: select_shl: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 -; AVX512-NEXT: vpsllvq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 +; AVX512-NEXT: vpsllvq %zmm2, %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq entry: %arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1 @@ -101,10 +91,8 @@ ; AVX512-LABEL: select_srl: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 -; AVX512-NEXT: vpsrlvq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 +; AVX512-NEXT: vpsrlvq %zmm2, %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq entry: %arrayidx = getelementptr inbounds <8 x i64>, ptr %ptr, i64 1 @@ -120,10 +108,8 @@ ; AVX512-LABEL: select_sra: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 -; AVX512-NEXT: vptestmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 -; AVX512-NEXT: vpsravq %zmm2, %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 +; AVX512-NEXT: vptestnmq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm3, %k1 +; AVX512-NEXT: vpsravq %zmm2, %zmm1, %zmm0 {%k1} ; AVX512-NEXT: retq entry: %arrayidx 
= getelementptr inbounds <8 x i64>, ptr %ptr, i64 1 @@ -140,19 +126,17 @@ ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 +; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 ; AVX512F-NEXT: vpmulld %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: select_mul: ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1 -; AVX512VL-NEXT: vpmulld %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1 +; AVX512VL-NEXT: vpmulld %ymm2, %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq entry: %arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1 @@ -169,19 +153,17 @@ ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 +; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 ; AVX512F-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: select_smax: ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1 -; AVX512VL-NEXT: vpmaxsd %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vmovdqa 
%ymm1, %ymm0 +; AVX512VL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1 +; AVX512VL-NEXT: vpmaxsd %ymm2, %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq entry: %arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1 @@ -199,19 +181,17 @@ ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 +; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 ; AVX512F-NEXT: vpminsd %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: select_smin: ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1 -; AVX512VL-NEXT: vpminsd %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1 +; AVX512VL-NEXT: vpminsd %ymm2, %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq entry: %arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1 @@ -229,19 +209,17 @@ ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 +; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 ; AVX512F-NEXT: vpmaxud %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: select_umax: ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm3 -; 
AVX512VL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1 -; AVX512VL-NEXT: vpmaxud %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1 +; AVX512VL-NEXT: vpmaxud %ymm2, %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq entry: %arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1 @@ -259,19 +237,17 @@ ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512F-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 +; AVX512F-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm3, %k1 ; AVX512F-NEXT: vpminud %ymm2, %ymm1, %ymm1 -; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: select_umin: ; AVX512VL: # %bb.0: # %entry ; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm3 -; AVX512VL-NEXT: vptestmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1 -; AVX512VL-NEXT: vpminud %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} -; AVX512VL-NEXT: vmovdqa %ymm1, %ymm0 +; AVX512VL-NEXT: vptestnmd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm3, %k1 +; AVX512VL-NEXT: vpminud %ymm2, %ymm1, %ymm0 {%k1} ; AVX512VL-NEXT: retq entry: %arrayidx = getelementptr inbounds <8 x i32>, ptr %ptr, i32 1 diff --git a/llvm/test/CodeGen/X86/vector-bo-select.ll b/llvm/test/CodeGen/X86/vector-bo-select.ll --- a/llvm/test/CodeGen/X86/vector-bo-select.ll +++ b/llvm/test/CodeGen/X86/vector-bo-select.ll @@ -2430,9 +2430,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512-NEXT: vpsubd %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, 
%k1 +; AVX512-NEXT: vpsubd %zmm2, %zmm1, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y %r = sub <16 x i32> %x, %s @@ -3388,9 +3388,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512-NEXT: vpsllvd %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpsllvd %zmm2, %zmm1, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y %r = shl <16 x i32> %x, %s @@ -4140,9 +4140,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512-NEXT: vpsrlvd %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpsrlvd %zmm2, %zmm1, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y %r = lshr <16 x i32> %x, %s @@ -4993,9 +4993,9 @@ ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512-NEXT: vpslld $31, %zmm0, %zmm0 -; AVX512-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512-NEXT: vpsravd %zmm2, %zmm1, %zmm0 -; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512-NEXT: vptestnmd %zmm0, %zmm0, %k1 +; AVX512-NEXT: vpsravd %zmm2, %zmm1, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 ; AVX512-NEXT: retq %s = select <16 x i1> %b, <16 x i32> zeroinitializer, <16 x i32> %y %r = ashr <16 x i32> %x, %s