diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -499,7 +499,7 @@
     bool tryShiftAmountMod(SDNode *N);
     bool tryShrinkShlLogicImm(SDNode *N);
     bool tryVPTERNLOG(SDNode *N);
-    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
+    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask, bool Invert);
     bool tryMatchBitSelect(SDNode *N);
 
     MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
@@ -3944,6 +3944,7 @@
   if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
     return false;
 
+  unsigned NOpc = N->getOpcode();
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
@@ -3989,8 +3990,14 @@
     case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break;
     }
 
-    switch (N->getOpcode()) {
+    switch (NOpc) {
     default: llvm_unreachable("Unexpected opcode!");
+    case X86ISD::ANDNP:
+      if (A == N0)
+        Imm &= ~TernlogMagicA;
+      else
+        Imm = ~(Imm) & TernlogMagicA;
+      break;
     case ISD::AND: Imm &= TernlogMagicA; break;
     case ISD::OR:  Imm |= TernlogMagicA; break;
     case ISD::XOR: Imm ^= TernlogMagicA; break;
@@ -4230,8 +4237,8 @@
 
 // Try to create VPTESTM instruction. If InMask is not null, it will be used
 // to form a masked operation.
-bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
-                                 SDValue InMask) {
+bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue InMask,
+                                 bool Invert) {
   assert(Subtarget->hasAVX512() && "Expected AVX512!");
   assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
          "Unexpected VT!");
@@ -4355,6 +4362,9 @@
   }
 
   bool IsTestN = CC == ISD::SETEQ;
+  if (Invert)
+    IsTestN = !IsTestN;
+
   unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
                                IsMasked);
 
@@ -4407,6 +4417,9 @@
   if (!NVT.isVector() || !Subtarget->hasAVX512())
     return false;
 
+  if (!NVT.is128BitVector() && !NVT.is256BitVector() && !NVT.is512BitVector())
+    return false;
+
   // We need VLX for 128/256-bit.
   if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
     return false;
@@ -4592,16 +4605,29 @@
       return;
     break;
 
+  case X86ISD::ANDNP:
+    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
+      SDValue N0 = Node->getOperand(0);
+      SDValue N1 = Node->getOperand(1);
+      // Try to form a masked VPTESTM
+      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
+          tryVPTESTM(Node, N0, N1, true))
+        return;
+    }
+    if (tryVPTERNLOG(Node))
+      return;
+    break;
+
   case ISD::AND:
     if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
       // Try to form a masked VPTESTM. Operands can be in either order.
       SDValue N0 = Node->getOperand(0);
       SDValue N1 = Node->getOperand(1);
       if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
-          tryVPTESTM(Node, N0, N1))
+          tryVPTESTM(Node, N0, N1, false))
         return;
       if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
-          tryVPTESTM(Node, N1, N0))
+          tryVPTESTM(Node, N1, N0, false))
         return;
     }
 
@@ -5407,7 +5433,7 @@
   }
 
   case ISD::SETCC: {
-    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
+    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue(), false))
       return;
 
     break;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -39633,10 +39633,8 @@
 
     // vselect Cond, 000..., X -> andn Cond, X
     if (TValIsAllZeros) {
-      MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
-      SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
-      SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
-      SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
+      SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
+      SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS);
       return DAG.getBitcast(VT, AndN);
     }
 
@@ -42423,12 +42421,16 @@
   return SDValue();
 }
 
-/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
-static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
+/// Try to fold:
+/// (and (not X), Y) -> (andnp X, Y)
+/// (and (xor X, -1), Y) -> (andnp X, Y).
+static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG,
+                                                 const X86Subtarget &Subtarget) {
   assert(N->getOpcode() == ISD::AND);
 
   MVT VT = N->getSimpleValueType(0);
-  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
+  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector() &&
+      !(VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()))
     return SDValue();
 
   SDValue X, Y;
@@ -43006,7 +43008,7 @@
   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
     return FPLogic;
 
-  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
+  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG, Subtarget))
     return R;
 
   if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
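The combineANDXORWithAllOnesIntoANDNP change above is what admits AVX-512 mask vectors into the fold: an `and` whose operand is `xor X, all-ones` becomes a single X86ISD::ANDNP node, which the X86InstrAVX512.td changes below then select as KANDN. A minimal standalone model of the underlying identity on 16-bit mask values (illustrative C++ only; the helper names are invented and this is not LLVM API):

```cpp
#include <cassert>
#include <cstdint>

// Before the fold: the inversion is a separate operation (knotw, then kandw).
static uint16_t notThenAnd(uint16_t X, uint16_t Y) {
  uint16_t NotX = X ^ 0xFFFF; // xor X, -1
  return NotX & Y;            // and (xor X, -1), Y
}

// After the fold: a single andn (kandnw) computes ~X & Y directly.
static uint16_t andn(uint16_t X, uint16_t Y) {
  return static_cast<uint16_t>(~X) & Y;
}

int main() {
  for (uint32_t X = 0; X <= 0xFFFF; ++X)
    assert(notThenAnd(static_cast<uint16_t>(X), 0xA5C3) ==
           andn(static_cast<uint16_t>(X), 0xA5C3));
  return 0;
}
```

The exhaustive loop is cheap here because a k-register mask is at most 16 bits wide in this model; the new `MVT::i1 && hasAVX512()` guard in the patch admits mask vectors of any legal width.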
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -2994,7 +2994,6 @@
 }
 
 // These nodes use 'vnot' instead of 'not' to support vectors.
-def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>;
 def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>;
 
 // TODO - do we need a X86SchedWriteWidths::KMASK type?
@@ -3002,7 +3001,7 @@
 defm KOR   : avx512_mask_binop_all<0x45, "kor",   or,       SchedWriteVecLogic.XMM, 1>;
 defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor,    SchedWriteVecLogic.XMM, 1>;
 defm KXOR  : avx512_mask_binop_all<0x47, "kxor",  xor,      SchedWriteVecLogic.XMM, 1>;
-defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn,    SchedWriteVecLogic.XMM, 0>;
+defm KANDN : avx512_mask_binop_all<0x42, "kandn", X86andnp, SchedWriteVecLogic.XMM, 0>;
 defm KADD  : avx512_mask_binop_all<0x4A, "kadd",  X86kadd,  SchedWriteVecLogic.XMM, 1, HasDQI>;
 
 multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
@@ ... @@
 ...
 }
 
-defm : avx512_binop_pat<and,   and,   KANDWrr>;
-defm : avx512_binop_pat<vandn, vandn, KANDNWrr>;
-defm : avx512_binop_pat<or,    or,    KORWrr>;
-defm : avx512_binop_pat<vxnor, vxnor, KXNORWrr>;
-defm : avx512_binop_pat<xor,   xor,   KXORWrr>;
+defm : avx512_binop_pat<and,      and,      KANDWrr>;
+defm : avx512_binop_pat<X86andnp, X86andnp, KANDNWrr>;
+defm : avx512_binop_pat<or,       or,       KORWrr>;
+defm : avx512_binop_pat<vxnor,    vxnor,    KXNORWrr>;
+defm : avx512_binop_pat<xor,      xor,      KXORWrr>;
 
 // Mask unpacking
 multiclass avx512_mask_unpck
diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@@ ... @@
   store <1 x i1> %c, <1 x i1>* %x
   ret void
 }
+
+; Regression test from https://github.com/JuliaLang/julia/issues/36955
+define i8 @julia_issue36955(<8 x i1> %mask, <8 x double> %a) {
+; X86-AVX512F-LABEL: julia_issue36955:
+; X86-AVX512F:       # %bb.0:
+; X86-AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
+; X86-AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; X86-AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; X86-AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X86-AVX512F-NEXT:    vcmpnlepd %zmm0, %zmm1, %k1
+; X86-AVX512F-NEXT:    kandnw %k0, %k1, %k0
+; X86-AVX512F-NEXT:    kandw %k1, %k0, %k0
+; X86-AVX512F-NEXT:    knotw %k1, %k1
+; X86-AVX512F-NEXT:    korw %k1, %k0, %k0
+; X86-AVX512F-NEXT:    kmovw %k0, %eax
+; X86-AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; X86-AVX512F-NEXT:    vzeroupper
+; X86-AVX512F-NEXT:    retl
+;
+; X64-AVX512F-LABEL: julia_issue36955:
+; X64-AVX512F:       # %bb.0:
+; X64-AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
+; X64-AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
+; X64-AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; X64-AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X64-AVX512F-NEXT:    vcmpnlepd %zmm0, %zmm1, %k1
+; X64-AVX512F-NEXT:    kandnw %k0, %k1, %k0
+; X64-AVX512F-NEXT:    kandw %k1, %k0, %k0
+; X64-AVX512F-NEXT:    knotw %k1, %k1
+; X64-AVX512F-NEXT:    korw %k1, %k0, %k0
+; X64-AVX512F-NEXT:    kmovw %k0, %eax
+; X64-AVX512F-NEXT:    # kill: def $al killed $al killed $eax
+; X64-AVX512F-NEXT:    vzeroupper
+; X64-AVX512F-NEXT:    retq
+;
+; X86-AVX512BW-LABEL: julia_issue36955:
+; X86-AVX512BW:       # %bb.0:
+; X86-AVX512BW-NEXT:    vpsllw $15, %xmm0, %xmm0
+; X86-AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
+; X86-AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X86-AVX512BW-NEXT:    vcmpnlepd %zmm0, %zmm1, %k1
+; X86-AVX512BW-NEXT:    kandnw %k0, %k1, %k0
+; X86-AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; X86-AVX512BW-NEXT:    knotw %k1, %k1
+; X86-AVX512BW-NEXT:    korw %k1, %k0, %k0
+; X86-AVX512BW-NEXT:    kmovd %k0, %eax
+; X86-AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; X86-AVX512BW-NEXT:    vzeroupper
+; X86-AVX512BW-NEXT:    retl
+;
+; X64-AVX512BW-LABEL: julia_issue36955:
+; X64-AVX512BW:       # %bb.0:
+; X64-AVX512BW-NEXT:    vpsllw $15, %xmm0, %xmm0
+; X64-AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
+; X64-AVX512BW-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; X64-AVX512BW-NEXT:    vcmpnlepd %zmm0, %zmm1, %k1
+; X64-AVX512BW-NEXT:    kandnw %k0, %k1, %k0
+; X64-AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; X64-AVX512BW-NEXT:    knotw %k1, %k1
+; X64-AVX512BW-NEXT:    korw %k1, %k0, %k0
+; X64-AVX512BW-NEXT:    kmovd %k0, %eax
+; X64-AVX512BW-NEXT:    # kill: def $al killed $al killed $eax
+; X64-AVX512BW-NEXT:    vzeroupper
+; X64-AVX512BW-NEXT:    retq
+  %fcmp = fcmp ugt <8 x double> %a, zeroinitializer
+  %xor = xor <8 x i1> %fcmp, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+  %select1 = select <8 x i1> %fcmp, <8 x i1> zeroinitializer, <8 x i1> %mask
+  %select2 = select <8 x i1> %xor, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i1> %select1
+  %ret = bitcast <8 x i1> %select2 to i8
+  ret i8 %ret
+}
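The new test pins the current output rather than an optimal one. Working the IR out by hand on one bit per lane shows the whole computation reduces to a single knot of the compare result, so the kandnw/kandw/knotw/korw sequence in the CHECK lines still leaves room for later combines. A sketch of that lane arithmetic (standalone C++ model of the IR, not part of the patch):

```cpp
#include <cassert>
#include <cstdint>

// One bit per lane: Fcmp bit i models (a[i] ugt 0.0), Mask is the <8 x i1>
// argument. Mirrors the IR of julia_issue36955.
static uint8_t select2(uint8_t Mask, uint8_t Fcmp) {
  uint8_t NotF = static_cast<uint8_t>(~Fcmp); // %xor = xor %fcmp, <8 x true>
  uint8_t Sel1 = NotF & Mask;                 // %select1 = fcmp ? 0 : mask
  return NotF | (Fcmp & Sel1);                // %select2 = xor ? -1 : %select1
}

int main() {
  // Since Fcmp & ~Fcmp == 0, the whole expression collapses to ~Fcmp.
  for (unsigned M = 0; M < 256; ++M)
    for (unsigned F = 0; F < 256; ++F)
      assert(select2(static_cast<uint8_t>(M), static_cast<uint8_t>(F)) ==
             static_cast<uint8_t>(~F));
  return 0;
}
```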
diff --git a/llvm/test/CodeGen/X86/combine-bitselect.ll b/llvm/test/CodeGen/X86/combine-bitselect.ll
--- a/llvm/test/CodeGen/X86/combine-bitselect.ll
+++ b/llvm/test/CodeGen/X86/combine-bitselect.ll
@@ -1052,10 +1052,10 @@
 ; AVX512F:       # %bb.0: # %bb
 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT:    vpcmpeqd {{.*}}(%rip){1to16}, %zmm1, %k1
-; AVX512F-NEXT:    vpcmpeqd {{.*}}(%rip){1to16}, %zmm1, %k2
-; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k0 {%k2}
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vpcmpeqd {{.*}}(%rip){1to16}, %zmm1, %k0
+; AVX512F-NEXT:    kandnw %k0, %k1, %k0
+; AVX512F-NEXT:    vpcmpeqd {{.*}}(%rip){1to16}, %zmm1, %k1 {%k1}
 ; AVX512F-NEXT:    korw %k0, %k1, %k1
 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
@@ -1064,11 +1064,11 @@
 ;
 ; AVX512VL-LABEL: bitselect_v4i1_loop:
 ; AVX512VL:       # %bb.0: # %bb
-; AVX512VL-NEXT:    vpcmpeqd {{.*}}(%rip){1to4}, %xmm1, %k1
-; AVX512VL-NEXT:    vpcmpeqd {{.*}}(%rip){1to4}, %xmm1, %k2
-; AVX512VL-NEXT:    vptestnmd %xmm0, %xmm0, %k0 {%k2}
-; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1 {%k1}
-; AVX512VL-NEXT:    korw %k0, %k1, %k1
+; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1
+; AVX512VL-NEXT:    vpcmpeqd {{.*}}(%rip){1to4}, %xmm1, %k0
+; AVX512VL-NEXT:    vpcmpeqd {{.*}}(%rip){1to4}, %xmm1, %k2 {%k1}
+; AVX512VL-NEXT:    kandnw %k0, %k1, %k0
+; AVX512VL-NEXT:    korw %k0, %k2, %k1
 ; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512VL-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -612,12 +612,12 @@
 ;
 ; AVX512F-LABEL: v16i1:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
-; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k1 {%k1}
 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/usub_sat_vec.ll b/llvm/test/CodeGen/X86/usub_sat_vec.ll
--- a/llvm/test/CodeGen/X86/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/usub_sat_vec.ll
@@ -545,12 +545,12 @@
 ;
 ; AVX512F-LABEL: v16i1:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
-; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
-; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1 {%k1}
+; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k1 {%k1}
 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vzeroupper
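The churn in the hunks above, and in the vec_ssubo.ll/vec_usubo.ll hunks below, all comes from one lane-wise identity: a zero test executed under a mask, `vptestnm(x) {k1}`, equals `kandn(vptestm(x), k1)`, because vptestnm is the complement of vptestm. A small check of that reading (one bit per lane, illustrative C++ only):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Four i32 lanes; P enumerates which lanes are nonzero, K1 the input mask.
  for (unsigned K1 = 0; K1 < 16; ++K1)
    for (unsigned P = 0; P < 16; ++P) {
      int32_t Elt[4];
      for (int I = 0; I < 4; ++I)
        Elt[I] = ((P >> I) & 1) ? 0x1234 : 0;

      unsigned OldK2 = 0, TestM = 0;
      for (int I = 0; I < 4; ++I) {
        // Old: vptestnmd %x, %x, %k2 {%k1} -- zero test under mask k1.
        if (((K1 >> I) & 1) && Elt[I] == 0)
          OldK2 |= 1u << I;
        // New ingredient: vptestmd %x, %x, %k0 -- plain nonzero test.
        if (Elt[I] != 0)
          TestM |= 1u << I;
      }
      // New: kandnw %k1, %k0, %k2 (AT&T order) computes k2 = ~k0 & k1.
      unsigned NewK2 = (~TestM & 0xFu) & K1;
      assert(OldK2 == NewK2);
    }
  return 0;
}
```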
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -1135,7 +1135,7 @@
 ; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k0
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
-; AVX512-NEXT:    vptestnmd %xmm1, %xmm1, %k2 {%k1}
+; AVX512-NEXT:    kandnw %k1, %k0, %k2
 ; AVX512-NEXT:    kxorw %k0, %k1, %k0
 ; AVX512-NEXT:    kxorw %k2, %k0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vec_usubo.ll b/llvm/test/CodeGen/X86/vec_usubo.ll
--- a/llvm/test/CodeGen/X86/vec_usubo.ll
+++ b/llvm/test/CodeGen/X86/vec_usubo.ll
@@ -1221,10 +1221,10 @@
 ; AVX512:       # %bb.0:
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k0
-; AVX512-NEXT:    vpslld $31, %xmm1, %xmm1
-; AVX512-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; AVX512-NEXT:    vpslld $31, %xmm1, %xmm0
+; AVX512-NEXT:    vptestmd %xmm0, %xmm0, %k1
 ; AVX512-NEXT:    kxorw %k1, %k0, %k1
-; AVX512-NEXT:    vptestnmd %xmm0, %xmm0, %k2 {%k1}
+; AVX512-NEXT:    kandnw %k1, %k0, %k2
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k2} {z}
 ; AVX512-NEXT:    kmovd %k1, %eax
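One more note on the tryVPTERNLOG hunk at the top: because ANDNP is not commutative, the new case derives a different VPTERNLOG immediate depending on whether the repeated operand A is the inverted side, which is exactly the `A == N0` test in the patch. A standalone check of both derivations (this assumes the conventional 0xf0/0xcc/0xaa values of TernlogMagicA/B/C from X86ISelDAGToDAG.cpp, and uses an inner `and(B, C)` as the already-matched subtree):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t MagicA = 0xf0, MagicB = 0xcc, MagicC = 0xaa;
  // Suppose the already-matched inner node is and(B, C):
  uint8_t Imm = MagicB & MagicC;
  // A == N0: andnp(A, f) = ~A & f(B, C)  ->  Imm &= ~MagicA
  uint8_t ImmAF = Imm & static_cast<uint8_t>(~MagicA);
  // A == N1: andnp(f, A) = ~f(B, C) & A  ->  Imm = ~Imm & MagicA
  uint8_t ImmFA = static_cast<uint8_t>(~Imm) & MagicA;
  // Bit i of each magic byte is that operand's value for input pattern i,
  // so both immediates can be verified exhaustively.
  for (int I = 0; I < 8; ++I) {
    bool A = (MagicA >> I) & 1, B = (MagicB >> I) & 1, C = (MagicC >> I) & 1;
    bool F = B && C;
    assert(((ImmAF >> I) & 1) == static_cast<int>(!A && F));
    assert(((ImmFA >> I) & 1) == static_cast<int>(!F && A));
  }
  return 0;
}
```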