diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4513,6 +4513,29 @@
   /// method accepts vectors as its arguments.
   SDValue expandVectorSplice(SDNode *Node, SelectionDAG &DAG) const;

+  /// Legalize a SETCC with given LHS and RHS and condition code CC on the
+  /// current target.
+  ///
+  /// If the SETCC has been legalized using AND / OR, then the legalized node
+  /// will be stored in LHS. RHS and CC will be set to SDValue(). NeedInvert
+  /// will be set to false.
+  ///
+  /// If the SETCC has been legalized by using getSetCCSwappedOperands(),
+  /// then the values of LHS and RHS will be swapped, CC will be set to the
+  /// new condition, and NeedInvert will be set to false.
+  ///
+  /// If the SETCC has been legalized using the inverse condcode, then LHS and
+  /// RHS will be unchanged, CC will be set to the inverted condcode, and
+  /// NeedInvert will be set to true. The caller must invert the result of the
+  /// SETCC with SelectionDAG::getLogicalNOT() or take equivalent action to
+  /// swap the effect of a true/false result.
+  ///
+  /// \returns true if the SetCC has been legalized, false if it hasn't.
+  bool LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT, SDValue &LHS,
+                             SDValue &RHS, SDValue &CC, bool &NeedInvert,
+                             const SDLoc &dl, SDValue &Chain,
+                             bool IsSignaling = false) const;
+
   //===--------------------------------------------------------------------===//
   // Instruction Emitting Hooks
   //
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -133,10 +133,6 @@
                                      SDValue N1, SDValue N2,
                                      ArrayRef<int> Mask) const;

-  bool LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
-                             bool &NeedInvert, const SDLoc &dl, SDValue &Chain,
-                             bool IsSignaling = false);
-
   SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);

   void ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32,
@@ -1685,152 +1681,6 @@
   Results.push_back(Tmp2);
 }

-/// Legalize a SETCC with given LHS and RHS and condition code CC on the current
-/// target.
-///
-/// If the SETCC has been legalized using AND / OR, then the legalized node
-/// will be stored in LHS. RHS and CC will be set to SDValue(). NeedInvert
-/// will be set to false.
-///
-/// If the SETCC has been legalized by using getSetCCSwappedOperands(),
-/// then the values of LHS and RHS will be swapped, CC will be set to the
-/// new condition, and NeedInvert will be set to false.
-///
-/// If the SETCC has been legalized using the inverse condcode, then LHS and
-/// RHS will be unchanged, CC will set to the inverted condcode, and NeedInvert
-/// will be set to true. The caller must invert the result of the SETCC with
-/// SelectionDAG::getLogicalNOT() or take equivalent action to swap the effect
-/// of a true/false result.
-///
-/// \returns true if the SetCC has been legalized, false if it hasn't.
-bool SelectionDAGLegalize::LegalizeSetCCCondCode(
-    EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC, bool &NeedInvert,
-    const SDLoc &dl, SDValue &Chain, bool IsSignaling) {
-  MVT OpVT = LHS.getSimpleValueType();
-  ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
-  NeedInvert = false;
-  switch (TLI.getCondCodeAction(CCCode, OpVT)) {
-  default: llvm_unreachable("Unknown condition code action!");
-  case TargetLowering::Legal:
-    // Nothing to do.
- break; - case TargetLowering::Expand: { - ISD::CondCode InvCC = ISD::getSetCCSwappedOperands(CCCode); - if (TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) { - std::swap(LHS, RHS); - CC = DAG.getCondCode(InvCC); - return true; - } - // Swapping operands didn't work. Try inverting the condition. - bool NeedSwap = false; - InvCC = getSetCCInverse(CCCode, OpVT); - if (!TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) { - // If inverting the condition is not enough, try swapping operands - // on top of it. - InvCC = ISD::getSetCCSwappedOperands(InvCC); - NeedSwap = true; - } - if (TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) { - CC = DAG.getCondCode(InvCC); - NeedInvert = true; - if (NeedSwap) - std::swap(LHS, RHS); - return true; - } - - ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID; - unsigned Opc = 0; - switch (CCCode) { - default: llvm_unreachable("Don't know how to expand this condition!"); - case ISD::SETUO: - if (TLI.isCondCodeLegal(ISD::SETUNE, OpVT)) { - CC1 = ISD::SETUNE; CC2 = ISD::SETUNE; Opc = ISD::OR; - break; - } - assert(TLI.isCondCodeLegal(ISD::SETOEQ, OpVT) && - "If SETUE is expanded, SETOEQ or SETUNE must be legal!"); - NeedInvert = true; - LLVM_FALLTHROUGH; - case ISD::SETO: - assert(TLI.isCondCodeLegal(ISD::SETOEQ, OpVT) - && "If SETO is expanded, SETOEQ must be legal!"); - CC1 = ISD::SETOEQ; CC2 = ISD::SETOEQ; Opc = ISD::AND; break; - case ISD::SETONE: - case ISD::SETUEQ: - // If the SETUO or SETO CC isn't legal, we might be able to use - // SETOGT || SETOLT, inverting the result for SETUEQ. We only need one - // of SETOGT/SETOLT to be legal, the other can be emulated by swapping - // the operands. - CC2 = ((unsigned)CCCode & 0x8U) ? ISD::SETUO : ISD::SETO; - if (!TLI.isCondCodeLegal(CC2, OpVT) && - (TLI.isCondCodeLegal(ISD::SETOGT, OpVT) || - TLI.isCondCodeLegal(ISD::SETOLT, OpVT))) { - CC1 = ISD::SETOGT; - CC2 = ISD::SETOLT; - Opc = ISD::OR; - NeedInvert = ((unsigned)CCCode & 0x8U); - break; - } - LLVM_FALLTHROUGH; - case ISD::SETOEQ: - case ISD::SETOGT: - case ISD::SETOGE: - case ISD::SETOLT: - case ISD::SETOLE: - case ISD::SETUNE: - case ISD::SETUGT: - case ISD::SETUGE: - case ISD::SETULT: - case ISD::SETULE: - // If we are floating point, assign and break, otherwise fall through. - if (!OpVT.isInteger()) { - // We can use the 4th bit to tell if we are the unordered - // or ordered version of the opcode. - CC2 = ((unsigned)CCCode & 0x8U) ? ISD::SETUO : ISD::SETO; - Opc = ((unsigned)CCCode & 0x8U) ? ISD::OR : ISD::AND; - CC1 = (ISD::CondCode)(((int)CCCode & 0x7) | 0x10); - break; - } - // Fallthrough if we are unsigned integer. - LLVM_FALLTHROUGH; - case ISD::SETLE: - case ISD::SETGT: - case ISD::SETGE: - case ISD::SETLT: - case ISD::SETNE: - case ISD::SETEQ: - // If all combinations of inverting the condition and swapping operands - // didn't work then we have no means to expand the condition. - llvm_unreachable("Don't know how to expand this condition!"); - } - - SDValue SetCC1, SetCC2; - if (CCCode != ISD::SETO && CCCode != ISD::SETUO) { - // If we aren't the ordered or unorder operation, - // then the pattern is (LHS CC1 RHS) Opc (LHS CC2 RHS). 
-      SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1, Chain,
-                            IsSignaling);
-      SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2, Chain,
-                            IsSignaling);
-    } else {
-      // Otherwise, the pattern is (LHS CC1 LHS) Opc (RHS CC2 RHS)
-      SetCC1 = DAG.getSetCC(dl, VT, LHS, LHS, CC1, Chain,
-                            IsSignaling);
-      SetCC2 = DAG.getSetCC(dl, VT, RHS, RHS, CC2, Chain,
-                            IsSignaling);
-    }
-    if (Chain)
-      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, SetCC1.getValue(1),
-                          SetCC2.getValue(1));
-    LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2);
-    RHS = SDValue();
-    CC = SDValue();
-    return true;
-  }
-  }
-  return false;
-}
-
 /// Emit a store/load combination to the stack. This stores
 /// SrcOp to a stack slot of type SlotVT, truncating it if needed. It then does
 /// a load from the stack slot to DestVT, extending it if needed.
@@ -3729,8 +3579,8 @@
     Tmp2 = Node->getOperand(1 + Offset);
     Tmp3 = Node->getOperand(2 + Offset);
     bool Legalized =
-        LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2, Tmp3,
-                              NeedInvert, dl, Chain, IsSignaling);
+        TLI.LegalizeSetCCCondCode(DAG, Node->getValueType(0), Tmp1, Tmp2, Tmp3,
+                                  NeedInvert, dl, Chain, IsSignaling);

     if (Legalized) {
       // If we expanded the SETCC by swapping LHS and RHS, or by inverting the
@@ -3825,8 +3675,9 @@
     }

     if (!Legalized) {
-      Legalized = LegalizeSetCCCondCode(getSetCCResultType(Tmp1.getValueType()),
-                                        Tmp1, Tmp2, CC, NeedInvert, dl, Chain);
+      Legalized = TLI.LegalizeSetCCCondCode(
+          DAG, getSetCCResultType(Tmp1.getValueType()), Tmp1, Tmp2, CC,
+          NeedInvert, dl, Chain);

       assert(Legalized && "Can't legalize SELECT_CC with legal condition!");

@@ -3860,8 +3711,8 @@
     Tmp4 = Node->getOperand(1); // CC

     bool Legalized =
-        LegalizeSetCCCondCode(getSetCCResultType(Tmp2.getValueType()), Tmp2,
-                              Tmp3, Tmp4, NeedInvert, dl, Chain);
+        TLI.LegalizeSetCCCondCode(DAG, getSetCCResultType(Tmp2.getValueType()),
+                                  Tmp2, Tmp3, Tmp4, NeedInvert, dl, Chain);
     (void)Legalized;
     assert(Legalized && "Can't legalize BR_CC with legal condition!");
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -138,6 +138,7 @@
   SDValue ExpandStore(SDNode *N);
   SDValue ExpandFNEG(SDNode *Node);
   void ExpandFSUB(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+  void ExpandSETCC(SDNode *Node, SmallVectorImpl<SDValue> &Results);
   void ExpandBITREVERSE(SDNode *Node, SmallVectorImpl<SDValue> &Results);
   void ExpandUADDSUBO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
   void ExpandSADDSUBO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
@@ -396,7 +397,6 @@
   case ISD::SELECT:
   case ISD::VSELECT:
   case ISD::SELECT_CC:
-  case ISD::SETCC:
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND:
   case ISD::TRUNCATE:
@@ -495,6 +495,14 @@
     Action = TLI.getOperationAction(Node->getOpcode(),
                                     Node->getOperand(1).getValueType());
     break;
+  case ISD::SETCC: {
+    MVT OpVT = Node->getOperand(0).getSimpleValueType();
+    ISD::CondCode CCCode = cast<CondCodeSDNode>(Node->getOperand(2))->get();
+    Action = TLI.getCondCodeAction(CCCode, OpVT);
+    if (Action == TargetLowering::Legal)
+      Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+    break;
+  }
   }

   LLVM_DEBUG(dbgs() << "\nLegalizing vector op: "; Node->dump(&DAG));
@@ -762,7 +770,7 @@
     ExpandFSUB(Node, Results);
     return;
   case ISD::SETCC:
-    Results.push_back(UnrollVSETCC(Node));
+    ExpandSETCC(Node, Results);
     return;
   case ISD::ABS:
     if (TLI.expandABS(Node, Tmp, DAG)) {
@@ -1331,6 +1339,50 @@
   Results.push_back(Tmp);
 }

+void VectorLegalizer::ExpandSETCC(SDNode *Node,
+                                  SmallVectorImpl<SDValue> &Results) {
+  bool NeedInvert = false;
+  SDLoc dl(Node);
+  MVT OpVT = Node->getOperand(0).getSimpleValueType();
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(Node->getOperand(2))->get();
+
+  if (TLI.getCondCodeAction(CCCode, OpVT) != TargetLowering::Expand) {
+    Results.push_back(UnrollVSETCC(Node));
+    return;
+  }
+
+  SDValue Chain;
+  SDValue LHS = Node->getOperand(0);
+  SDValue RHS = Node->getOperand(1);
+  SDValue CC = Node->getOperand(2);
+  bool Legalized = TLI.LegalizeSetCCCondCode(DAG, Node->getValueType(0), LHS,
+                                             RHS, CC, NeedInvert, dl, Chain);
+
+  if (Legalized) {
+    // If we expanded the SETCC by swapping LHS and RHS, or by inverting the
+    // condition code, create a new SETCC node.
+    if (CC.getNode())
+      LHS = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0), LHS, RHS, CC,
+                        Node->getFlags());
+
+    // If we expanded the SETCC by inverting the condition code, then wrap
+    // the existing SETCC in a NOT to restore the intended condition.
+    if (NeedInvert)
+      LHS = DAG.getLogicalNOT(dl, LHS, LHS->getValueType(0));
+  } else {
+    // Otherwise, SETCC for the given comparison type must be completely
+    // illegal; expand it into a SELECT_CC.
+    EVT VT = Node->getValueType(0);
+    LHS =
+        DAG.getNode(ISD::SELECT_CC, dl, VT, LHS, RHS,
+                    DAG.getBoolConstant(true, dl, VT, LHS.getValueType()),
+                    DAG.getBoolConstant(false, dl, VT, LHS.getValueType()), CC);
+    LHS->setFlags(Node->getFlags());
+  }
+
+  Results.push_back(LHS);
+}
+
 void VectorLegalizer::ExpandUADDSUBO(SDNode *Node,
                                      SmallVectorImpl<SDValue> &Results) {
   SDValue Result, Overflow;
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8698,3 +8698,137 @@
   return DAG.getLoad(VT, DL, StoreV2, StackPtr2,
                      MachinePointerInfo::getUnknownStack(MF));
 }
+
+bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT,
+                                           SDValue &LHS, SDValue &RHS,
+                                           SDValue &CC, bool &NeedInvert,
+                                           const SDLoc &dl, SDValue &Chain,
+                                           bool IsSignaling) const {
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  MVT OpVT = LHS.getSimpleValueType();
+  ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
+  NeedInvert = false;
+  switch (TLI.getCondCodeAction(CCCode, OpVT)) {
+  default:
+    llvm_unreachable("Unknown condition code action!");
+  case TargetLowering::Legal:
+    // Nothing to do.
+    break;
+  case TargetLowering::Expand: {
+    ISD::CondCode InvCC = ISD::getSetCCSwappedOperands(CCCode);
+    if (TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) {
+      std::swap(LHS, RHS);
+      CC = DAG.getCondCode(InvCC);
+      return true;
+    }
+    // Swapping operands didn't work. Try inverting the condition.
+    bool NeedSwap = false;
+    InvCC = getSetCCInverse(CCCode, OpVT);
+    if (!TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) {
+      // If inverting the condition is not enough, try swapping operands
+      // on top of it.
+      InvCC = ISD::getSetCCSwappedOperands(InvCC);
+      NeedSwap = true;
+    }
+    if (TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) {
+      CC = DAG.getCondCode(InvCC);
+      NeedInvert = true;
+      if (NeedSwap)
+        std::swap(LHS, RHS);
+      return true;
+    }
+
+    ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID;
+    unsigned Opc = 0;
+    switch (CCCode) {
+    default:
+      llvm_unreachable("Don't know how to expand this condition!");
+    case ISD::SETUO:
+      if (TLI.isCondCodeLegal(ISD::SETUNE, OpVT)) {
+        CC1 = ISD::SETUNE;
+        CC2 = ISD::SETUNE;
+        Opc = ISD::OR;
+        break;
+      }
+      assert(TLI.isCondCodeLegal(ISD::SETOEQ, OpVT) &&
+             "If SETUO is expanded, SETOEQ or SETUNE must be legal!");
+      NeedInvert = true;
+      LLVM_FALLTHROUGH;
+    case ISD::SETO:
+      assert(TLI.isCondCodeLegal(ISD::SETOEQ, OpVT) &&
+             "If SETO is expanded, SETOEQ must be legal!");
+      CC1 = ISD::SETOEQ;
+      CC2 = ISD::SETOEQ;
+      Opc = ISD::AND;
+      break;
+    case ISD::SETONE:
+    case ISD::SETUEQ:
+      // If the SETUO or SETO CC isn't legal, we might be able to use
+      // SETOGT || SETOLT, inverting the result for SETUEQ. We only need one
+      // of SETOGT/SETOLT to be legal, the other can be emulated by swapping
+      // the operands.
+      CC2 = ((unsigned)CCCode & 0x8U) ? ISD::SETUO : ISD::SETO;
+      if (!TLI.isCondCodeLegal(CC2, OpVT) &&
+          (TLI.isCondCodeLegal(ISD::SETOGT, OpVT) ||
+           TLI.isCondCodeLegal(ISD::SETOLT, OpVT))) {
+        CC1 = ISD::SETOGT;
+        CC2 = ISD::SETOLT;
+        Opc = ISD::OR;
+        NeedInvert = ((unsigned)CCCode & 0x8U);
+        break;
+      }
+      LLVM_FALLTHROUGH;
+    case ISD::SETOEQ:
+    case ISD::SETOGT:
+    case ISD::SETOGE:
+    case ISD::SETOLT:
+    case ISD::SETOLE:
+    case ISD::SETUNE:
+    case ISD::SETUGT:
+    case ISD::SETUGE:
+    case ISD::SETULT:
+    case ISD::SETULE:
+      // If we are floating point, assign and break, otherwise fall through.
+      if (!OpVT.isInteger()) {
+        // We can use the 4th bit to tell if we are the unordered
+        // or ordered version of the opcode.
+        CC2 = ((unsigned)CCCode & 0x8U) ? ISD::SETUO : ISD::SETO;
+        Opc = ((unsigned)CCCode & 0x8U) ? ISD::OR : ISD::AND;
+        CC1 = (ISD::CondCode)(((int)CCCode & 0x7) | 0x10);
+        break;
+      }
+      // Fallthrough if we are unsigned integer.
+      LLVM_FALLTHROUGH;
+    case ISD::SETLE:
+    case ISD::SETGT:
+    case ISD::SETGE:
+    case ISD::SETLT:
+    case ISD::SETNE:
+    case ISD::SETEQ:
+      // If all combinations of inverting the condition and swapping operands
+      // didn't work then we have no means to expand the condition.
+      llvm_unreachable("Don't know how to expand this condition!");
+    }
+
+    SDValue SetCC1, SetCC2;
+    if (CCCode != ISD::SETO && CCCode != ISD::SETUO) {
+      // If we aren't the ordered or unordered operation,
+      // then the pattern is (LHS CC1 RHS) Opc (LHS CC2 RHS).
+ SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1, Chain, IsSignaling); + SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2, Chain, IsSignaling); + } else { + // Otherwise, the pattern is (LHS CC1 LHS) Opc (RHS CC2 RHS) + SetCC1 = DAG.getSetCC(dl, VT, LHS, LHS, CC1, Chain, IsSignaling); + SetCC2 = DAG.getSetCC(dl, VT, RHS, RHS, CC2, Chain, IsSignaling); + } + if (Chain) + Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, SetCC1.getValue(1), + SetCC2.getValue(1)); + LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2); + RHS = SDValue(); + CC = SDValue(); + return true; + } + } + return false; +} diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1388,6 +1388,20 @@ // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one. setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + if (VT.isFloatingPoint()) { + setCondCodeAction(ISD::SETO, VT, Expand); + setCondCodeAction(ISD::SETOLT, VT, Expand); + setCondCodeAction(ISD::SETLT, VT, Expand); + setCondCodeAction(ISD::SETOLE, VT, Expand); + setCondCodeAction(ISD::SETLE, VT, Expand); + setCondCodeAction(ISD::SETULT, VT, Expand); + setCondCodeAction(ISD::SETULE, VT, Expand); + setCondCodeAction(ISD::SETUGE, VT, Expand); + setCondCodeAction(ISD::SETUGT, VT, Expand); + setCondCodeAction(ISD::SETUEQ, VT, Expand); + setCondCodeAction(ISD::SETUNE, VT, Expand); + } + // Lower fixed length vector operations to scalable equivalents. setOperationAction(ISD::ABS, VT, Custom); setOperationAction(ISD::ADD, VT, Custom); @@ -10389,11 +10403,8 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { - if (Op.getValueType().isScalableVector()) { - if (Op.getOperand(0).getValueType().isFloatingPoint()) - return Op; + if (Op.getValueType().isScalableVector()) return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO); - } if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType())) return LowerFixedLengthVectorSetccToSVE(Op, DAG); @@ -17455,10 +17466,6 @@ assert(Op.getValueType() == InVT.changeTypeToInteger() && "Expected integer result of the same bit length as the inputs!"); - // Expand floating point vector comparisons. 
-  if (InVT.isFloatingPoint())
-    return SDValue();
-
   auto Op1 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(0));
   auto Op2 = convertToScalableVector(DAG, ContainerVT, Op.getOperand(1));
   auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT);
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -262,18 +262,6 @@
 def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>;

-def setoge_or_setge : PatFrags<(ops node:$lhs, node:$rhs),
-                               [(setoge node:$lhs, node:$rhs),
-                                (setge node:$lhs, node:$rhs)]>;
-def setogt_or_setgt : PatFrags<(ops node:$lhs, node:$rhs),
-                               [(setogt node:$lhs, node:$rhs),
-                                (setgt node:$lhs, node:$rhs)]>;
-def setoeq_or_seteq : PatFrags<(ops node:$lhs, node:$rhs),
-                               [(setoeq node:$lhs, node:$rhs),
-                                (seteq node:$lhs, node:$rhs)]>;
-def setone_or_setne : PatFrags<(ops node:$lhs, node:$rhs),
-                               [(setone node:$lhs, node:$rhs),
-                                (setne node:$lhs, node:$rhs)]>;
 def AArch64mul_p_oneuse : PatFrag<(ops node:$pred, node:$src1, node:$src2),
                                   (AArch64mul_p node:$pred, node:$src1, node:$src2), [{
   return N->hasOneUse();
@@ -1252,11 +1240,11 @@
   defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>;
   defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>;

-  defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge_or_setge>;
-  defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt_or_setgt>;
-  defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq_or_seteq>;
-  defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone_or_setne>;
-  defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>;
+  defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, SETOGE, SETGE, SETOLE, SETLE>;
+  defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, SETOGT, SETGT, SETOLT, SETLT>;
+  defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, SETOEQ, SETEQ, SETOEQ, SETEQ>;
+  defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, SETONE, SETNE, SETONE, SETNE>;
+  defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, SETUO, SETUO, SETUO, SETUO>;
   defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>;
   defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -4685,20 +4685,22 @@
 }

 multiclass sve_fp_3op_p_pd_cc<bits<3> opc, string asm, SDPatternOperator op,
-                              SDPatternOperator op_nopred>
+                              CondCode cc1, CondCode cc2,
+                              CondCode invcc1, CondCode invcc2>
 : sve_fp_3op_p_pd<opc, asm, op> {
-  def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H), PTRUE_H>;
-  def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f16, nxv4f16, !cast<Instruction>(NAME # _H), PTRUE_S>;
-  def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f16, nxv2f16, !cast<Instruction>(NAME # _H), PTRUE_D>;
-  def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S), PTRUE_S>;
-  def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f32, nxv2f32, !cast<Instruction>(NAME # _S), PTRUE_D>;
-  def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D), PTRUE_D>;
+  defm : SVE_SETCC_Pat<cc1, invcc1, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+  defm : SVE_SETCC_Pat<cc1, invcc1, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
+  defm : SVE_SETCC_Pat<cc1, invcc1, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>;
+  defm : SVE_SETCC_Pat<cc1, invcc1, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+  defm : SVE_SETCC_Pat<cc1, invcc1, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
+  defm : SVE_SETCC_Pat<cc1, invcc1, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+
+  defm : SVE_SETCC_Pat<cc2, invcc2, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+  defm : SVE_SETCC_Pat<cc2, invcc2, nxv4i1, nxv4f16, !cast<Instruction>(NAME # _H)>;
+  defm : SVE_SETCC_Pat<cc2, invcc2, nxv2i1, nxv2f16, !cast<Instruction>(NAME # _H)>;
+  defm : SVE_SETCC_Pat<cc2, invcc2, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+  defm : SVE_SETCC_Pat<cc2, invcc2, nxv2i1, nxv2f32, !cast<Instruction>(NAME # _S)>;
+  defm : SVE_SETCC_Pat<cc2, invcc2, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
 }

 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-float-compares.ll
@@ -0,0 +1,762 @@
+; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
+; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
+; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
+; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024
+; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_GE_2048
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't use SVE when its registers are no bigger than NEON.
+; NO_SVE-NOT: z{0-9}
+
+;
+; FCMP OEQ
+;
+
+; Don't use SVE for 64-bit vectors.
+define <4 x i16> @fcmp_oeq_v4f16(<4 x half> %op1, <4 x half> %op2) #0 {
+; CHECK-LABEL: fcmp_oeq_v4f16:
+; CHECK: fcmeq v0.4h, v0.4h, v1.4h
+; CHECK-NEXT: ret
+  %cmp = fcmp oeq <4 x half> %op1, %op2
+  %sext = sext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %sext
+}
+
+; Don't use SVE for 128-bit vectors.
+define <8 x i16> @fcmp_oeq_v8f16(<8 x half> %op1, <8 x half> %op2) #0 { +; CHECK-LABEL: fcmp_oeq_v8f16: +; CHECK: fcmeq v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %cmp = fcmp oeq <8 x half> %op1, %op2 + %sext = sext <8 x i1> %cmp to <8 x i16> + ret <8 x i16> %sext +} + +define void @fcmp_oeq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_oeq_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp oeq <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +define void @fcmp_oeq_v32f16(<32 x half>* %a, <32 x half>* %b, <32 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_oeq_v32f16: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl32 +; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_512-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 +; VBITS_EQ_256-DAG: add x[[C_HI:[0-9]+]], x2, #32 +; VBITS_EQ_256-DAG: ld1h { [[OP1_LO:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1h { [[OP1_HI:z[0-9]+]].h }, [[PG]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ld1h { [[OP2_LO:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1h { [[OP2_HI:z[0-9]+]].h }, [[PG]]/z, [x[[B_HI]]] +; VBITS_EQ_256-DAG: fcmeq [[CMP_HI:p[0-9]+]].h, [[PG]]/z, [[OP1_HI]].h, [[OP2_HI]].h +; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].h, [[CMP_HI]]/z, #-1 +; VBITS_EQ_256-DAG: fcmeq [[CMP_LO:p[0-9]+]].h, [[PG]]/z, [[OP1_LO]].h, [[OP2_LO]].h +; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].h, [[CMP_LO]]/z, #-1 +; VBITS_EQ_256-DAG: st1h { [[SEXT_LO]].h }, [[PG]], [x2] +; VBITS_EQ_256-DAG: st1h { [[SEXT_HI]].h }, [[PG]], [x[[C_HI]]] +; VBITS_EQ_256-NEXT: ret + %op1 = load <32 x half>, <32 x half>* %a + %op2 = load <32 x half>, <32 x half>* %b + %cmp = fcmp oeq <32 x half> %op1, %op2 + %sext = sext <32 x i1> %cmp to <32 x i16> + store <32 x i16> %sext, <32 x i16>* %c + ret void +} + +define void @fcmp_oeq_v64f16(<64 x half>* %a, <64 x half>* %b, <64 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_oeq_v64f16: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl64 +; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_1024-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; VBITS_GE_1024-NEXT: ret + %op1 = load <64 x half>, <64 x half>* %a + %op2 = load <64 x half>, <64 x half>* %b + %cmp = fcmp oeq <64 x half> %op1, %op2 + %sext = sext <64 x i1> %cmp to <64 x i16> + store <64 x i16> %sext, <64 x i16>* %c + ret void +} + +define void @fcmp_oeq_v128f16(<128 x half>* %a, <128 x half>* %b, <128 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_oeq_v128f16: +; 
VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl128 +; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; VBITS_GE_2048-NEXT: ret + %op1 = load <128 x half>, <128 x half>* %a + %op2 = load <128 x half>, <128 x half>* %b + %cmp = fcmp oeq <128 x half> %op1, %op2 + %sext = sext <128 x i1> %cmp to <128 x i16> + store <128 x i16> %sext, <128 x i16>* %c + ret void +} + +; Don't use SVE for 64-bit vectors. +define <2 x i32> @fcmp_oeq_v2f32(<2 x float> %op1, <2 x float> %op2) #0 { +; CHECK-LABEL: fcmp_oeq_v2f32: +; CHECK: fcmeq v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret + %cmp = fcmp oeq <2 x float> %op1, %op2 + %sext = sext <2 x i1> %cmp to <2 x i32> + ret <2 x i32> %sext +} + +; Don't use SVE for 128-bit vectors. +define <4 x i32> @fcmp_oeq_v4f32(<4 x float> %op1, <4 x float> %op2) #0 { +; CHECK-LABEL: fcmp_oeq_v4f32: +; CHECK: fcmeq v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %cmp = fcmp oeq <4 x float> %op1, %op2 + %sext = sext <4 x i1> %cmp to <4 x i32> + ret <4 x i32> %sext +} + +define void @fcmp_oeq_v8f32(<8 x float>* %a, <8 x float>* %b, <8 x i32>* %c) #0 { +; CHECK-LABEL: fcmp_oeq_v8f32: +; CHECK: ptrue [[PG:p[0-9]+]].s, vl8 +; CHECK-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; CHECK-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1 +; CHECK-NEXT: st1w { [[SEXT]].s }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <8 x float>, <8 x float>* %a + %op2 = load <8 x float>, <8 x float>* %b + %cmp = fcmp oeq <8 x float> %op1, %op2 + %sext = sext <8 x i1> %cmp to <8 x i32> + store <8 x i32> %sext, <8 x i32>* %c + ret void +} + +define void @fcmp_oeq_v16f32(<16 x float>* %a, <16 x float>* %b, <16 x i32>* %c) #0 { +; CHECK-LABEL: fcmp_oeq_v16f32: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl16 +; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s +; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_512-NEXT: st1w { [[SEXT]].s }, [[PG]], [x2] +; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].s, vl8 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 +; VBITS_EQ_256-DAG: add x[[C_HI:[0-9]+]], x2, #32 +; VBITS_EQ_256-DAG: ld1w { [[OP1_LO:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1w { [[OP1_HI:z[0-9]+]].s }, [[PG]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ld1w { [[OP2_LO:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1w { [[OP2_HI:z[0-9]+]].s }, [[PG]]/z, [x[[B_HI]]] +; VBITS_EQ_256-DAG: fcmeq [[CMP_HI:p[0-9]+]].s, [[PG]]/z, [[OP1_HI]].s, [[OP2_HI]].s +; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].s, [[CMP_HI]]/z, #-1 +; VBITS_EQ_256-DAG: fcmeq [[CMP_LO:p[0-9]+]].s, [[PG]]/z, [[OP1_LO]].s, [[OP2_LO]].s +; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].s, [[CMP_LO]]/z, #-1 +; VBITS_EQ_256-DAG: st1w { [[SEXT_LO]].s }, [[PG]], [x2] +; VBITS_EQ_256-DAG: st1w { [[SEXT_HI]].s }, [[PG]], [x[[C_HI]]] +; VBITS_EQ_256-NEXT: ret + %op1 = load <16 x float>, <16 x float>* %a + %op2 = load <16 x float>, <16 x float>* %b + %cmp = fcmp oeq <16 x float> 
%op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i32> + store <16 x i32> %sext, <16 x i32>* %c + ret void +} + +define void @fcmp_oeq_v32f32(<32 x float>* %a, <32 x float>* %b, <32 x i32>* %c) #0 { +; CHECK-LABEL: fcmp_oeq_v32f32: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl32 +; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s +; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_1024-NEXT: st1w { [[SEXT]].s }, [[PG]], [x2] +; VBITS_GE_1024-NEXT: ret + %op1 = load <32 x float>, <32 x float>* %a + %op2 = load <32 x float>, <32 x float>* %b + %cmp = fcmp oeq <32 x float> %op1, %op2 + %sext = sext <32 x i1> %cmp to <32 x i32> + store <32 x i32> %sext, <32 x i32>* %c + ret void +} + +define void @fcmp_oeq_v64f32(<64 x float>* %a, <64 x float>* %b, <64 x i32>* %c) #0 { +; CHECK-LABEL: fcmp_oeq_v64f32: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl64 +; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG]]/z, [[OP1]].s, [[OP2]].s +; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: st1w { [[SEXT]].s }, [[PG]], [x2] +; VBITS_GE_2048-NEXT: ret + %op1 = load <64 x float>, <64 x float>* %a + %op2 = load <64 x float>, <64 x float>* %b + %cmp = fcmp oeq <64 x float> %op1, %op2 + %sext = sext <64 x i1> %cmp to <64 x i32> + store <64 x i32> %sext, <64 x i32>* %c + ret void +} + +; Don't use SVE for 64-bit vectors. +define <1 x i64> @fcmp_oeq_v1f64(<1 x double> %op1, <1 x double> %op2) #0 { +; CHECK-LABEL: fcmp_oeq_v1f64: +; CHECK: fcmeq d0, d0, d1 +; CHECK-NEXT: ret + %cmp = fcmp oeq <1 x double> %op1, %op2 + %sext = sext <1 x i1> %cmp to <1 x i64> + ret <1 x i64> %sext +} + +; Don't use SVE for 128-bit vectors. 
+define <2 x i64> @fcmp_oeq_v2f64(<2 x double> %op1, <2 x double> %op2) #0 { +; CHECK-LABEL: fcmp_oeq_v2f64: +; CHECK: fcmeq v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %cmp = fcmp oeq <2 x double> %op1, %op2 + %sext = sext <2 x i1> %cmp to <2 x i64> + ret <2 x i64> %sext +} + +define void @fcmp_oeq_v4f64(<4 x double>* %a, <4 x double>* %b, <4 x i64>* %c) #0 { +; CHECK-LABEL: fcmp_oeq_v4f64: +; CHECK: ptrue [[PG:p[0-9]+]].d, vl4 +; CHECK-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; CHECK-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1 +; CHECK-NEXT: st1d { [[SEXT]].d }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <4 x double>, <4 x double>* %a + %op2 = load <4 x double>, <4 x double>* %b + %cmp = fcmp oeq <4 x double> %op1, %op2 + %sext = sext <4 x i1> %cmp to <4 x i64> + store <4 x i64> %sext, <4 x i64>* %c + ret void +} + +define void @fcmp_oeq_v8f64(<8 x double>* %a, <8 x double>* %b, <8 x i64>* %c) #0 { +; CHECK-LABEL: fcmp_oeq_v8f64: +; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_512-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_512-NEXT: fcmeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d +; VBITS_GE_512-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1 +; VBITS_GE_512-NEXT: st1d { [[SEXT]].d }, [[PG]], [x2] +; VBITS_GE_512-NEXT: ret + +; Ensure sensible type legalisation +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: add x[[A_HI:[0-9]+]], x0, #32 +; VBITS_EQ_256-DAG: add x[[B_HI:[0-9]+]], x1, #32 +; VBITS_EQ_256-DAG: add x[[C_HI:[0-9]+]], x2, #32 +; VBITS_EQ_256-DAG: ld1d { [[OP1_LO:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_EQ_256-DAG: ld1d { [[OP1_HI:z[0-9]+]].d }, [[PG]]/z, [x[[A_HI]]] +; VBITS_EQ_256-DAG: ld1d { [[OP2_LO:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[OP2_HI:z[0-9]+]].d }, [[PG]]/z, [x[[B_HI]]] +; VBITS_EQ_256-DAG: fcmeq [[CMP_HI:p[0-9]+]].d, [[PG]]/z, [[OP1_HI]].d, [[OP2_HI]].d +; VBITS_EQ_256-DAG: mov [[SEXT_HI:z[0-9]+]].d, [[CMP_HI]]/z, #-1 +; VBITS_EQ_256-DAG: fcmeq [[CMP_LO:p[0-9]+]].d, [[PG]]/z, [[OP1_LO]].d, [[OP2_LO]].d +; VBITS_EQ_256-DAG: mov [[SEXT_LO:z[0-9]+]].d, [[CMP_LO]]/z, #-1 +; VBITS_EQ_256-DAG: st1d { [[SEXT_LO]].d }, [[PG]], [x2] +; VBITS_EQ_256-DAG: st1d { [[SEXT_HI]].d }, [[PG]], [x[[C_HI]]] +; VBITS_EQ_256-NEXT: ret + %op1 = load <8 x double>, <8 x double>* %a + %op2 = load <8 x double>, <8 x double>* %b + %cmp = fcmp oeq <8 x double> %op1, %op2 + %sext = sext <8 x i1> %cmp to <8 x i64> + store <8 x i64> %sext, <8 x i64>* %c + ret void +} + +define void @fcmp_oeq_v16f64(<16 x double>* %a, <16 x double>* %b, <16 x i64>* %c) #0 { +; CHECK-LABEL: fcmp_oeq_v16f64: +; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_1024-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_1024-NEXT: fcmeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d +; VBITS_GE_1024-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1 +; VBITS_GE_1024-NEXT: st1d { [[SEXT]].d }, [[PG]], [x2] +; VBITS_GE_1024-NEXT: ret + %op1 = load <16 x double>, <16 x double>* %a + %op2 = load <16 x double>, <16 x double>* %b + %cmp = fcmp oeq <16 x double> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i64> + store <16 x i64> %sext, <16 x i64>* %c + ret void +} + +define void @fcmp_oeq_v32f64(<32 x double>* %a, <32 x double>* %b, <32 x i64>* %c) #0 { +; CHECK-LABEL: 
fcmp_oeq_v32f64: +; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl32 +; VBITS_GE_2048-DAG: ld1d { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0] +; VBITS_GE_2048-DAG: ld1d { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].d, [[PG]]/z, [[OP1]].d, [[OP2]].d +; VBITS_GE_2048-NEXT: mov [[SEXT:z[0-9]+]].d, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: st1d { [[SEXT]].d }, [[PG]], [x2] +; VBITS_GE_2048-NEXT: ret + %op1 = load <32 x double>, <32 x double>* %a + %op2 = load <32 x double>, <32 x double>* %b + %cmp = fcmp oeq <32 x double> %op1, %op2 + %sext = sext <32 x i1> %cmp to <32 x i64> + store <32 x i64> %sext, <32 x i64>* %c + ret void +} + +; +; FCMP UEQ +; + +define void @fcmp_ueq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_ueq_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: mov [[INV:w[0-9]+]], #65535 +; CHECK-NEXT: fcmne [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: mov [[TMP:z[0-9]+]].h, [[INV]] +; CHECK-NEXT: eor [[SEXT]].d, [[SEXT]].d, [[TMP]].d +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp ueq <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP ONE +; + +define void @fcmp_one_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_one_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmne [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp one <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP UNE +; + +define void @fcmp_une_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_une_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: mov [[INV:w[0-9]+]], #65535 +; CHECK-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: mov [[TMP:z[0-9]+]].h, [[INV]] +; CHECK-NEXT: eor [[SEXT]].d, [[SEXT]].d, [[TMP]].d +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp une <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP OGT +; + +define void @fcmp_ogt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_ogt_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmgt [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; 
CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp ogt <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP UGT +; + +define void @fcmp_ugt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_ugt_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: mov [[INV:w[0-9]+]], #65535 +; CHECK-NEXT: fcmge [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP2]].h, [[OP1]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: mov [[TMP:z[0-9]+]].h, [[INV]] +; CHECK-NEXT: eor [[SEXT]].d, [[SEXT]].d, [[TMP]].d +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp ugt <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP OLT +; + +define void @fcmp_olt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_olt_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmgt [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP2]].h, [[OP1]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp olt <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP ULT +; + +define void @fcmp_ult_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_ult_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: mov [[INV:w[0-9]+]], #65535 +; CHECK-NEXT: fcmge [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: mov [[TMP:z[0-9]+]].h, [[INV]] +; CHECK-NEXT: eor [[SEXT]].d, [[SEXT]].d, [[TMP]].d +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp ult <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP OGE +; + +define void @fcmp_oge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_oge_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmge [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp oge <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP UGE +; + +define void @fcmp_uge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_uge_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 
+; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: mov [[INV:w[0-9]+]], #65535 +; CHECK-NEXT: fcmgt [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP2]].h, [[OP1]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: mov [[TMP:z[0-9]+]].h, [[INV]] +; CHECK-NEXT: eor [[SEXT]].d, [[SEXT]].d, [[TMP]].d +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp uge <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP OLE +; + +define void @fcmp_ole_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_ole_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmge [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP2]].h, [[OP1]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp ole <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP ULE +; + +define void @fcmp_ule_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_ule_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: mov [[INV:w[0-9]+]], #65535 +; CHECK-NEXT: fcmgt [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: mov [[TMP:z[0-9]+]].h, [[INV]] +; CHECK-NEXT: eor [[SEXT]].d, [[SEXT]].d, [[TMP]].d +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp ule <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP UNO +; + +define void @fcmp_uno_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_uno_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmuo [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp uno <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP ORD +; + +define void @fcmp_ord_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_ord_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: mov [[INV:w[0-9]+]], #65535 +; CHECK-NEXT: fcmuo [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: mov [[TMP:z[0-9]+]].h, [[INV]] +; CHECK-NEXT: eor [[SEXT]].d, [[SEXT]].d, [[TMP]].d +; CHECK-NEXT: st1h { [[SEXT]].h }, 
[[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp ord <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP EQ +; + +define void @fcmp_eq_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_eq_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp fast oeq <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP NE +; + +define void @fcmp_ne_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_ne_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmne [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp fast one <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP GT +; + +define void @fcmp_gt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_gt_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmgt [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp fast ogt <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP LT +; + +define void @fcmp_lt_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_lt_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmgt [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP2]].h, [[OP1]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp fast olt <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP GE +; + +define void @fcmp_ge_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_ge_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmge [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP1]].h, [[OP2]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: st1h { 
[[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp fast oge <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +; +; FCMP LE +; + +define void @fcmp_le_v16f16(<16 x half>* %a, <16 x half>* %b, <16 x i16>* %c) #0 { +; CHECK-LABEL: fcmp_le_v16f16: +; CHECK: ptrue [[PG:p[0-9]+]].h, vl16 +; CHECK-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0] +; CHECK-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1] +; CHECK-NEXT: fcmge [[CMP:p[0-9]+]].h, [[PG]]/z, [[OP2]].h, [[OP1]].h +; CHECK-NEXT: mov [[SEXT:z[0-9]+]].h, [[CMP]]/z, #-1 +; CHECK-NEXT: st1h { [[SEXT]].h }, [[PG]], [x2] +; CHECK-NEXT: ret + %op1 = load <16 x half>, <16 x half>* %a + %op2 = load <16 x half>, <16 x half>* %b + %cmp = fcmp fast ole <16 x half> %op1, %op2 + %sext = sext <16 x i1> %cmp to <16 x i16> + store <16 x i16> %sext, <16 x i16>* %c + ret void +} + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/PowerPC/vsx.ll b/llvm/test/CodeGen/PowerPC/vsx.ll --- a/llvm/test/CodeGen/PowerPC/vsx.ll +++ b/llvm/test/CodeGen/PowerPC/vsx.ll @@ -626,32 +626,32 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xvcmpgtsp vs0, v5, v4 ; CHECK-NEXT: xvcmpgtsp vs1, v4, v5 -; CHECK-NEXT: xxlnor vs0, vs1, vs0 -; CHECK-NEXT: xxsel v2, v3, v2, vs0 +; CHECK-NEXT: xxlor vs0, vs1, vs0 +; CHECK-NEXT: xxsel v2, v2, v3, vs0 ; CHECK-NEXT: blr ; ; CHECK-REG-LABEL: test22: ; CHECK-REG: # %bb.0: # %entry ; CHECK-REG-NEXT: xvcmpgtsp vs0, v5, v4 ; CHECK-REG-NEXT: xvcmpgtsp vs1, v4, v5 -; CHECK-REG-NEXT: xxlnor vs0, vs1, vs0 -; CHECK-REG-NEXT: xxsel v2, v3, v2, vs0 +; CHECK-REG-NEXT: xxlor vs0, vs1, vs0 +; CHECK-REG-NEXT: xxsel v2, v2, v3, vs0 ; CHECK-REG-NEXT: blr ; ; CHECK-FISL-LABEL: test22: ; CHECK-FISL: # %bb.0: # %entry ; CHECK-FISL-NEXT: xvcmpgtsp vs1, v5, v4 ; CHECK-FISL-NEXT: xvcmpgtsp vs0, v4, v5 -; CHECK-FISL-NEXT: xxlnor vs0, vs0, vs1 -; CHECK-FISL-NEXT: xxsel v2, v3, v2, vs0 +; CHECK-FISL-NEXT: xxlor vs0, vs0, vs1 +; CHECK-FISL-NEXT: xxsel v2, v2, v3, vs0 ; CHECK-FISL-NEXT: blr ; ; CHECK-LE-LABEL: test22: ; CHECK-LE: # %bb.0: # %entry ; CHECK-LE-NEXT: xvcmpgtsp vs0, v5, v4 ; CHECK-LE-NEXT: xvcmpgtsp vs1, v4, v5 -; CHECK-LE-NEXT: xxlnor vs0, vs1, vs0 -; CHECK-LE-NEXT: xxsel v2, v3, v2, vs0 +; CHECK-LE-NEXT: xxlor vs0, vs1, vs0 +; CHECK-LE-NEXT: xxsel v2, v2, v3, vs0 ; CHECK-LE-NEXT: blr entry: %m = fcmp ueq <4 x float> %c, %d diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-setcc.ll @@ -285,9 +285,8 @@ ; CHECK-NEXT: vle64.v v28, (a0) ; CHECK-NEXT: vle64.v v8, (a1) ; CHECK-NEXT: vmfle.vv v25, v8, v28 -; CHECK-NEXT: vsetivli a0, 8, e8,mf2,ta,mu -; CHECK-NEXT: vmnand.mm v25, v25, v25 ; CHECK-NEXT: vsetivli a0, 8, e8,m1,ta,mu +; CHECK-NEXT: vmnand.mm v25, v25, v25 ; CHECK-NEXT: vse1.v v25, (a2) ; CHECK-NEXT: ret %a = load <8 x double>, <8 x double>* %x @@ -358,8 +357,8 @@ ; CHECK-NEXT: vsetvli a4, a3, e32,m8,ta,mu ; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: vle32.v v16, (a1) -; CHECK-NEXT: vmflt.vv v25, v16, v8 -; CHECK-NEXT: vmflt.vv v26, v8, v16 +; CHECK-NEXT: vmflt.vv v25, v8, v16 +; CHECK-NEXT: vmflt.vv v26, v16, v8 ; CHECK-NEXT: vsetvli a0, a3, e8,m2,ta,mu ; CHECK-NEXT: vmnor.mm v25, v26, v25 ; CHECK-NEXT: vse1.v v25, (a2) @@ -395,8 +394,8 @@ ; CHECK-NEXT: vsetivli a3, 16, e64,m8,ta,mu 
; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: vle64.v v16, (a1) -; CHECK-NEXT: vmflt.vv v25, v16, v8 -; CHECK-NEXT: vmflt.vv v26, v8, v16 +; CHECK-NEXT: vmflt.vv v25, v8, v16 +; CHECK-NEXT: vmflt.vv v26, v16, v8 ; CHECK-NEXT: vsetivli a0, 16, e8,m1,ta,mu ; CHECK-NEXT: vmor.mm v25, v26, v25 ; CHECK-NEXT: vse1.v v25, (a2) @@ -433,9 +432,8 @@ ; CHECK-NEXT: vle16.v v26, (a0) ; CHECK-NEXT: vmfeq.vv v27, v25, v25 ; CHECK-NEXT: vmfeq.vv v25, v26, v26 -; CHECK-NEXT: vsetivli a0, 4, e8,mf2,ta,mu -; CHECK-NEXT: vmand.mm v25, v25, v27 ; CHECK-NEXT: vsetivli a0, 4, e8,m1,ta,mu +; CHECK-NEXT: vmand.mm v25, v25, v27 ; CHECK-NEXT: vse1.v v25, (a2) ; CHECK-NEXT: ret %a = load <4 x half>, <4 x half>* %x @@ -453,9 +451,8 @@ ; CHECK-NEXT: vle16.v v26, (a0) ; CHECK-NEXT: vmfne.vv v27, v25, v25 ; CHECK-NEXT: vmfne.vv v25, v26, v26 -; CHECK-NEXT: vsetivli a0, 2, e8,mf2,ta,mu -; CHECK-NEXT: vmor.mm v25, v25, v27 ; CHECK-NEXT: vsetivli a0, 2, e8,m1,ta,mu +; CHECK-NEXT: vmor.mm v25, v25, v27 ; CHECK-NEXT: vse1.v v25, (a2) ; CHECK-NEXT: ret %a = load <2 x half>, <2 x half>* %x @@ -747,9 +744,8 @@ ; CHECK-NEXT: vsetivli a2, 8, e64,m4,ta,mu ; CHECK-NEXT: vle64.v v28, (a0) ; CHECK-NEXT: vmfge.vf v25, v28, fa0 -; CHECK-NEXT: vsetivli a0, 8, e8,mf2,ta,mu -; CHECK-NEXT: vmnand.mm v25, v25, v25 ; CHECK-NEXT: vsetivli a0, 8, e8,m1,ta,mu +; CHECK-NEXT: vmnand.mm v25, v25, v25 ; CHECK-NEXT: vse1.v v25, (a1) ; CHECK-NEXT: ret %a = load <8 x double>, <8 x double>* %x @@ -820,8 +816,8 @@ ; CHECK-NEXT: addi a2, zero, 32 ; CHECK-NEXT: vsetvli a3, a2, e32,m8,ta,mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmfgt.vf v25, v8, fa0 -; CHECK-NEXT: vmflt.vf v26, v8, fa0 +; CHECK-NEXT: vmflt.vf v25, v8, fa0 +; CHECK-NEXT: vmfgt.vf v26, v8, fa0 ; CHECK-NEXT: vsetvli a0, a2, e8,m2,ta,mu ; CHECK-NEXT: vmnor.mm v25, v26, v25 ; CHECK-NEXT: vse1.v v25, (a1) @@ -857,8 +853,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli a2, 16, e64,m8,ta,mu ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmfgt.vf v25, v8, fa0 -; CHECK-NEXT: vmflt.vf v26, v8, fa0 +; CHECK-NEXT: vmflt.vf v25, v8, fa0 +; CHECK-NEXT: vmfgt.vf v26, v8, fa0 ; CHECK-NEXT: vsetivli a0, 16, e8,m1,ta,mu ; CHECK-NEXT: vmor.mm v25, v26, v25 ; CHECK-NEXT: vse1.v v25, (a1) @@ -896,9 +892,8 @@ ; CHECK-NEXT: vfmv.v.f v26, fa0 ; CHECK-NEXT: vmfeq.vf v27, v26, fa0 ; CHECK-NEXT: vmfeq.vv v26, v25, v25 -; CHECK-NEXT: vsetivli a0, 4, e8,mf2,ta,mu -; CHECK-NEXT: vmand.mm v25, v26, v27 ; CHECK-NEXT: vsetivli a0, 4, e8,m1,ta,mu +; CHECK-NEXT: vmand.mm v25, v26, v27 ; CHECK-NEXT: vse1.v v25, (a1) ; CHECK-NEXT: ret %a = load <4 x half>, <4 x half>* %x @@ -917,9 +912,8 @@ ; CHECK-NEXT: vfmv.v.f v26, fa0 ; CHECK-NEXT: vmfne.vf v27, v26, fa0 ; CHECK-NEXT: vmfne.vv v26, v25, v25 -; CHECK-NEXT: vsetivli a0, 2, e8,mf2,ta,mu -; CHECK-NEXT: vmor.mm v25, v26, v27 ; CHECK-NEXT: vsetivli a0, 2, e8,m1,ta,mu +; CHECK-NEXT: vmor.mm v25, v26, v27 ; CHECK-NEXT: vse1.v v25, (a1) ; CHECK-NEXT: ret %a = load <2 x half>, <2 x half>* %x @@ -1212,9 +1206,8 @@ ; CHECK-NEXT: vsetivli a2, 8, e64,m4,ta,mu ; CHECK-NEXT: vle64.v v28, (a0) ; CHECK-NEXT: vmfle.vf v25, v28, fa0 -; CHECK-NEXT: vsetivli a0, 8, e8,mf2,ta,mu -; CHECK-NEXT: vmnand.mm v25, v25, v25 ; CHECK-NEXT: vsetivli a0, 8, e8,m1,ta,mu +; CHECK-NEXT: vmnand.mm v25, v25, v25 ; CHECK-NEXT: vse1.v v25, (a1) ; CHECK-NEXT: ret %a = load <8 x double>, <8 x double>* %x @@ -1285,8 +1278,8 @@ ; CHECK-NEXT: addi a2, zero, 32 ; CHECK-NEXT: vsetvli a3, a2, e32,m8,ta,mu ; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: vmflt.vf v25, v8, fa0 -; CHECK-NEXT: vmfgt.vf v26, v8, fa0 +; 
CHECK-NEXT: vmfgt.vf v25, v8, fa0 +; CHECK-NEXT: vmflt.vf v26, v8, fa0 ; CHECK-NEXT: vsetvli a0, a2, e8,m2,ta,mu ; CHECK-NEXT: vmnor.mm v25, v26, v25 ; CHECK-NEXT: vse1.v v25, (a1) @@ -1322,8 +1315,8 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli a2, 16, e64,m8,ta,mu ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: vmflt.vf v25, v8, fa0 -; CHECK-NEXT: vmfgt.vf v26, v8, fa0 +; CHECK-NEXT: vmfgt.vf v25, v8, fa0 +; CHECK-NEXT: vmflt.vf v26, v8, fa0 ; CHECK-NEXT: vsetivli a0, 16, e8,m1,ta,mu ; CHECK-NEXT: vmor.mm v25, v26, v25 ; CHECK-NEXT: vse1.v v25, (a1) @@ -1359,11 +1352,10 @@ ; CHECK-NEXT: vsetivli a2, 4, e16,m1,ta,mu ; CHECK-NEXT: vle16.v v25, (a0) ; CHECK-NEXT: vfmv.v.f v26, fa0 -; CHECK-NEXT: vmfeq.vf v27, v26, fa0 -; CHECK-NEXT: vmfeq.vv v26, v25, v25 -; CHECK-NEXT: vsetivli a0, 4, e8,mf2,ta,mu -; CHECK-NEXT: vmand.mm v25, v27, v26 +; CHECK-NEXT: vmfeq.vv v27, v25, v25 +; CHECK-NEXT: vmfeq.vf v25, v26, fa0 ; CHECK-NEXT: vsetivli a0, 4, e8,m1,ta,mu +; CHECK-NEXT: vmand.mm v25, v25, v27 ; CHECK-NEXT: vse1.v v25, (a1) ; CHECK-NEXT: ret %a = load <4 x half>, <4 x half>* %x @@ -1380,11 +1372,10 @@ ; CHECK-NEXT: vsetivli a2, 2, e16,m1,ta,mu ; CHECK-NEXT: vle16.v v25, (a0) ; CHECK-NEXT: vfmv.v.f v26, fa0 -; CHECK-NEXT: vmfne.vf v27, v26, fa0 -; CHECK-NEXT: vmfne.vv v26, v25, v25 -; CHECK-NEXT: vsetivli a0, 2, e8,mf2,ta,mu -; CHECK-NEXT: vmor.mm v25, v27, v26 +; CHECK-NEXT: vmfne.vv v27, v25, v25 +; CHECK-NEXT: vmfne.vf v25, v26, fa0 ; CHECK-NEXT: vsetivli a0, 2, e8,m1,ta,mu +; CHECK-NEXT: vmor.mm v25, v25, v27 ; CHECK-NEXT: vse1.v v25, (a1) ; CHECK-NEXT: ret %a = load <2 x half>, <2 x half>* %x
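
Usage note: callers of the new TargetLowering::LegalizeSetCCCondCode() hook are
expected to handle its three outcomes the same way VectorLegalizer::ExpandSETCC
does in the patch. A minimal sketch of that calling convention (illustrative
only; `Node`, `DAG` and `TLI` are assumed to be in scope and are not part of
the patch):

  // Assumes Node is a non-strict ISD::SETCC whose condition code may be
  // illegal for the operand type.
  SDValue LHS = Node->getOperand(0);
  SDValue RHS = Node->getOperand(1);
  SDValue CC = Node->getOperand(2);
  SDValue Chain; // Left null: no chain for non-strict FP comparisons.
  bool NeedInvert;
  EVT VT = Node->getValueType(0);
  SDLoc dl(Node);

  if (TLI.LegalizeSetCCCondCode(DAG, VT, LHS, RHS, CC, NeedInvert, dl,
                                Chain)) {
    // Outcome 1: operands swapped and/or condcode replaced. CC is still set,
    // so rebuild the SETCC with the rewritten operands/condition.
    if (CC.getNode())
      LHS = DAG.getSetCC(dl, VT, LHS, RHS, cast<CondCodeSDNode>(CC)->get());
    // Outcome 2: the condcode was inverted; per the header comment, the
    // caller must undo that on the result.
    if (NeedInvert)
      LHS = DAG.getLogicalNOT(dl, LHS, VT);
    // Outcome 3: CC is null and NeedInvert is false. LHS already holds the
    // AND/OR expansion built by the hook, so there is nothing to rebuild.
  }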
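
The AND/OR expansion inside the hook leans on the bit layout of ISD::CondCode:
for FP predicates, bit 3 (0x8) distinguishes the unordered variants from the
ordered ones, the low three bits carry the relation, and bit 4 (0x10) selects
the ordering-agnostic forms. A self-contained check of that arithmetic (enum
values mirrored from llvm/include/llvm/CodeGen/ISDOpcodes.h as of this patch;
re-verify against the header if the enum ever changes):

  #include <cassert>

  // Mirror of ISD::CondCode: FP predicates occupy 0..15, the
  // ordering-agnostic forms start at 0x10.
  enum CondCode {
    SETFALSE, SETOEQ, SETOGT, SETOGE, SETOLT, SETOLE, SETONE, SETO,
    SETUO, SETUEQ, SETUGT, SETUGE, SETULT, SETULE, SETUNE, SETTRUE,
    SETFALSE2, SETEQ, SETGT, SETGE, SETLT, SETLE, SETNE, SETTRUE2
  };

  int main() {
    // CC2 = (CCCode & 0x8) ? SETUO : SETO picks the ordering check that
    // gets OR'd in (unordered predicates) or AND'd in (ordered ones).
    assert((SETULT & 0x8) != 0 && (SETOLT & 0x8) == 0);

    // CC1 = (CCCode & 0x7) | 0x10 strips the ordering quality, so e.g.
    // SETULT expands to (x setlt y) OR (x setuo y).
    assert(((SETULT & 0x7) | 0x10) == SETLT);
    assert(((SETOGT & 0x7) | 0x10) == SETGT);
    return 0;
  }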
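
On the AArch64 side, marking a condcode such as SETUEQ as Expand for these
vector types routes it through the hook before any scalarisation can happen.
SETUEQ has no direct FCM* encoding, but its logical inverse does: the hook
returns with CC rewritten to SETONE and NeedInvert set, and the vector
legalizer materializes the inversion as the EOR seen in the fcmp_ueq_v16f16
test above. The relationship the rewrite relies on, as a one-line check
(illustrative):

  // NOT(ueq) == one, and SETONE maps onto FCMNE; the trailing logical NOT
  // restores the original unordered-equal semantics.
  assert(ISD::getSetCCInverse(ISD::SETUEQ, MVT::nxv8f16) == ISD::SETONE);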