Index: llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -327,7 +327,6 @@ bool isOffsetMultipleOf(SDNode *N, unsigned Val) const; void transferMemOperands(SDNode *N, SDNode *Result); - MachineSDNode *flipSignBit(const SDValue &N, SDNode **SignBit = nullptr); }; } // end anonymous namespace @@ -4138,51 +4137,6 @@ CurDAG->setNodeMemRefs(cast(Result), {MemOp}); } -/// This method returns a node after flipping the MSB of each element -/// of vector integer type. Additionally, if SignBitVec is non-null, -/// this method sets a node with one at MSB of all elements -/// and zero at other bits in SignBitVec. -MachineSDNode * -PPCDAGToDAGISel::flipSignBit(const SDValue &N, SDNode **SignBitVec) { - SDLoc dl(N); - EVT VecVT = N.getValueType(); - if (VecVT == MVT::v4i32) { - if (SignBitVec) { - SDNode *ZV = CurDAG->getMachineNode(PPC::V_SET0, dl, MVT::v4i32); - *SignBitVec = CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT, - SDValue(ZV, 0)); - } - return CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT, N); - } - else if (VecVT == MVT::v8i16) { - SDNode *Hi = CurDAG->getMachineNode(PPC::LIS, dl, MVT::i32, - getI32Imm(0x8000, dl)); - SDNode *ScaImm = CurDAG->getMachineNode(PPC::ORI, dl, MVT::i32, - SDValue(Hi, 0), - getI32Imm(0x8000, dl)); - SDNode *VecImm = CurDAG->getMachineNode(PPC::MTVSRWS, dl, VecVT, - SDValue(ScaImm, 0)); - /* - Alternatively, we can do this as follow to use VRF instead of GPR. 
- vspltish 5, 1 - vspltish 6, 15 - vslh 5, 6, 5 - */ - if (SignBitVec) *SignBitVec = VecImm; - return CurDAG->getMachineNode(PPC::VADDUHM, dl, VecVT, N, - SDValue(VecImm, 0)); - } - else if (VecVT == MVT::v16i8) { - SDNode *VecImm = CurDAG->getMachineNode(PPC::XXSPLTIB, dl, MVT::i32, - getI32Imm(0x80, dl)); - if (SignBitVec) *SignBitVec = VecImm; - return CurDAG->getMachineNode(PPC::VADDUBM, dl, VecVT, N, - SDValue(VecImm, 0)); - } - else - llvm_unreachable("Unsupported vector data type for flipSignBit"); -} - // Select - Convert the specified operand from a target-independent to a // target-specific node if it hasn't already been changed. void PPCDAGToDAGISel::Select(SDNode *N) { @@ -4995,55 +4949,6 @@ return; } } - case ISD::ABS: { - assert(PPCSubTarget->hasP9Vector() && "ABS is supported with P9 Vector"); - - // For vector absolute difference, we use VABSDUW instruction of POWER9. - // Since VABSDU instructions are for unsigned integers, we need adjustment - // for signed integers. - // For abs(sub(a, b)), we generate VABSDUW(a+0x80000000, b+0x80000000). - // Otherwise, abs(sub(-1, 0)) returns 0xFFFFFFFF(=-1) instead of 1. - // For abs(a), we generate VABSDUW(a+0x80000000, 0x80000000). - EVT VecVT = N->getOperand(0).getValueType(); - SDNode *AbsOp = nullptr; - unsigned AbsOpcode; - - if (VecVT == MVT::v4i32) - AbsOpcode = PPC::VABSDUW; - else if (VecVT == MVT::v8i16) - AbsOpcode = PPC::VABSDUH; - else if (VecVT == MVT::v16i8) - AbsOpcode = PPC::VABSDUB; - else - llvm_unreachable("Unsupported vector data type for ISD::ABS"); - - // Even for signed integers, we can skip adjustment if all values are - // known to be positive (as signed integer) due to zero-extended inputs. 
- if (N->getOperand(0).getOpcode() == ISD::SUB && - N->getOperand(0)->getOperand(0).getOpcode() == ISD::ZERO_EXTEND && - N->getOperand(0)->getOperand(1).getOpcode() == ISD::ZERO_EXTEND) { - AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, - SDValue(N->getOperand(0)->getOperand(0)), - SDValue(N->getOperand(0)->getOperand(1))); - ReplaceNode(N, AbsOp); - return; - } - if (N->getOperand(0).getOpcode() == ISD::SUB) { - SDValue SubVal = N->getOperand(0); - SDNode *Op0 = flipSignBit(SubVal->getOperand(0)); - SDNode *Op1 = flipSignBit(SubVal->getOperand(1)); - AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, - SDValue(Op0, 0), SDValue(Op1, 0)); - } - else { - SDNode *Op1 = nullptr; - SDNode *Op0 = flipSignBit(N->getOperand(0), &Op1); - AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, SDValue(Op0, 0), - SDValue(Op1, 0)); - } - ReplaceNode(N, AbsOp); - return; - } } SelectCode(N); Index: llvm/lib/Target/PowerPC/PPCISelLowering.h =================================================================== --- llvm/lib/Target/PowerPC/PPCISelLowering.h +++ llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -374,6 +374,13 @@ /// and thereby have no chain. SWAP_NO_CHAIN, + /// An SDNode for Power9 absolute value difference. + /// operand #0 vector + /// operand #1 vector + /// operand #2 constant i32 0 or 1, to indicate whether needs to patch + /// the most significant bit for signed i32 + ABSD, + /// QVFPERM = This corresponds to the QPX qvfperm instruction. 
QVFPERM, @@ -995,6 +1002,7 @@ SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const; @@ -1098,6 +1106,8 @@ SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineVSelect(SDNode *N, DAGCombinerInfo &DCI) const; /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces /// SETCC with integer subtraction when (1) there is a legal way of doing it Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp =================================================================== --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -251,12 +251,6 @@ setOperationAction(ISD::UREM, MVT::i64, Expand); } - if (Subtarget.hasP9Vector()) { - setOperationAction(ISD::ABS, MVT::v4i32, Legal); - setOperationAction(ISD::ABS, MVT::v8i16, Legal); - setOperationAction(ISD::ABS, MVT::v16i8, Legal); - } - // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM. setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); @@ -554,6 +548,7 @@ // add/sub are legal for all supported vector VT's. 
setOperationAction(ISD::ADD, VT, Legal); setOperationAction(ISD::SUB, VT, Legal); + setOperationAction(ISD::ABS, VT, Custom); // Vector instructions introduced in P8 if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) { @@ -659,6 +654,9 @@ setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); + if (!Subtarget.hasP8Altivec()) + setOperationAction(ISD::ABS, MVT::v2i64, Expand); + addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass); addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass); addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass); @@ -1081,6 +1079,11 @@ setTargetDAGCombine(ISD::FSQRT); } + if (Subtarget.hasP9Vector()) { + setTargetDAGCombine(ISD::ABS); + setTargetDAGCombine(ISD::VSELECT); + } + // Darwin long double math library functions have $LDBL128 appended. if (Subtarget.isDarwin()) { setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128"); @@ -1341,6 +1344,7 @@ case PPCISD::RFEBB: return "PPCISD::RFEBB"; case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD"; case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN"; + case PPCISD::ABSD: return "PPCISD::ABSD"; case PPCISD::QVFPERM: return "PPCISD::QVFPERM"; case PPCISD::QVGPCI: return "PPCISD::QVGPCI"; case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI"; @@ -9001,35 +9005,6 @@ return DAG.getRegister(PPC::R2, MVT::i32); } - // We are looking for absolute values here. 
- // The idea is to try to fit one of two patterns: - // max (a, (0-a)) OR max ((0-a), a) - if (Subtarget.hasP9Vector() && - (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw || - IntrinsicID == Intrinsic::ppc_altivec_vmaxsh || - IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) { - SDValue V1 = Op.getOperand(1); - SDValue V2 = Op.getOperand(2); - if (V1.getSimpleValueType() == V2.getSimpleValueType() && - (V1.getSimpleValueType() == MVT::v4i32 || - V1.getSimpleValueType() == MVT::v8i16 || - V1.getSimpleValueType() == MVT::v16i8)) { - if ( V1.getOpcode() == ISD::SUB && - ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) && - V1.getOperand(1) == V2 ) { - // Generate the abs instruction with the operands - return DAG.getNode(ISD::ABS, dl, V2.getValueType(),V2); - } - - if ( V2.getOpcode() == ISD::SUB && - ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) && - V2.getOperand(1) == V1 ) { - // Generate the abs instruction with the operands - return DAG.getNode(ISD::ABS, dl, V1.getValueType(),V1); - } - } - } - // If this is a lowered altivec predicate compare, CompareOpc is set to the // opcode number of the comparison. 
int CompareOpc;
@@ -9570,6 +9545,43 @@
   }
 }

+/// Custom lowering for vector ISD::ABS.
+///
+/// abs(x) is emitted as smax(x, 0 - x); until ISD::SMAX can be used the signed
+/// max is produced through the Altivec vmaxs* intrinsic of matching width.
+SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS");
+
+  EVT VT = Op.getValueType();
+  assert(VT.isVector() &&
+         "Only set vector abs as custom, scalar abs shouldn't reach here!");
+  assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
+          VT == MVT::v16i8) &&
+         "Unexpected vector element type!");
+  assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) &&
+         "Current subtarget doesn't support smax v2i64!");
+
+  // SMAX patch hasn't landed yet, so use intrinsic first
+  // TODO: Should use SMAX directly once SMAX patch landed
+  // Any type not listed below (v4i32 in practice) uses the word form.
+  Intrinsic::ID MaxIID =
+      VT == MVT::v2i64   ? Intrinsic::ppc_altivec_vmaxsd
+      : VT == MVT::v8i16 ? Intrinsic::ppc_altivec_vmaxsh
+      : VT == MVT::v16i8 ? Intrinsic::ppc_altivec_vmaxsb
+                         : Intrinsic::ppc_altivec_vmaxsw;
+
+  // Build the two smax operands:
+  //   Src     = x
+  //   Negated = 0 - x
+  SDLoc dl(Op);
+  SDValue Src = Op.getOperand(0);
+  SDValue Negated =
+      DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, dl, VT), Src);
+
+  // abs x  ==>  smax(x, -x)
+  return BuildIntrinsicOp(MaxIID, Src, Negated, DAG, dl, VT);
+}
+
 /// LowerOperation - Provide custom lowering hooks for some operations.
 ///
 SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -9622,6 +9634,7 @@
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::MUL: return LowerMUL(Op, DAG);
+  case ISD::ABS: return LowerABS(Op, DAG);

   // For counter-based loop handling.
case ISD::INTRINSIC_W_CHAIN: return SDValue(); @@ -12983,6 +12996,39 @@ } } } + + // Combine vmaxsw/h/b(a, a's negation) to abs(a) + // Expose the vabsduw/h/b opportunity for down stream + if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Vector() && + (IID == Intrinsic::ppc_altivec_vmaxsw || + IID == Intrinsic::ppc_altivec_vmaxsh || + IID == Intrinsic::ppc_altivec_vmaxsb)) { + SDValue V1 = N->getOperand(1); + SDValue V2 = N->getOperand(2); + if ((V1.getSimpleValueType() == MVT::v4i32 || + V1.getSimpleValueType() == MVT::v8i16 || + V1.getSimpleValueType() == MVT::v16i8) && + V1.getSimpleValueType() == V2.getSimpleValueType()) { + // (0-a, a) + if (V1.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) && + V1.getOperand(1) == V2) { + return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2); + } + // (a, 0-a) + if (V2.getOpcode() == ISD::SUB && + ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) && + V2.getOperand(1) == V1) { + return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1); + } + // (x-y, y-x) + if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB && + V1.getOperand(0) == V2.getOperand(1) && + V1.getOperand(1) == V2.getOperand(0)) { + return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1); + } + } + } } break; @@ -13215,6 +13261,10 @@ } case ISD::BUILD_VECTOR: return DAGCombineBuildVector(N, DCI); + case ISD::ABS: + return combineABS(N, DCI); + case ISD::VSELECT: + return combineVSelect(N, DCI); } return SDValue(); @@ -14472,3 +14522,93 @@ // For non-constant masks, we can always use the record-form and. 
return true;
 }
+
+/// Fold ISD::ABS of a vector SUB into PPCISD::ABSD (Power9 vabsdu*).
+/// Returns the replacement node, or an empty SDValue when nothing applies.
+SDValue PPCTargetLowering::combineABS(SDNode *N, DAGCombinerInfo &DCI) const {
+  assert((N->getOpcode() == ISD::ABS) && "Need ABS node here");
+  assert(Subtarget.hasP9Vector() &&
+         "Only combine this when P9 vector supported!");
+
+  // PPCISD::ABSD only has selection patterns for v4i32/v8i16/v16i8, so an ABS
+  // of any other type (or of something that is not a SUB) is left untouched.
+  SDValue Sub = N->getOperand(0);
+  EVT VT = Sub.getValueType();
+  if (Sub.getOpcode() != ISD::SUB ||
+      (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8))
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue LHS = Sub.getOperand(0);
+  SDValue RHS = Sub.getOperand(1);
+  auto IsZExt = [](SDValue V) {
+    return V.getOpcode() == ISD::ZERO_EXTEND ||
+           V.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG;
+  };
+  // Zero-extended inputs are non-negative as signed integers, so the unsigned
+  // absolute difference is already correct: no sign-bit patching (flag 0).
+  if (IsZExt(LHS) && IsZExt(RHS))
+    return DAG.getNode(PPCISD::ABSD, dl, VT, LHS, RHS,
+                       DAG.getTargetConstant(0, dl, MVT::i32));
+  // For v4i32, flag 1 selects xvnegsp on both inputs followed by vabsduw.
+  // NOTE(review): this assumes the SUB does not wrap; nsw is not checked.
+  if (VT == MVT::v4i32 && Sub.hasOneUse())
+    return DAG.getNode(PPCISD::ABSD, dl, VT, LHS, RHS,
+                       DAG.getTargetConstant(1, dl, MVT::i32));
+  return SDValue();
+}
+
+SDValue PPCTargetLowering::combineVSelect(SDNode *N,
+                                          DAGCombinerInfo &DCI) const {
+  assert((N->getOpcode() == ISD::VSELECT) && "Need VSELECT node here");
+  assert(Subtarget.hasP9Vector() &&
+         "Only combine this when P9 vector supported!");
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue Cond = N->getOperand(0);
+  SDValue TrueOpnd = N->getOperand(1);
+  SDValue FalseOpnd = N->getOperand(2);
+  EVT VT = N->getOperand(1).getValueType();
+
+  if (Cond.getOpcode() != ISD::SETCC || TrueOpnd.getOpcode() != ISD::SUB ||
+      FalseOpnd.getOpcode() != ISD::SUB)
+    return SDValue();
+
+  // ABSD only available for type v4i32/v8i16/v16i8
+  if (VT != MVT::v4i32 && VT !=
MVT::v8i16 && VT != MVT::v16i8)
+    return SDValue();
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+  // Can only handle unsigned comparison here
+  switch (CC) {
+  default:
+    return SDValue();
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    break;
+  case ISD::SETULT:
+  case ISD::SETULE:
+    std::swap(TrueOpnd, FalseOpnd);
+    break;
+  }
+
+  SDValue CmpOpnd1 = Cond.getOperand(0);
+  SDValue CmpOpnd2 = Cond.getOperand(1);
+
+  // SETCC CmpOpnd1 CmpOpnd2 cond
+  // TrueOpnd = CmpOpnd1 - CmpOpnd2
+  // FalseOpnd = CmpOpnd2 - CmpOpnd1
+  if (TrueOpnd.getOperand(0) == CmpOpnd1 &&
+      TrueOpnd.getOperand(1) == CmpOpnd2 &&
+      FalseOpnd.getOperand(0) == CmpOpnd2 &&
+      FalseOpnd.getOperand(1) == CmpOpnd1) {
+    return DAG.getNode(PPCISD::ABSD, dl, N->getOperand(1).getValueType(),
+                       CmpOpnd1, CmpOpnd2,
+                       DAG.getTargetConstant(0, dl, MVT::i32));
+  }
+
+  return SDValue();
+}
Index: llvm/lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -67,6 +67,10 @@
 def SDTVecConv : SDTypeProfile<1, 2, [
   SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>
 ]>;

+def SDTVabsd : SDTypeProfile<1, 3, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32>
+]>;
+
 def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
                        [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -79,6 +83,7 @@
 def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>;
 def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>;
 def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>;
+def PPCabsd : SDNode<"PPCISD::ABSD", SDTVabsd, []>;

 multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
                     string asmstr, InstrItinClass itin, Intrinsic Int,
@@ -4029,3 +4034,17 @@
   }
 }

+let Predicates = [HasP9Altivec] in {
+
+  def : Pat<(v4i32 (PPCabsd v4i32:$A, v4i32:$B, (i32 0))),
+            (v4i32 (VABSDUW $A, $B))>;
+
+  def : Pat<(v8i16 (PPCabsd v8i16:$A, v8i16:$B, (i32 0))),
+            (v8i16
(VABSDUH $A, $B))>; + + def : Pat<(v16i8 (PPCabsd v16i8:$A, v16i8:$B, (i32 0))), + (v16i8 (VABSDUB $A, $B))>; + + def : Pat<(v4i32 (PPCabsd v4i32:$A, v4i32:$B, (i32 1))), + (v4i32 (VABSDUW (XVNEGSP $A), (XVNEGSP $B)))>; +} Index: llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll =================================================================== --- llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll +++ llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll @@ -1,42 +1,40 @@ -; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s -; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR8 -implicit-check-not vabsdu +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR8 -implicit-check-not vabsdu -; Function Attrs: nounwind readnone define <4 x i32> @simple_absv_32(<4 x i32> %a) local_unnamed_addr { entry: %sub.i = sub <4 x i32> zeroinitializer, %a %0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %a, <4 x i32> %sub.i) ret <4 x i32> %0 ; CHECK-LABEL: simple_absv_32 -; CHECK-DAG: vxor {{[0-9]+}}, [[REG:[0-9]+]], [[REG]] -; CHECK-DAG: xvnegsp 34, 34 -; CHECK-DAG: xvnegsp 35, {{[0-9]+}} -; CHECK-NEXT: vabsduw 2, 2, {{[0-9]+}} +; CHECK-NOT: vxor +; CHECK-NOT: vabsduw +; CHECK: vnegw v[[REG:[0-9]+]], v2 +; CHECK-NEXT: vmaxsw v2, v2, v[[REG]] ; CHECK-NEXT: blr ; CHECK-PWR8-LABEL: simple_absv_32 -; CHECK-PWR8: xxlxor -; CHECK-PWR8: vsubuwm -; CHECK-PWR8: vmaxsw -; 
CHECK-PWR8: blr +; CHECK-PWR8: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-PWR8-NEXT: vsubuwm v[[SUB:[0-9]+]], v[[ZERO]], v2 +; CHECK-PWR8-NEXT: vmaxsw v2, v2, v[[SUB]] +; CHECK-PWR8-NEXT: blr } -; Function Attrs: nounwind readnone define <4 x i32> @simple_absv_32_swap(<4 x i32> %a) local_unnamed_addr { entry: %sub.i = sub <4 x i32> zeroinitializer, %a %0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %sub.i, <4 x i32> %a) ret <4 x i32> %0 ; CHECK-LABEL: simple_absv_32_swap -; CHECK-DAG: vxor {{[0-9]+}}, [[REG:[0-9]+]], [[REG]] -; CHECK-DAG: xvnegsp 34, 34 -; CHECK-DAG: xvnegsp 35, {{[0-9]+}} -; CHECK-NEXT: vabsduw 2, 2, {{[0-9]+}} +; CHECK-NOT: vxor +; CHECK-NOT: vabsduw +; CHECK: vnegw v[[REG:[0-9]+]], v2 +; CHECK-NEXT: vmaxsw v2, v2, v[[REG]] ; CHECK-NEXT: blr ; CHECK-PWR8-LABEL: simple_absv_32_swap -; CHECK-PWR8: xxlxor -; CHECK-PWR8: vsubuwm -; CHECK-PWR8: vmaxsw +; CHECK-PWR8: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-PWR8-NEXT: vsubuwm v[[SUB:[0-9]+]], v[[ZERO]], v2 +; CHECK-PWR8-NEXT: vmaxsw v2, v[[SUB]], v2 ; CHECK-PWR8: blr } @@ -46,37 +44,39 @@ %0 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %a, <8 x i16> %sub.i) ret <8 x i16> %0 ; CHECK-LABEL: simple_absv_16 -; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}} -; CHECK-NEXT: vadduhm 2, 2, [[IMM:[0-9]+]] -; CHECK-NEXT: vabsduh 2, 2, [[IMM]] +; CHECK-NOT: mtvsrws +; CHECK-NOT: vabsduh +; CHECK: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-NEXT: vsubuhm v[[REG:[0-9]+]], v[[ZERO]], v2 +; CHECK-NEXT: vmaxsh v2, v2, v[[REG]] ; CHECK-NEXT: blr ; CHECK-PWR8-LABEL: simple_absv_16 -; CHECK-PWR8: xxlxor -; CHECK-PWR8: vsubuhm -; CHECK-PWR8: vmaxsh +; CHECK-PWR8: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-PWR8-NEXT: vsubuhm v[[REG:[0-9]+]], v[[ZERO]], v2 +; CHECK-PWR8-NEXT: vmaxsh v2, v2, v[[REG]] ; CHECK-PWR8: blr } -; Function Attrs: nounwind readnone define <16 x i8> @simple_absv_8(<16 x i8> %a) local_unnamed_addr { entry: %sub.i = sub <16 x i8> 
zeroinitializer, %a %0 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %a, <16 x i8> %sub.i) ret <16 x i8> %0 ; CHECK-LABEL: simple_absv_8 -; CHECK: xxspltib {{[0-9]+}}, 128 -; CHECK-NEXT: vaddubm 2, 2, [[IMM:[0-9]+]] -; CHECK-NEXT: vabsdub 2, 2, [[IMM]] +; CHECK-NOT: xxspltib +; CHECK-NOT: vabsdub +; CHECK: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-NEXT: vsububm v[[REG:[0-9]+]], v[[ZERO]], v2 +; CHECK-NEXT: vmaxsb v2, v2, v[[REG]] ; CHECK-NEXT: blr ; CHECK-PWR8-LABEL: simple_absv_8 -; CHECK-PWR8: xxlxor -; CHECK-PWR8: vsububm -; CHECK-PWR8: vmaxsb +; CHECK-PWR8: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-PWR8-NEXT: vsububm v[[REG:[0-9]+]], v[[ZERO]], v2 +; CHECK-PWR8-NEXT: vmaxsb v2, v2, v[[REG]] ; CHECK-PWR8: blr } ; The select pattern can only be detected for v4i32. -; Function Attrs: norecurse nounwind readnone define <4 x i32> @sub_absv_32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr { entry: %0 = sub nsw <4 x i32> %a, %b @@ -85,14 +85,63 @@ %3 = select <4 x i1> %1, <4 x i32> %0, <4 x i32> %2 ret <4 x i32> %3 ; CHECK-LABEL: sub_absv_32 -; CHECK-DAG: xvnegsp 34, 34 -; CHECK-DAG: xvnegsp 35, 35 -; CHECK-NEXT: vabsduw 2, 2, 3 +; CHECK-NOT: vsubuwm +; CHECK-NOT: vnegw +; CHECK-NOT: vmaxsw +; CHECK-DAG: xvnegsp v2, v2 +; CHECK-DAG: xvnegsp v3, v3 +; CHECK-NEXT: vabsduw v2, v{{[23]}}, v{{[23]}} ; CHECK-NEXT: blr ; CHECK-PWR8-LABEL: sub_absv_32 -; CHECK-PWR8: vsubuwm -; CHECK-PWR8: xxlxor -; CHECK-PWR8: blr +; CHECK-PWR8-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-PWR8-DAG: vsubuwm v[[SUB:[0-9]+]], v2, v3 +; CHECK-PWR8-NEXT: vsubuwm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]] +; CHECK-PWR8-NEXT: vmaxsw v2, v[[SUB]], v[[SUB1]] +; CHECK-PWR8-NEXT: blr +} + +define <8 x i16> @sub_absv_16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr { +entry: + %0 = sub nsw <8 x i16> %a, %b + %1 = icmp sgt <8 x i16> %0, + %2 = sub <8 x i16> zeroinitializer, %0 + %3 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> %2 + ret <8 x i16> %3 +; 
CHECK-LABEL: sub_absv_16 +; CHECK-NOT: vabsduh +; CHECK-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-DAG: vsubuhm v[[SUB:[0-9]+]], v2, v3 +; CHECK: vsubuhm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]] +; CHECK-NEXT: vmaxsh v2, v[[SUB]], v[[SUB1]] +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: sub_absv_16 +; CHECK-PWR8-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-PWR8-DAG: vsubuhm v[[SUB:[0-9]+]], v2, v3 +; CHECK-PWR8: vsubuhm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]] +; CHECK-PWR8-NEXT: vmaxsh v2, v[[SUB]], v[[SUB1]] +; CHECK-PWR8-NEXT: blr +} + +define <16 x i8> @sub_absv_8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr { +entry: + %0 = sub nsw <16 x i8> %a, %b + %1 = icmp sgt <16 x i8> %0, + %2 = sub <16 x i8> zeroinitializer, %0 + %3 = select <16 x i1> %1, <16 x i8> %0, <16 x i8> %2 + ret <16 x i8> %3 +; CHECK-LABEL: sub_absv_8 +; CHECK-NOT: vabsdub +; CHECK-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-DAG: vsububm v[[SUB:[0-9]+]], v2, v3 +; CHECK: vsububm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]] +; CHECK-NEXT: vmaxsb v2, v[[SUB]], v[[SUB1]] +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: sub_absv_8 +; CHECK-PWR8-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-PWR8-DAG: vsububm v[[SUB:[0-9]+]], v2, v3 +; CHECK-PWR8: vsububm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]] +; CHECK-PWR8-NEXT: vmaxsb v2, v[[SUB]], v[[SUB1]] +; CHECK-PWR8-NEXT: blr } ; FIXME: This does not produce the ISD::ABS that we are looking for. @@ -100,8 +149,7 @@ ; We do manage to find the word version of ABS but not the halfword. ; Threfore, we end up doing more work than is required with a pair of abs for word ; instead of just one for the halfword. 
-; Function Attrs: norecurse nounwind readnone -define <8 x i16> @sub_absv_16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr { +define <8 x i16> @sub_absv_16_ext(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr { entry: %0 = sext <8 x i16> %a to <8 x i32> %1 = sext <8 x i16> %b to <8 x i32> @@ -111,23 +159,25 @@ %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4 %6 = trunc <8 x i32> %5 to <8 x i16> ret <8 x i16> %6 -; CHECK-LABEL: sub_absv_16 +; CHECK-LABEL: sub_absv_16_ext ; CHECK-NOT: vabsduh ; CHECK: vabsduw +; CHECK-NOT: vnegw ; CHECK-NOT: vabsduh ; CHECK: vabsduw +; CHECK-NOT: vnegw ; CHECK-NOT: vabsduh ; CHECK: blr -; CHECK-PWR8-LABEL: sub_absv_16 -; CHECK-PWR8: vsubuwm -; CHECK-PWR8: xxlxor +; CHECK-PWR8-LABEL: sub_absv_16_ext +; CHECK-PWR8-DAG: vsubuwm +; CHECK-PWR8-DAG: xxlxor ; CHECK-PWR8: blr } ; FIXME: This does not produce ISD::ABS. This does not even vectorize correctly! ; This function should look like sub_absv_32 and sub_absv_16 except that the type is v16i8. ; Function Attrs: norecurse nounwind readnone -define <16 x i8> @sub_absv_8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr { +define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr { entry: %vecext = extractelement <16 x i8> %a, i32 0 %conv = zext i8 %vecext to i32 @@ -290,20 +340,19 @@ %conv122 = trunc i32 %15 to i8 %vecins123 = insertelement <16 x i8> %vecins115, i8 %conv122, i32 15 ret <16 x i8> %vecins123 -; CHECK-LABEL: sub_absv_8 +; CHECK-LABEL: sub_absv_8_ext ; CHECK-NOT: vabsdub ; CHECK: subf ; CHECK-NOT: vabsdub ; CHECK: xor ; CHECK-NOT: vabsdub ; CHECK: blr -; CHECK-PWR8-LABEL: sub_absv_8 +; CHECK-PWR8-LABEL: sub_absv_8_ext ; CHECK-PWR8: subf ; CHECK-PWR8: xor ; CHECK-PWR8: blr } -; Function Attrs: nounwind readnone define <4 x i32> @sub_absv_vec_32(<4 x i32> %a, <4 x i32> %b) local_unnamed_addr { entry: %sub = sub <4 x i32> %a, %b @@ -311,16 +360,21 @@ %0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %sub, <4 x i32> %sub.i) ret <4 x i32> %0 ; CHECK-LABEL: sub_absv_vec_32 -; CHECK: 
vabsduw 2, 2, 3 +; CHECK-NOT: vsubuwm +; CHECK-NOT: vnegw +; CHECK-NOT: vmaxsw +; CHECK-DAG: xvnegsp v2, v2 +; CHECK-DAG: xvnegsp v3, v3 +; CHECK-NEXT: vabsduw v2, v{{[23]}}, v{{[23]}} ; CHECK-NEXT: blr ; CHECK-PWR8-LABEL: sub_absv_vec_32 -; CHECK-PWR8: xxlxor -; CHECK-PWR8: vsubuwm -; CHECK-PWR8: vmaxsw -; CHECK-PWR8: blr +; CHECK-PWR8-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-PWR8-DAG: vsubuwm v[[SUB:[0-9]+]], v2, v3 +; CHECK-PWR8: vsubuwm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]] +; CHECK-PWR8-NEXT: vmaxsw v2, v[[SUB]], v[[SUB1]] +; CHECK-PWR8-NEXT: blr } -; Function Attrs: nounwind readnone define <8 x i16> @sub_absv_vec_16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr { entry: %sub = sub <8 x i16> %a, %b @@ -328,16 +382,21 @@ %0 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %sub, <8 x i16> %sub.i) ret <8 x i16> %0 ; CHECK-LABEL: sub_absv_vec_16 -; CHECK: vabsduh 2, 2, 3 +; CHECK-NOT: mtvsrws +; CHECK-NOT: vabsduh +; CHECK-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-DAG: vsubuhm v[[SUB:[0-9]+]], v2, v3 +; CHECK: vsubuhm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]] +; CHECK-NEXT: vmaxsh v2, v[[SUB]], v[[SUB1]] ; CHECK-NEXT: blr ; CHECK-PWR8-LABEL: sub_absv_vec_16 -; CHECK-PWR8: xxlxor -; CHECK-PWR8: vsubuhm -; CHECK-PWR8: vmaxsh -; CHECK-PWR8: blr +; CHECK-PWR8-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-PWR8-DAG: vsubuhm v[[SUB:[0-9]+]], v2, v3 +; CHECK-PWR8: vsubuhm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]] +; CHECK-PWR8-NEXT: vmaxsh v2, v[[SUB]], v[[SUB1]] +; CHECK-PWR8-NEXT: blr } -; Function Attrs: nounwind readnone define <16 x i8> @sub_absv_vec_8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr { entry: %sub = sub <16 x i8> %a, %b @@ -345,22 +404,316 @@ %0 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %sub, <16 x i8> %sub.i) ret <16 x i8> %0 ; CHECK-LABEL: sub_absv_vec_8 -; CHECK: vabsdub 2, 2, 3 +; CHECK-NOT: xxspltib +; CHECK-NOT: vabsdub +; CHECK-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] 
+; CHECK-DAG: vsububm v[[SUB:[0-9]+]], v2, v3 +; CHECK: vsububm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]] +; CHECK-NEXT: vmaxsb v2, v[[SUB]], v[[SUB1]] ; CHECK-NEXT: blr ; CHECK-PWR8-LABEL: sub_absv_vec_8 -; CHECK-PWR8: xxlxor -; CHECK-PWR8: vsububm +; CHECK-PWR8-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]] +; CHECK-PWR8-DAG: vsububm v[[SUB:[0-9]+]], v2, v3 +; CHECK-PWR8: vsububm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]] +; CHECK-PWR8-NEXT: vmaxsb v2, v[[SUB]], v[[SUB1]] +; CHECK-PWR8-NEXT: blr +} + +define <4 x i32> @zext_sub_absd32(<4 x i16>, <4 x i16>) local_unnamed_addr { + %3 = zext <4 x i16> %0 to <4 x i32> + %4 = zext <4 x i16> %1 to <4 x i32> + %5 = sub <4 x i32> %3, %4 + %6 = sub <4 x i32> zeroinitializer, %5 + %7 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %5, <4 x i32> %6) + ret <4 x i32> %7 +; CHECK-LABEL: zext_sub_absd32 +; CHECK-NOT: xvnegsp +; CHECK: vabsduw +; CHECK: blr +; CHECK-PWR8-LABEL: zext_sub_absd32 +; CHECK-PWR8: vmaxsw +; CHECK-PWR8: blr +} + +define <8 x i16> @zext_sub_absd16(<8 x i8>, <8 x i8>) local_unnamed_addr { + %3 = zext <8 x i8> %0 to <8 x i16> + %4 = zext <8 x i8> %1 to <8 x i16> + %5 = sub <8 x i16> %3, %4 + %6 = sub <8 x i16> zeroinitializer, %5 + %7 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %5, <8 x i16> %6) + ret <8 x i16> %7 +; CHECK-LABEL: zext_sub_absd16 +; CHECK-NOT: vadduhm +; CHECK: vabsduh +; CHECK: blr +; CHECK-PWR8-LABEL: zext_sub_absd16 +; CHECK-PWR8: vmaxsh +; CHECK-PWR8: blr +} + +define <16 x i8> @zext_sub_absd8(<16 x i4>, <16 x i4>) local_unnamed_addr { + %3 = zext <16 x i4> %0 to <16 x i8> + %4 = zext <16 x i4> %1 to <16 x i8> + %5 = sub <16 x i8> %3, %4 + %6 = sub <16 x i8> zeroinitializer, %5 + %7 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %5, <16 x i8> %6) + ret <16 x i8> %7 +; CHECK-LABEL: zext_sub_absd8 +; CHECK-NOT: vaddubm +; CHECK: vabsdub +; CHECK: blr +; CHECK-PWR8-LABEL: zext_sub_absd8 ; CHECK-PWR8: vmaxsb ; CHECK-PWR8: blr } +; To verify vabsdu* exploitation 
for ucmp + sub + select sequence + +define <4 x i32> @absd_int32_ugt(<4 x i32>, <4 x i32>) { + %3 = icmp ugt <4 x i32> %0, %1 + %4 = sub <4 x i32> %0, %1 + %5 = sub <4 x i32> %1, %0 + %6 = select <4 x i1> %3, <4 x i32> %4, <4 x i32> %5 + ret <4 x i32> %6 +; CHECK-LABEL: absd_int32_ugt +; CHECK-NOT: vcmpgtuw +; CHECK-NOT: xxsel +; CHECK: vabsduw v2, v2, v3 +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: absd_int32_ugt +; CHECK-PWR8: vcmpgtuw +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + +define <4 x i32> @absd_int32_uge(<4 x i32>, <4 x i32>) { + %3 = icmp uge <4 x i32> %0, %1 + %4 = sub <4 x i32> %0, %1 + %5 = sub <4 x i32> %1, %0 + %6 = select <4 x i1> %3, <4 x i32> %4, <4 x i32> %5 + ret <4 x i32> %6 +; CHECK-LABEL: absd_int32_uge +; CHECK-NOT: vcmpgtuw +; CHECK-NOT: xxsel +; CHECK: vabsduw v2, v2, v3 +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: absd_int32_uge +; CHECK-PWR8: vcmpgtuw +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + +define <4 x i32> @absd_int32_ult(<4 x i32>, <4 x i32>) { + %3 = icmp ult <4 x i32> %0, %1 + %4 = sub <4 x i32> %0, %1 + %5 = sub <4 x i32> %1, %0 + %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> %4 + ret <4 x i32> %6 +; CHECK-LABEL: absd_int32_ult +; CHECK-NOT: vcmpgtuw +; CHECK-NOT: xxsel +; CHECK: vabsduw v2, v2, v3 +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: absd_int32_ult +; CHECK-PWR8: vcmpgtuw +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + +define <4 x i32> @absd_int32_ule(<4 x i32>, <4 x i32>) { + %3 = icmp ule <4 x i32> %0, %1 + %4 = sub <4 x i32> %0, %1 + %5 = sub <4 x i32> %1, %0 + %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> %4 + ret <4 x i32> %6 +; CHECK-LABEL: absd_int32_ule +; CHECK-NOT: vcmpgtuw +; CHECK-NOT: xxsel +; CHECK: vabsduw v2, v2, v3 +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: absd_int32_ule +; CHECK-PWR8: vcmpgtuw +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + +define <8 x i16> @absd_int16_ugt(<8 x i16>, <8 x i16>) { + %3 = icmp ugt <8 x i16> %0, %1 + %4 = sub <8 x i16> %0, %1 + %5 = sub <8 x i16> %1, %0 + %6 = select <8 x i1> %3, <8 
x i16> %4, <8 x i16> %5 + ret <8 x i16> %6 +; CHECK-LABEL: absd_int16_ugt +; CHECK-NOT: vcmpgtuh +; CHECK-NOT: xxsel +; CHECK: vabsduh v2, v2, v3 +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: absd_int16_ugt +; CHECK-PWR8: vcmpgtuh +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + +define <8 x i16> @absd_int16_uge(<8 x i16>, <8 x i16>) { + %3 = icmp uge <8 x i16> %0, %1 + %4 = sub <8 x i16> %0, %1 + %5 = sub <8 x i16> %1, %0 + %6 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> %5 + ret <8 x i16> %6 +; CHECK-LABEL: absd_int16_uge +; CHECK-NOT: vcmpgtuh +; CHECK-NOT: xxsel +; CHECK: vabsduh v2, v2, v3 +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: absd_int16_uge +; CHECK-PWR8: vcmpgtuh +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + +define <8 x i16> @absd_int16_ult(<8 x i16>, <8 x i16>) { + %3 = icmp ult <8 x i16> %0, %1 + %4 = sub <8 x i16> %0, %1 + %5 = sub <8 x i16> %1, %0 + %6 = select <8 x i1> %3, <8 x i16> %5, <8 x i16> %4 + ret <8 x i16> %6 +; CHECK-LABEL: absd_int16_ult +; CHECK-NOT: vcmpgtuh +; CHECK-NOT: xxsel +; CHECK: vabsduh v2, v2, v3 +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: absd_int16_ult +; CHECK-PWR8: vcmpgtuh +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + +define <8 x i16> @absd_int16_ule(<8 x i16>, <8 x i16>) { + %3 = icmp ule <8 x i16> %0, %1 + %4 = sub <8 x i16> %0, %1 + %5 = sub <8 x i16> %1, %0 + %6 = select <8 x i1> %3, <8 x i16> %5, <8 x i16> %4 + ret <8 x i16> %6 +; CHECK-LABEL: absd_int16_ule +; CHECK-NOT: vcmpgtuh +; CHECK-NOT: xxsel +; CHECK: vabsduh v2, v2, v3 +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: absd_int16_ule +; CHECK-PWR8: vcmpgtuh +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + +define <16 x i8> @absd_int8_ugt(<16 x i8>, <16 x i8>) { + %3 = icmp ugt <16 x i8> %0, %1 + %4 = sub <16 x i8> %0, %1 + %5 = sub <16 x i8> %1, %0 + %6 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> %5 + ret <16 x i8> %6 +; CHECK-LABEL: absd_int8_ugt +; CHECK-NOT: vcmpgtub +; CHECK-NOT: xxsel +; CHECK: vabsdub v2, v2, v3 +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: absd_int8_ugt +; 
CHECK-PWR8: vcmpgtub +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + +define <16 x i8> @absd_int8_uge(<16 x i8>, <16 x i8>) { + %3 = icmp uge <16 x i8> %0, %1 + %4 = sub <16 x i8> %0, %1 + %5 = sub <16 x i8> %1, %0 + %6 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> %5 + ret <16 x i8> %6 +; CHECK-LABEL: absd_int8_uge +; CHECK-NOT: vcmpgtub +; CHECK-NOT: xxsel +; CHECK: vabsdub v2, v2, v3 +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: absd_int8_uge +; CHECK-PWR8: vcmpgtub +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + +define <16 x i8> @absd_int8_ult(<16 x i8>, <16 x i8>) { + %3 = icmp ult <16 x i8> %0, %1 + %4 = sub <16 x i8> %0, %1 + %5 = sub <16 x i8> %1, %0 + %6 = select <16 x i1> %3, <16 x i8> %5, <16 x i8> %4 + ret <16 x i8> %6 +; CHECK-LABEL: absd_int8_ult +; CHECK-NOT: vcmpgtub +; CHECK-NOT: xxsel +; CHECK: vabsdub v2, v2, v3 +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: absd_int8_ult +; CHECK-PWR8: vcmpgtub +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + +define <16 x i8> @absd_int8_ule(<16 x i8>, <16 x i8>) { + %3 = icmp ule <16 x i8> %0, %1 + %4 = sub <16 x i8> %0, %1 + %5 = sub <16 x i8> %1, %0 + %6 = select <16 x i1> %3, <16 x i8> %5, <16 x i8> %4 + ret <16 x i8> %6 +; CHECK-LABEL: absd_int8_ule +; CHECK-NOT: vcmpgtub +; CHECK-NOT: xxsel +; CHECK: vabsdub v2, v2, v3 +; CHECK-NEXT: blr +; CHECK-PWR8-LABEL: absd_int8_ule +; CHECK-PWR8: vcmpgtub +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + +; Some cases we are unable to optimize. +; Check that the optimization does not go beyond its scope. +define <4 x i32> @absd_int32_ugt_opp(<4 x i32>, <4 x i32>) { + %3 = icmp ugt <4 x i32> %0, %1 + %4 = sub <4 x i32> %0, %1 + %5 = sub <4 x i32> %1, %0 + %6 = select <4 x i1> %3, <4 x i32> %5, <4 x i32> %4 + ret <4 x i32> %6 +; CHECK-LABEL: absd_int32_ugt_opp +; CHECK-NOT: vabsduw +; CHECK: vcmpgtuw +; CHECK: xxsel +; CHECK: blr +; CHECK-PWR8-LABEL: absd_int32_ugt_opp +; CHECK-PWR8: vcmpgtuw +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + +define <2 x i64> @absd_int64_ugt(<2 x i64>, <2 x i64>) { + %3 = icmp ugt <2 x 
i64> %0, %1 + %4 = sub <2 x i64> %0, %1 + %5 = sub <2 x i64> %1, %0 + %6 = select <2 x i1> %3, <2 x i64> %4, <2 x i64> %5 + ret <2 x i64> %6 +; CHECK-LABEL: absd_int64_ugt +; CHECK-NOT: vabsduw +; CHECK: vcmpgtud +; CHECK: xxsel +; CHECK: blr +; CHECK-PWR8-LABEL: absd_int64_ugt +; CHECK-PWR8: vcmpgtud +; CHECK-PWR8: xxsel +; CHECK-PWR8: blr +} + -; Function Attrs: nounwind readnone declare <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32>, <4 x i32>) -; Function Attrs: nounwind readnone declare <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16>, <8 x i16>) -; Function Attrs: nounwind readnone declare <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8>, <16 x i8>) + Index: llvm/test/CodeGen/PowerPC/pre-inc-disable.ll =================================================================== --- llvm/test/CodeGen/PowerPC/pre-inc-disable.ll +++ llvm/test/CodeGen/PowerPC/pre-inc-disable.ll @@ -19,20 +19,20 @@ ; CHECK: lxvx v2, 0, r5 ; CHECK: lxvx v3, 0, r6 ; CHECK: xxpermdi v5, f0, f0, 2 -; CHECK: vperm v0, v4, v5, v2 -; CHECK: vperm v5, v5, v4, v3 -; CHECK: xvnegsp v5, v5 -; CHECK: xvnegsp v0, v0 +; CHECK-DAG: vperm v[[VR1:[0-9]+]], v4, v5, v2 +; CHECK-DAG: vperm v[[VR2:[0-9]+]], v5, v4, v3 +; CHECK-DAG: xvnegsp v[[VR3:[0-9]+]], v[[VR1]] +; CHECK-DAG: xvnegsp v[[VR4:[0-9]+]], v[[VR2]] ; CHECK: .LBB0_1: # %for.cond1.preheader ; CHECK: lfd f0, 0(r3) ; CHECK: xxpermdi v1, f0, f0, 2 ; CHECK: vperm v6, v1, v4, v3 ; CHECK: vperm v1, v4, v1, v2 -; CHECK: xvnegsp v6, v6 ; CHECK: xvnegsp v1, v1 -; CHECK: vabsduw v1, v1, v0 -; CHECK: vabsduw v6, v6, v5 +; CHECK: xvnegsp v6, v6 +; CHECK-DAG: vabsduw v1, v1, v[[VR3]] +; CHECK-DAG: vabsduw v6, v6, v[[VR4]] ; CHECK: vadduwm v1, v6, v1 ; CHECK: xxswapd v6, v1 ; CHECK: vadduwm v1, v1, v6 @@ -46,10 +46,10 @@ ; CHECK: xxswapd v1, vs0 ; CHECK: vperm v6, v1, v4, v3 ; CHECK: vperm v1, v4, v1, v2 -; CHECK: xvnegsp v6, v6 -; CHECK: xvnegsp v1, v1 -; CHECK: vabsduw v1, v1, v0 -; CHECK: vabsduw v6, v6, v5 +; CHECK-DAG: xvnegsp v6, v6 +; CHECK-DAG: xvnegsp v1, v1 +; 
CHECK-DAG: vabsduw v1, v1, v[[VR3]] +; CHECK-DAG: vabsduw v6, v6, v[[VR4]] ; CHECK: vadduwm v1, v6, v1 ; CHECK: xxswapd v6, v1 ; CHECK: vadduwm v1, v1, v6 @@ -72,36 +72,36 @@ ; P9BE: lxvx v3, 0, r6 ; P9BE: xxlor v5, vs0, vs0 ; P9BE: li r6, 0 -; P9BE: vperm v0, v4, v5, v2 -; P9BE: vperm v5, v4, v5, v3 -; P9BE: xvnegsp v5, v5 -; P9BE: xvnegsp v0, v0 +; P9BE-DAG: vperm v[[VR1:[0-9]+]], v4, v5, v2 +; P9BE-DAG: vperm v[[VR2:[0-9]+]], v4, v5, v3 +; P9BE-DAG: xvnegsp v[[VR3:[0-9]+]], v[[VR1]] +; P9BE-DAG: xvnegsp v[[VR4:[0-9]+]], v[[VR2]] ; P9BE: .LBB0_1: # %for.cond1.preheader ; P9BE: lfd f0, 0(r3) ; P9BE: xxlor v1, vs0, vs0 ; P9BE: vperm v6, v4, v1, v3 ; P9BE: vperm v1, v4, v1, v2 -; P9BE: xvnegsp v6, v6 -; P9BE: xvnegsp v1, v1 -; P9BE: vabsduw v1, v1, v0 -; P9BE: vabsduw v6, v6, v5 +; P9BE-DAG: xvnegsp v6, v6 +; P9BE-DAG: xvnegsp v1, v1 +; P9BE-DAG: vabsduw v1, v1, v[[VR3]] +; P9BE-DAG: vabsduw v6, v6, v[[VR4]] ; P9BE: vadduwm v1, v6, v1 ; P9BE: xxswapd v6, v1 ; P9BE: vadduwm v1, v1, v6 ; P9BE: xxspltw v6, v1, 1 ; P9BE: vadduwm v1, v1, v6 -; P9BE: vextuwlx r7, r6, v1 -; P9BE: ldux r8, r3, r4 +; P9BE: vextuwlx r[[GR1:[0-9]+]], r6, v1 +; P9BE: ldux r[[GR2:[0-9]+]], r3, r4 ; P9BE: add r3, r3, r4 -; P9BE: add r5, r7, r5 -; P9BE: mtvsrd v1, r8 +; P9BE: add r5, r[[GR1]], r5 +; P9BE: mtvsrd v1, r[[GR2]] ; P9BE: vperm v6, v4, v1, v3 ; P9BE: vperm v1, v4, v1, v2 -; P9BE: xvnegsp v6, v6 -; P9BE: xvnegsp v1, v1 -; P9BE: vabsduw v1, v1, v0 -; P9BE: vabsduw v6, v6, v5 +; P9BE-DAG: xvnegsp v6, v6 +; P9BE-DAG: xvnegsp v1, v1 +; P9BE-DAG: vabsduw v1, v1, v[[VR3]] +; P9BE-DAG: vabsduw v6, v6, v[[VR4]] ; P9BE: vadduwm v1, v6, v1 ; P9BE: xxswapd v6, v1 ; P9BE: vadduwm v1, v1, v6 @@ -181,12 +181,8 @@ ; CHECK: vperm v4, v0, v4, v3 ; CHECK: vperm v2, v5, v0, v2 ; CHECK: vperm v3, v0, v5, v3 -; CHECK: xvnegsp v5, v1 -; CHECK: xvnegsp v4, v4 -; CHECK: xvnegsp v2, v2 -; CHECK: xvnegsp v3, v3 ; CHECK: vabsduw v3, v4, v3 -; CHECK: vabsduw v2, v5, v2 +; CHECK: vabsduw v2, v1, v2 ; CHECK: vadduwm 
v2, v2, v3 ; CHECK: xxswapd v3, v2 ; CHECK: vadduwm v2, v2, v3 @@ -212,12 +208,8 @@ ; P9BE: vperm v4, v5, v4, v3 ; P9BE: vperm v2, v5, v0, v2 ; P9BE: vperm v3, v5, v0, v3 -; P9BE: xvnegsp v5, v1 -; P9BE: xvnegsp v4, v4 -; P9BE: xvnegsp v2, v2 -; P9BE: xvnegsp v3, v3 ; P9BE: vabsduw v3, v4, v3 -; P9BE: vabsduw v2, v5, v2 +; P9BE: vabsduw v2, v1, v2 ; P9BE: vadduwm v2, v2, v3 ; P9BE: xxswapd v3, v2 ; P9BE: vadduwm v2, v2, v3