Index: llvm/trunk/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
===================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ llvm/trunk/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -327,7 +327,6 @@
     bool isOffsetMultipleOf(SDNode *N, unsigned Val) const;
     void transferMemOperands(SDNode *N, SDNode *Result);
-    MachineSDNode *flipSignBit(const SDValue &N, SDNode **SignBit = nullptr);
   };
 } // end anonymous namespace
@@ -4138,51 +4137,6 @@
   CurDAG->setNodeMemRefs(cast<MachineSDNode>(Result), {MemOp});
 }
 
-/// This method returns a node after flipping the MSB of each element
-/// of vector integer type. Additionally, if SignBitVec is non-null,
-/// this method sets a node with one at MSB of all elements
-/// and zero at other bits in SignBitVec.
-MachineSDNode *
-PPCDAGToDAGISel::flipSignBit(const SDValue &N, SDNode **SignBitVec) {
-  SDLoc dl(N);
-  EVT VecVT = N.getValueType();
-  if (VecVT == MVT::v4i32) {
-    if (SignBitVec) {
-      SDNode *ZV = CurDAG->getMachineNode(PPC::V_SET0, dl, MVT::v4i32);
-      *SignBitVec = CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT,
-                                           SDValue(ZV, 0));
-    }
-    return CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT, N);
-  }
-  else if (VecVT == MVT::v8i16) {
-    SDNode *Hi = CurDAG->getMachineNode(PPC::LIS, dl, MVT::i32,
-                                        getI32Imm(0x8000, dl));
-    SDNode *ScaImm = CurDAG->getMachineNode(PPC::ORI, dl, MVT::i32,
-                                            SDValue(Hi, 0),
-                                            getI32Imm(0x8000, dl));
-    SDNode *VecImm = CurDAG->getMachineNode(PPC::MTVSRWS, dl, VecVT,
-                                            SDValue(ScaImm, 0));
-    /*
-    Alternatively, we can do this as follow to use VRF instead of GPR.
-      vspltish 5, 1
-      vspltish 6, 15
-      vslh 5, 6, 5
-    */
-    if (SignBitVec) *SignBitVec = VecImm;
-    return CurDAG->getMachineNode(PPC::VADDUHM, dl, VecVT, N,
-                                  SDValue(VecImm, 0));
-  }
-  else if (VecVT == MVT::v16i8) {
-    SDNode *VecImm = CurDAG->getMachineNode(PPC::XXSPLTIB, dl, MVT::i32,
-                                            getI32Imm(0x80, dl));
-    if (SignBitVec) *SignBitVec = VecImm;
-    return CurDAG->getMachineNode(PPC::VADDUBM, dl, VecVT, N,
-                                  SDValue(VecImm, 0));
-  }
-  else
-    llvm_unreachable("Unsupported vector data type for flipSignBit");
-}
-
 // Select - Convert the specified operand from a target-independent to a
 // target-specific node if it hasn't already been changed.
 void PPCDAGToDAGISel::Select(SDNode *N) {
@@ -4993,55 +4947,6 @@
       return;
     }
   }
-  case ISD::ABS: {
-    assert(PPCSubTarget->hasP9Vector() && "ABS is supported with P9 Vector");
-
-    // For vector absolute difference, we use VABSDUW instruction of POWER9.
-    // Since VABSDU instructions are for unsigned integers, we need adjustment
-    // for signed integers.
-    // For abs(sub(a, b)), we generate VABSDUW(a+0x80000000, b+0x80000000).
-    // Otherwise, abs(sub(-1, 0)) returns 0xFFFFFFFF(=-1) instead of 1.
-    // For abs(a), we generate VABSDUW(a+0x80000000, 0x80000000).
-    EVT VecVT = N->getOperand(0).getValueType();
-    SDNode *AbsOp = nullptr;
-    unsigned AbsOpcode;
-
-    if (VecVT == MVT::v4i32)
-      AbsOpcode = PPC::VABSDUW;
-    else if (VecVT == MVT::v8i16)
-      AbsOpcode = PPC::VABSDUH;
-    else if (VecVT == MVT::v16i8)
-      AbsOpcode = PPC::VABSDUB;
-    else
-      llvm_unreachable("Unsupported vector data type for ISD::ABS");
-
-    // Even for signed integers, we can skip adjustment if all values are
-    // known to be positive (as signed integer) due to zero-extended inputs.
-    if (N->getOperand(0).getOpcode() == ISD::SUB &&
-        N->getOperand(0)->getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
-        N->getOperand(0)->getOperand(1).getOpcode() == ISD::ZERO_EXTEND) {
-      AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
-                                     SDValue(N->getOperand(0)->getOperand(0)),
-                                     SDValue(N->getOperand(0)->getOperand(1)));
-      ReplaceNode(N, AbsOp);
-      return;
-    }
-    if (N->getOperand(0).getOpcode() == ISD::SUB) {
-      SDValue SubVal = N->getOperand(0);
-      SDNode *Op0 = flipSignBit(SubVal->getOperand(0));
-      SDNode *Op1 = flipSignBit(SubVal->getOperand(1));
-      AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
-                                     SDValue(Op0, 0), SDValue(Op1, 0));
-    }
-    else {
-      SDNode *Op1 = nullptr;
-      SDNode *Op0 = flipSignBit(N->getOperand(0), &Op1);
-      AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, SDValue(Op0, 0),
-                                     SDValue(Op1, 0));
-    }
-    ReplaceNode(N, AbsOp);
-    return;
-  }
   }
 
   SelectCode(N);
Index: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.h
@@ -373,6 +373,21 @@
     /// An SDNode for swaps that are not associated with any loads/stores
     /// and thereby have no chain.
     SWAP_NO_CHAIN,
+
+    /// An SDNode for Power9 vector absolute value difference.
+    /// operand #0 vector
+    /// operand #1 vector
+    /// operand #2 constant i32 0 or 1, to indicate whether it needs to patch
+    /// the most significant bit for signed i32.
+    ///
+    /// Power9 VABSD* instructions are designed to support unsigned integer
+    /// vectors (byte/halfword/word); if we want to make use of them for
+    /// signed integer vectors, we have to flip their sign bits first.
+    /// Flipping the sign bit of a byte/halfword integer vector would be
+    /// inefficient, but for a word integer vector we can leverage XVNEGSP
+    /// to do it efficiently, e.g.:
+    /// abs(sub(a,b)) => VABSDUW(a+0x80000000, b+0x80000000)
+    ///               => VABSDUW((XVNEGSP a), (XVNEGSP b))
+    VABSD,
 
     /// QVFPERM = This corresponds to the QPX qvfperm instruction.
     QVFPERM,
@@ -998,6 +1013,7 @@
     SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
@@ -1101,6 +1117,7 @@
     SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const;
 
     /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
     /// SETCC with integer subtraction when (1) there is a legal way of doing it
Index: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -251,12 +251,6 @@
     setOperationAction(ISD::UREM, MVT::i64, Expand);
   }
 
-  if (Subtarget.hasP9Vector()) {
-    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
-    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
-    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
-  }
-
   // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
   setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
   setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
@@ -556,6 +550,7 @@
       // add/sub are legal for all supported vector VT's.
       setOperationAction(ISD::ADD, VT, Legal);
       setOperationAction(ISD::SUB, VT, Legal);
+      setOperationAction(ISD::ABS, VT, Custom);
 
       // Vector instructions introduced in P8
       if (Subtarget.hasP8Altivec() && (VT.SimpleTy != MVT::v1i128)) {
@@ -661,6 +656,11 @@
     setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
     setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
 
+    // Without hasP8Altivec set, v2i64 SMAX isn't available.
+    // But ABS custom lowering requires SMAX support.
+    if (!Subtarget.hasP8Altivec())
+      setOperationAction(ISD::ABS, MVT::v2i64, Expand);
+
     addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
     addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
     addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
@@ -1083,6 +1083,10 @@
     setTargetDAGCombine(ISD::FSQRT);
   }
 
+  if (Subtarget.hasP9Altivec()) {
+    setTargetDAGCombine(ISD::ABS);
+  }
+
   // Darwin long double math library functions have $LDBL128 appended.
   if (Subtarget.isDarwin()) {
     setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
@@ -1343,6 +1347,7 @@
   case PPCISD::RFEBB: return "PPCISD::RFEBB";
   case PPCISD::XXSWAPD: return "PPCISD::XXSWAPD";
   case PPCISD::SWAP_NO_CHAIN: return "PPCISD::SWAP_NO_CHAIN";
+  case PPCISD::VABSD: return "PPCISD::VABSD";
   case PPCISD::QVFPERM: return "PPCISD::QVFPERM";
   case PPCISD::QVGPCI: return "PPCISD::QVGPCI";
   case PPCISD::QVALIGNI: return "PPCISD::QVALIGNI";
@@ -9003,35 +9008,6 @@
     return DAG.getRegister(PPC::R2, MVT::i32);
   }
 
-  // We are looking for absolute values here.
-  // The idea is to try to fit one of two patterns:
-  // max (a, (0-a)) OR max ((0-a), a)
-  if (Subtarget.hasP9Vector() &&
-      (IntrinsicID == Intrinsic::ppc_altivec_vmaxsw ||
-       IntrinsicID == Intrinsic::ppc_altivec_vmaxsh ||
-       IntrinsicID == Intrinsic::ppc_altivec_vmaxsb)) {
-    SDValue V1 = Op.getOperand(1);
-    SDValue V2 = Op.getOperand(2);
-    if (V1.getSimpleValueType() == V2.getSimpleValueType() &&
-        (V1.getSimpleValueType() == MVT::v4i32 ||
-         V1.getSimpleValueType() == MVT::v8i16 ||
-         V1.getSimpleValueType() == MVT::v16i8)) {
-      if ( V1.getOpcode() == ISD::SUB &&
-           ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
-           V1.getOperand(1) == V2 ) {
-        // Generate the abs instruction with the operands
-        return DAG.getNode(ISD::ABS, dl, V2.getValueType(),V2);
-      }
-
-      if ( V2.getOpcode() == ISD::SUB &&
-           ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
-           V2.getOperand(1) == V1 ) {
-        // Generate the abs instruction with the operands
-        return DAG.getNode(ISD::ABS, dl, V1.getValueType(),V1);
-      }
-    }
-  }
-
   // If this is a lowered altivec predicate compare, CompareOpc is set to the
   // opcode number of the comparison.
   int CompareOpc;
@@ -9572,6 +9548,44 @@
   }
 }
 
+SDValue PPCTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
+
+  assert(Op.getOpcode() == ISD::ABS && "Should only be called for ISD::ABS");
+
+  EVT VT = Op.getValueType();
+  assert(VT.isVector() &&
+         "Only set vector abs as custom, scalar abs shouldn't reach here!");
+  assert((VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16 ||
+          VT == MVT::v16i8) &&
+         "Unexpected vector element type!");
+  assert((VT != MVT::v2i64 || Subtarget.hasP8Altivec()) &&
+         "Current subtarget doesn't support smax v2i64!");
+
+  // For vector abs, it can be lowered to:
+  // abs x
+  // ==>
+  // y = -x
+  // smax(x, y)
+
+  SDLoc dl(Op);
+  SDValue X = Op.getOperand(0);
+  SDValue Zero = DAG.getConstant(0, dl, VT);
+  SDValue Y = DAG.getNode(ISD::SUB, dl, VT, Zero, X);
+
+  // SMAX patch https://reviews.llvm.org/D47332
+  // hasn't landed yet, so use the intrinsic here for now.
+  // TODO: Use SMAX directly once the SMAX patch has landed.
+  Intrinsic::ID BifID = Intrinsic::ppc_altivec_vmaxsw;
+  if (VT == MVT::v2i64)
+    BifID = Intrinsic::ppc_altivec_vmaxsd;
+  else if (VT == MVT::v8i16)
+    BifID = Intrinsic::ppc_altivec_vmaxsh;
+  else if (VT == MVT::v16i8)
+    BifID = Intrinsic::ppc_altivec_vmaxsb;
+
+  return BuildIntrinsicOp(BifID, X, Y, DAG, dl, VT);
+}
+
 /// LowerOperation - Provide custom lowering hooks for some operations.
 ///
 SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -9624,6 +9638,7 @@
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::MUL: return LowerMUL(Op, DAG);
+  case ISD::ABS: return LowerABS(Op, DAG);
 
   // For counter-based loop handling.
   case ISD::INTRINSIC_W_CHAIN: return SDValue();
@@ -12985,6 +13000,39 @@
       }
     }
   }
+
+  // Combine vmaxsw/h/b(a, a's negation) to abs(a)
+  // Expose the vabsduw/h/b opportunity for downstream combines.
+  if (!DCI.isAfterLegalizeDAG() && Subtarget.hasP9Altivec() &&
+      (IID == Intrinsic::ppc_altivec_vmaxsw ||
+       IID == Intrinsic::ppc_altivec_vmaxsh ||
+       IID == Intrinsic::ppc_altivec_vmaxsb)) {
+    SDValue V1 = N->getOperand(1);
+    SDValue V2 = N->getOperand(2);
+    if ((V1.getSimpleValueType() == MVT::v4i32 ||
+         V1.getSimpleValueType() == MVT::v8i16 ||
+         V1.getSimpleValueType() == MVT::v16i8) &&
+        V1.getSimpleValueType() == V2.getSimpleValueType()) {
+      // (0-a, a)
+      if (V1.getOpcode() == ISD::SUB &&
+          ISD::isBuildVectorAllZeros(V1.getOperand(0).getNode()) &&
+          V1.getOperand(1) == V2) {
+        return DAG.getNode(ISD::ABS, dl, V2.getValueType(), V2);
+      }
+      // (a, 0-a)
+      if (V2.getOpcode() == ISD::SUB &&
+          ISD::isBuildVectorAllZeros(V2.getOperand(0).getNode()) &&
+          V2.getOperand(1) == V1) {
+        return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
+      }
+      // (x-y, y-x)
+      if (V1.getOpcode() == ISD::SUB && V2.getOpcode() == ISD::SUB &&
+          V1.getOperand(0) == V2.getOperand(1) &&
+          V1.getOperand(1) == V2.getOperand(0)) {
+        return DAG.getNode(ISD::ABS, dl, V1.getValueType(), V1);
+      }
+    }
+  }
   }
   break;
@@ -13217,6 +13265,8 @@
     }
   case ISD::BUILD_VECTOR:
     return DAGCombineBuildVector(N, DCI);
+  case ISD::ABS:
+    return combineABS(N, DCI);
   }
 
   return SDValue();
@@ -14503,3 +14553,47 @@
   // For non-constant masks, we can always use the record-form and.
   return true;
 }
+
+// Transform (abs (sub (zext a), (zext b))) to (vabsd a b 0)
+// Transform (abs (sub (zext a), (zext_invec b))) to (vabsd a b 0)
+// Transform (abs (sub (zext_invec a), (zext_invec b))) to (vabsd a b 0)
+// Transform (abs (sub (zext_invec a), (zext b))) to (vabsd a b 0)
+// Transform (abs (sub a, b)) to (vabsd a b 1) if a & b are of type v4i32
+SDValue PPCTargetLowering::combineABS(SDNode *N, DAGCombinerInfo &DCI) const {
+  assert((N->getOpcode() == ISD::ABS) && "Need ABS node here");
+  assert(Subtarget.hasP9Altivec() &&
+         "Only combine this when P9 altivec supported!");
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  if (N->getOperand(0).getOpcode() == ISD::SUB) {
+    // Even for signed integers, we can skip the sign-bit adjustment if the
+    // inputs are known to be positive (as signed integers) because they are
+    // zero-extended.
+    unsigned SubOpcd0 = N->getOperand(0)->getOperand(0).getOpcode();
+    unsigned SubOpcd1 = N->getOperand(0)->getOperand(1).getOpcode();
+    if ((SubOpcd0 == ISD::ZERO_EXTEND ||
+         SubOpcd0 == ISD::ZERO_EXTEND_VECTOR_INREG) &&
+        (SubOpcd1 == ISD::ZERO_EXTEND ||
+         SubOpcd1 == ISD::ZERO_EXTEND_VECTOR_INREG)) {
+      return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
+                         N->getOperand(0)->getOperand(0),
+                         N->getOperand(0)->getOperand(1),
+                         DAG.getTargetConstant(0, dl, MVT::i32));
+    }
+
+    // For type v4i32, it can be optimized with xvnegsp + vabsduw
+    if (N->getOperand(0).getValueType() == MVT::v4i32 &&
+        N->getOperand(0).hasOneUse()) {
+      return DAG.getNode(PPCISD::VABSD, dl, N->getOperand(0).getValueType(),
+                         N->getOperand(0)->getOperand(0),
+                         N->getOperand(0)->getOperand(1),
+                         DAG.getTargetConstant(1, dl, MVT::i32));
+    }
+  }
+
+  return SDValue();
+}
+
Index: llvm/trunk/lib/Target/PowerPC/PPCInstrVSX.td
===================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCInstrVSX.td
+++ llvm/trunk/lib/Target/PowerPC/PPCInstrVSX.td
@@ -67,6 +67,10 @@
 def SDTVecConv : SDTypeProfile<1, 2, [
   SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>
 ]>;
+def SDTVabsd : SDTypeProfile<1, 3, [
+  SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<3, i32>
+]>;
+
 def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
                        [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -79,6 +83,7 @@
 def PPCsvec2fp : SDNode<"PPCISD::SINT_VEC_TO_FP", SDTVecConv, []>;
 def PPCuvec2fp: SDNode<"PPCISD::UINT_VEC_TO_FP", SDTVecConv, []>;
 def PPCswapNoChain : SDNode<"PPCISD::SWAP_NO_CHAIN", SDT_PPCxxswapd>;
+def PPCvabsd : SDNode<"PPCISD::VABSD", SDTVabsd, []>;
 
 multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
                        string asmstr, InstrItinClass itin, Intrinsic Int,
@@ -4017,3 +4022,21 @@
   }
 }
 
+// Put this P9Altivec related definition here since it's possible to be
+// selected to the VSX instruction xvnegsp, to avoid a possible undef.
+let Predicates = [HasP9Altivec] in {
+
+  def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 0))),
+            (v4i32 (VABSDUW $A, $B))>;
+
+  def : Pat<(v8i16 (PPCvabsd v8i16:$A, v8i16:$B, (i32 0))),
+            (v8i16 (VABSDUH $A, $B))>;
+
+  def : Pat<(v16i8 (PPCvabsd v16i8:$A, v16i8:$B, (i32 0))),
+            (v16i8 (VABSDUB $A, $B))>;
+
+  // As described for PPCISD::VABSD, the last operand indicates whether to do
+  // the sign bit flip.
+  def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 1))),
+            (v4i32 (VABSDUW (XVNEGSP $A), (XVNEGSP $B)))>;
+}
Index: llvm/trunk/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
+++ llvm/trunk/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s
 ; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s
 ; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR8 -implicit-check-not vabsdu
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -ppc-vsr-nums-as-vr -ppc-asm-full-reg-names -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR7 -implicit-check-not vmaxsd
 
 define <4 x i32> @simple_absv_32(<4 x i32> %a) local_unnamed_addr {
 entry:
@@ -8,16 +9,21 @@
   %0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %a, <4 x i32> %sub.i)
   ret <4 x i32> %0
 ; CHECK-LABEL: simple_absv_32
-; CHECK-DAG: vxor v{{[0-9]+}}, v[[REG:[0-9]+]], v[[REG]]
-; CHECK-DAG: xvnegsp v2, v2
-; CHECK-DAG: xvnegsp v3, v{{[0-9]+}}
-; CHECK-NEXT: vabsduw v2, v2, v{{[0-9]+}}
+; CHECK-NOT: vxor
+; CHECK-NOT: vabsduw
+; CHECK: vnegw v[[REG:[0-9]+]], v2
+; CHECK-NEXT: vmaxsw v2, v2, v[[REG]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: simple_absv_32
 ; CHECK-PWR8: xxlxor
 ; CHECK-PWR8: vsubuwm
 ; CHECK-PWR8: vmaxsw
 ; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: simple_absv_32
+; CHECK-PWR7: xxlxor
+; CHECK-PWR7: vsubuwm
+; CHECK-PWR7: vmaxsw
+; CHECK-PWR7: blr
 }
 
 define <4 x i32> @simple_absv_32_swap(<4 x i32> %a) local_unnamed_addr {
@@ -26,10 +32,10 @@
   %0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %sub.i, <4 x i32> %a)
   ret <4 x i32> %0
 ; CHECK-LABEL: simple_absv_32_swap
-; CHECK-DAG: vxor v{{[0-9]+}}, v[[REG:[0-9]+]], v[[REG]]
-; CHECK-DAG: xvnegsp v2, v2
-; CHECK-DAG: xvnegsp v3, v{{[0-9]+}}
-; CHECK-NEXT: vabsduw v2, v2, v{{[0-9]+}}
+; CHECK-NOT: vxor
+; CHECK-NOT: vabsduw
+; CHECK: vnegw v[[REG:[0-9]+]], v2
+; CHECK-NEXT: vmaxsw v2, v2, v[[REG]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: simple_absv_32_swap
 ; CHECK-PWR8: xxlxor
@@ -44,15 +50,22 @@
   %0 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %a, <8 x i16> %sub.i)
   ret <8 x i16> %0
 ; CHECK-LABEL: simple_absv_16
-; CHECK: mtvsrws v{{[0-9]+}}, r{{[0-9]+}}
-; CHECK-NEXT: vadduhm v2, v2, v[[IMM:[0-9]+]]
-; CHECK-NEXT: vabsduh v2, v2, v[[IMM]]
+; CHECK-NOT: mtvsrws
+; CHECK-NOT: vabsduh
+; CHECK: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-NEXT: vsubuhm v[[REG:[0-9]+]], v[[ZERO]], v2
+; CHECK-NEXT: vmaxsh v2, v2, v[[REG]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: simple_absv_16
 ; CHECK-PWR8: xxlxor
 ; CHECK-PWR8: vsubuhm
 ; CHECK-PWR8: vmaxsh
 ; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: simple_absv_16
+; CHECK-PWR7: xxlxor
+; CHECK-PWR7: vsubuhm
+; CHECK-PWR7: vmaxsh
+; CHECK-PWR7: blr
 }
 
 define <16 x i8> @simple_absv_8(<16 x i8> %a) local_unnamed_addr {
@@ -61,15 +74,45 @@
   %0 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %a, <16 x i8> %sub.i)
   ret <16 x i8> %0
 ; CHECK-LABEL: simple_absv_8
-; CHECK: xxspltib v{{[0-9]+}}, 128
-; CHECK-NEXT: vaddubm v2, v2, v[[IMM:[0-9]+]]
-; CHECK-NEXT: vabsdub v2, v2, v[[IMM]]
+; CHECK-NOT: xxspltib
+; CHECK-NOT: vabsdub
+; CHECK: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-NEXT: vsububm v[[REG:[0-9]+]], v[[ZERO]], v2
+; CHECK-NEXT: vmaxsb v2, v2, v[[REG]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: simple_absv_8
 ; CHECK-PWR8: xxlxor
 ; CHECK-PWR8: vsububm
 ; CHECK-PWR8: vmaxsb
 ; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: simple_absv_8
+; CHECK-PWR7: xxlxor
+; CHECK-PWR7: vsububm
+; CHECK-PWR7: vmaxsb
+; CHECK-PWR7: blr
+}
+
+; v2i64 vmax isn't available on pwr7
+define <2 x i64> @sub_absv_64(<2 x i64> %a, <2 x i64> %b) local_unnamed_addr {
+entry:
+  %0 = sub nsw <2 x i64> %a, %b
+  %1 = icmp sgt <2 x i64> %0,
+  %2 = sub <2 x i64> zeroinitializer, %0
+  %3 = select <2 x i1> %1, <2 x i64> %0, <2 x i64> %2
+  ret <2 x i64> %3
+; CHECK-LABEL: sub_absv_64
+; CHECK: vsubudm
+; CHECK: vnegd
+; CHECK: vmaxsd
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: sub_absv_64
+; CHECK-PWR8-DAG: vsubudm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8: vmaxsd
+; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: sub_absv_64
+; CHECK-PWR7-NOT: vmaxsd
+; CHECK-PWR7: blr
 }
 
 ; The select pattern can only be detected for v4i32.
@@ -81,14 +124,77 @@
   %3 = select <4 x i1> %1, <4 x i32> %0, <4 x i32> %2
   ret <4 x i32> %3
 ; CHECK-LABEL: sub_absv_32
-; CHECK-DAG: xvnegsp v3, v3
-; CHECK-DAG: xvnegsp v2, v2
-; CHECK-NEXT: vabsduw v2, v2, v3
+; CHECK-NOT: vsubuwm
+; CHECK-NOT: vnegw
+; CHECK-NOT: vmaxsw
+; CHECK-DAG: xvnegsp v2, v2
+; CHECK-DAG: xvnegsp v3, v3
+; CHECK-NEXT: vabsduw v2, v{{[23]}}, v{{[23]}}
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: sub_absv_32
-; CHECK-PWR8: vsubuwm
-; CHECK-PWR8: xxlxor
+; CHECK-PWR8-DAG: vsubuwm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8: vmaxsw
 ; CHECK-PWR8: blr
+; CHECK-PWR7-LABEL: sub_absv_32
+; CHECK-PWR7-DAG: vsubuwm
+; CHECK-PWR7-DAG: xxlxor
+; CHECK-PWR7: vmaxsw
+; CHECK-PWR7: blr
+}
+
+define <8 x i16> @sub_absv_16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr {
+entry:
+  %0 = sub nsw <8 x i16> %a, %b
+  %1 = icmp sgt <8 x i16> %0,
+  %2 = sub <8 x i16> zeroinitializer, %0
+  %3 = select <8 x i1> %1, <8 x i16> %0, <8 x i16> %2
+  ret <8 x i16> %3
+; CHECK-LABEL: sub_absv_16
+; CHECK-NOT: vabsduh
+; CHECK-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-DAG: vsubuhm v[[SUB:[0-9]+]], v2, v3
+; CHECK: vsubuhm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-NEXT: vmaxsh v2, v[[SUB]], v[[SUB1]]
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: sub_absv_16
+; CHECK-PWR8-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-PWR8-DAG: vsubuhm v[[SUB:[0-9]+]], v2, v3
+; CHECK-PWR8: vsubuhm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-PWR8-NEXT: vmaxsh v2, v[[SUB]], v[[SUB1]]
+; CHECK-PWR8-NEXT: blr
+; CHECK-PWR7-LABEL: sub_absv_16
+; CHECK-PWR7-DAG: vsubuhm
+; CHECK-PWR7-DAG: xxlxor
+; CHECK-PWR7: vmaxsh
+; CHECK-PWR7-NEXT: blr
+}
+
+define <16 x i8> @sub_absv_8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr {
+entry:
+  %0 = sub nsw <16 x i8> %a, %b
+  %1 = icmp sgt <16 x i8> %0,
+  %2 = sub <16 x i8> zeroinitializer, %0
+  %3 = select <16 x i1> %1, <16 x i8> %0, <16 x i8> %2
+  ret <16 x i8> %3
+; CHECK-LABEL: sub_absv_8
+; CHECK-NOT: vabsdub
+; CHECK-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-DAG: vsububm v[[SUB:[0-9]+]], v2, v3
+; CHECK: vsububm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-NEXT: vmaxsb v2, v[[SUB]], v[[SUB1]]
+; CHECK-NEXT: blr
+; CHECK-PWR8-LABEL: sub_absv_8
+; CHECK-PWR8-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-PWR8-DAG: vsububm v[[SUB:[0-9]+]], v2, v3
+; CHECK-PWR8: vsububm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-PWR8-NEXT: vmaxsb v2, v[[SUB]], v[[SUB1]]
+; CHECK-PWR8-NEXT: blr
+; CHECK-PWR7-LABEL: sub_absv_8
+; CHECK-PWR7-DAG: xxlxor
+; CHECK-PWR7-DAG: vsububm
+; CHECK-PWR7: vmaxsb
+; CHECK-PWR7-NEXT: blr
 }
 
 ; FIXME: This does not produce the ISD::ABS that we are looking for.
@@ -96,7 +202,7 @@
 ; We do manage to find the word version of ABS but not the halfword.
 ; Threfore, we end up doing more work than is required with a pair of abs for word
 ; instead of just one for the halfword.
-define <8 x i16> @sub_absv_16(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr {
+define <8 x i16> @sub_absv_16_ext(<8 x i16> %a, <8 x i16> %b) local_unnamed_addr {
 entry:
   %0 = sext <8 x i16> %a to <8 x i32>
   %1 = sext <8 x i16> %b to <8 x i32>
@@ -106,23 +212,25 @@
   %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4
   %6 = trunc <8 x i32> %5 to <8 x i16>
   ret <8 x i16> %6
-; CHECK-LABEL: sub_absv_16
+; CHECK-LABEL: sub_absv_16_ext
 ; CHECK-NOT: vabsduh
 ; CHECK: vabsduw
+; CHECK-NOT: vnegw
 ; CHECK-NOT: vabsduh
 ; CHECK: vabsduw
+; CHECK-NOT: vnegw
 ; CHECK-NOT: vabsduh
 ; CHECK: blr
 ; CHECK-PWR8-LABEL: sub_absv_16
-; CHECK-PWR8: vsubuwm
-; CHECK-PWR8: xxlxor
+; CHECK-PWR8-DAG: vsubuwm
+; CHECK-PWR8-DAG: xxlxor
 ; CHECK-PWR8: blr
 }
 
 ; FIXME: This does not produce ISD::ABS. This does not even vectorize correctly!
 ; This function should look like sub_absv_32 and sub_absv_16 except that the type is v16i8.
 ; Function Attrs: norecurse nounwind readnone
-define <16 x i8> @sub_absv_8(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr {
+define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr {
 entry:
   %vecext = extractelement <16 x i8> %a, i32 0
   %conv = zext i8 %vecext to i32
@@ -285,14 +393,14 @@
   %conv122 = trunc i32 %15 to i8
   %vecins123 = insertelement <16 x i8> %vecins115, i8 %conv122, i32 15
   ret <16 x i8> %vecins123
-; CHECK-LABEL: sub_absv_8
+; CHECK-LABEL: sub_absv_8_ext
 ; CHECK-NOT: vabsdub
 ; CHECK: subf
 ; CHECK-NOT: vabsdub
 ; CHECK: xor
 ; CHECK-NOT: vabsdub
 ; CHECK: blr
-; CHECK-PWR8-LABEL: sub_absv_8
+; CHECK-PWR8-LABEL: sub_absv_8_ext
 ; CHECK-PWR8: subf
 ; CHECK-PWR8: xor
 ; CHECK-PWR8: blr
@@ -305,11 +413,16 @@
   %0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %sub, <4 x i32> %sub.i)
   ret <4 x i32> %0
 ; CHECK-LABEL: sub_absv_vec_32
-; CHECK: vabsduw v2, v2, v3
+; CHECK-NOT: vsubuwm
+; CHECK-NOT: vnegw
+; CHECK-NOT: vmaxsw
+; CHECK-DAG: xvnegsp v2, v2
+; CHECK-DAG: xvnegsp v3, v3
+; CHECK-NEXT: vabsduw v2, v{{[23]}}, v{{[23]}}
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: sub_absv_vec_32
-; CHECK-PWR8: xxlxor
-; CHECK-PWR8: vsubuwm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8-DAG: vsubuwm
 ; CHECK-PWR8: vmaxsw
 ; CHECK-PWR8: blr
 }
@@ -321,11 +434,16 @@
   %0 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %sub, <8 x i16> %sub.i)
   ret <8 x i16> %0
 ; CHECK-LABEL: sub_absv_vec_16
-; CHECK: vabsduh v2, v2, v3
+; CHECK-NOT: mtvsrws
+; CHECK-NOT: vabsduh
+; CHECK-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-DAG: vsubuhm v[[SUB:[0-9]+]], v2, v3
+; CHECK: vsubuhm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-NEXT: vmaxsh v2, v[[SUB]], v[[SUB1]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: sub_absv_vec_16
-; CHECK-PWR8: xxlxor
-; CHECK-PWR8: vsubuhm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8-DAG: vsubuhm
 ; CHECK-PWR8: vmaxsh
 ; CHECK-PWR8: blr
 }
@@ -337,15 +455,67 @@
   %0 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %sub, <16 x i8> %sub.i)
   ret <16 x i8> %0
 ; CHECK-LABEL: sub_absv_vec_8
-; CHECK: vabsdub v2, v2, v3
+; CHECK-NOT: xxspltib
+; CHECK-NOT: vabsdub
+; CHECK-DAG: xxlxor v[[ZERO:[0-9]+]], v[[ZERO]], v[[ZERO]]
+; CHECK-DAG: vsububm v[[SUB:[0-9]+]], v2, v3
+; CHECK: vsububm v[[SUB1:[0-9]+]], v[[ZERO]], v[[SUB]]
+; CHECK-NEXT: vmaxsb v2, v[[SUB]], v[[SUB1]]
 ; CHECK-NEXT: blr
 ; CHECK-PWR8-LABEL: sub_absv_vec_8
-; CHECK-PWR8: xxlxor
-; CHECK-PWR8: vsububm
+; CHECK-PWR8-DAG: xxlxor
+; CHECK-PWR8-DAG: vsububm
 ; CHECK-PWR8: vmaxsb
 ; CHECK-PWR8: blr
 }
 
+define <4 x i32> @zext_sub_absd32(<4 x i16>, <4 x i16>) local_unnamed_addr {
+  %3 = zext <4 x i16> %0 to <4 x i32>
+  %4 = zext <4 x i16> %1 to <4 x i32>
+  %5 = sub <4 x i32> %3, %4
+  %6 = sub <4 x i32> zeroinitializer, %5
+  %7 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %5, <4 x i32> %6)
+  ret <4 x i32> %7
+; CHECK-LABEL: zext_sub_absd32
+; CHECK-NOT: xvnegsp
+; CHECK: vabsduw
+; CHECK: blr
+; CHECK-PWR8-LABEL: zext_sub_absd32
+; CHECK-PWR8: vmaxsw
+; CHECK-PWR8: blr
+}
+
+define <8 x i16> @zext_sub_absd16(<8 x i8>, <8 x i8>) local_unnamed_addr {
+  %3 = zext <8 x i8> %0 to <8 x i16>
+  %4 = zext <8 x i8> %1 to <8 x i16>
+  %5 = sub <8 x i16> %3, %4
+  %6 = sub <8 x i16> zeroinitializer, %5
+  %7 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %5, <8 x i16> %6)
+  ret <8 x i16> %7
+; CHECK-LABEL: zext_sub_absd16
+; CHECK-NOT: vadduhm
+; CHECK: vabsduh
+; CHECK: blr
+; CHECK-PWR8-LABEL: zext_sub_absd16
+; CHECK-PWR8: vmaxsh
+; CHECK-PWR8: blr
+}
+
+define <16 x i8> @zext_sub_absd8(<16 x i4>, <16 x i4>) local_unnamed_addr {
+  %3 = zext <16 x i4> %0 to <16 x i8>
+  %4 = zext <16 x i4> %1 to <16 x i8>
+  %5 = sub <16 x i8> %3, %4
+  %6 = sub <16 x i8> zeroinitializer, %5
+  %7 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %5, <16 x i8> %6)
+  ret <16 x i8> %7
+; CHECK-LABEL: zext_sub_absd8
+; CHECK-NOT: vaddubm
+; CHECK: vabsdub
+; CHECK: blr
+; CHECK-PWR8-LABEL: zext_sub_absd8
+; CHECK-PWR8: vmaxsb
+; CHECK-PWR8: blr
+}
 
 declare <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32>, <4 x i32>)
Index: llvm/trunk/test/CodeGen/PowerPC/pre-inc-disable.ll
===================================================================
--- llvm/trunk/test/CodeGen/PowerPC/pre-inc-disable.ll
+++ llvm/trunk/test/CodeGen/PowerPC/pre-inc-disable.ll
@@ -181,12 +181,8 @@
 ; CHECK: vperm v4, v0, v4, v3
 ; CHECK: vperm v2, v5, v0, v2
 ; CHECK: vperm v3, v0, v5, v3
-; CHECK: xvnegsp v5, v1
-; CHECK: xvnegsp v4, v4
-; CHECK: xvnegsp v2, v2
-; CHECK: xvnegsp v3, v3
 ; CHECK: vabsduw v3, v4, v3
-; CHECK: vabsduw v2, v5, v2
+; CHECK: vabsduw v2, v1, v2
 ; CHECK: vadduwm v2, v2, v3
 ; CHECK: xxswapd v3, v2
 ; CHECK: vadduwm v2, v2, v3
@@ -212,12 +208,8 @@
 ; P9BE: vperm v4, v5, v4, v3
 ; P9BE: vperm v2, v5, v0, v2
 ; P9BE: vperm v3, v5, v0, v3
-; P9BE: xvnegsp v5, v1
-; P9BE: xvnegsp v4, v4
-; P9BE: xvnegsp v2, v2
-; P9BE: xvnegsp v3, v3
 ; P9BE: vabsduw v3, v4, v3
-; P9BE: vabsduw v2, v5, v2
+; P9BE: vabsduw v2, v1, v2
 ; P9BE: vadduwm v2, v2, v3
 ; P9BE: xxswapd v3, v2
 ; P9BE: vadduwm v2, v2, v3
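
For reference, below is a minimal scalar C++ sketch (not part of the patch; the helper names are illustrative) of the sign-bit-flip identity that the (i32 1) form of PPCISD::VABSD relies on. Per 32-bit lane, VABSDUW computes an unsigned absolute difference, and XVNEGSP flips the most significant bit, which is the same as adding 0x80000000 modulo 2^32. Assuming the signed subtraction does not overflow (the IR patterns in the tests above carry the nsw flag), flipping both inputs makes the unsigned difference equal to the signed absolute difference.

// Scalar model of one 32-bit lane of VABSDUW(XVNEGSP a, XVNEGSP b).
// Illustrative sketch only; not code from the patch.
#include <cassert>
#include <cstdint>
#include <cstdio>

// XVNEGSP on an integer lane: flip the sign bit (== add 0x80000000 mod 2^32).
static uint32_t flipSignBit(uint32_t X) { return X ^ 0x80000000u; }

// VABSDUW on one lane: unsigned absolute difference.
static uint32_t absdu(uint32_t A, uint32_t B) { return A > B ? A - B : B - A; }

// abs(a - b) on signed lanes, assuming the subtraction does not wrap
// (the IR in the tests uses 'sub nsw').
static uint32_t signedAbsDiff(int32_t A, int32_t B) {
  int64_t D = static_cast<int64_t>(A) - static_cast<int64_t>(B);
  return static_cast<uint32_t>(D < 0 ? -D : D);
}

int main() {
  const int32_t Tests[][2] = {{-1, 0}, {7, -9}, {-100, 100}, {123456, -654321}};
  for (const auto &T : Tests) {
    uint32_t Expected = signedAbsDiff(T[0], T[1]);
    uint32_t Got = absdu(flipSignBit(static_cast<uint32_t>(T[0])),
                         flipSignBit(static_cast<uint32_t>(T[1])));
    assert(Expected == Got); // both forms agree lane-wise
    std::printf("abs(%d - %d) = %u\n", static_cast<int>(T[0]),
                static_cast<int>(T[1]), static_cast<unsigned>(Got));
  }
  return 0;
}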