diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -3836,7 +3836,7 @@
     Opc = Subtarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
   } else {
     assert(LHS.getValueType() == MVT::f128 && "Unknown vt!");
-    assert(Subtarget->hasVSX() && "__float128 requires VSX");
+    assert(Subtarget->hasP9Vector() && "XSCMPUQP requires Power9 Vector");
     Opc = PPC::XSCMPUQP;
   }
   if (Chain)
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1219,6 +1219,20 @@
       setOperationAction(ISD::FP_ROUND, VT, Custom);
       setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom);
     }
+
+    // Expand the SELECT to SELECT_CC.
+    setOperationAction(ISD::SELECT, MVT::f128, Expand);
+
+    setOperationAction(ISD::SETCC, MVT::f128, Custom);
+    setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
+    setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
+
+    // Lower the SELECT_CC for fp128 as follows:
+    //   select_cc x, y, tv, fv, cc ->
+    //     z = setcc x, y, cc   (expanded as a libcall)
+    //     select_cc z, 0, tv, fv, NE
+    for (auto VT : {MVT::i32, MVT::i64, MVT::f128})
+      setOperationAction(ISD::SELECT_CC, VT, Custom);
   }

   if (Subtarget.hasP9Altivec()) {
@@ -3290,21 +3304,43 @@
 }

 SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
-  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  bool IsStrict = Op->isStrictFPOpcode();
+  ISD::CondCode CC =
+      cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
+  SDValue LHS = Op.getOperand(IsStrict ? 1 : 0);
+  SDValue RHS = Op.getOperand(IsStrict ? 2 : 1);
+  SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+  EVT LHSVT = LHS.getValueType();
   SDLoc dl(Op);

+  // Soften the setcc with a libcall if the operand type is fp128.
+  if (LHSVT == MVT::f128) {
+    assert(!Subtarget.hasP9Vector() &&
+           "Don't custom lower the setcc for fp128 with P9 vector enabled");
+    softenSetCCOperands(DAG, LHSVT, LHS, RHS, CC, dl, LHS, RHS, Chain,
+                        Op->getOpcode() == ISD::STRICT_FSETCCS);
+    if (RHS.getNode())
+      LHS = DAG.getNode(ISD::SETCC, dl, Op.getValueType(), LHS, RHS,
+                        DAG.getCondCode(CC));
+    if (IsStrict)
+      return DAG.getMergeValues({LHS, Chain}, dl);
+    return LHS;
+  }
+
+  assert(!IsStrict && "Don't know how to handle the strict setcc");
+
   if (Op.getValueType() == MVT::v2i64) {
     // When the operands themselves are v2i64 values, we need to do something
     // special because VSX has no underlying comparison operations for these.
-    if (Op.getOperand(0).getValueType() == MVT::v2i64) {
+    if (LHS.getValueType() == MVT::v2i64) {
       // Equality can be handled by casting to the legal type for Altivec
       // comparisons, everything else needs to be expanded.
      if (CC == ISD::SETEQ || CC == ISD::SETNE) {
-        return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
-                 DAG.getSetCC(dl, MVT::v4i32,
-                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
-                   DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)),
-                   CC));
+        return DAG.getNode(
+            ISD::BITCAST, dl, MVT::v2i64,
+            DAG.getSetCC(dl, MVT::v4i32,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, LHS),
+                         DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, RHS), CC));
      }

      return SDValue();
@@ -3320,7 +3356,7 @@
   if (SDValue V = lowerCmpEqZeroToCtlzSrl(Op, DAG))
     return V;

-  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
     // Leave comparisons against 0 and -1 alone for now, since they're usually
     // optimized.  FIXME: revisit this when we can custom lower all setcc
     // optimizations.
@@ -3333,11 +3369,9 @@
   // condition register, reading it back out, and masking the correct bit.  The
   // normal approach here uses sub to do this instead of xor.  Using xor exposes
   // the result to other bit-twiddling opportunities.
-  EVT LHSVT = Op.getOperand(0).getValueType();
   if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
     EVT VT = Op.getValueType();
-    SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
-                              Op.getOperand(1));
+    SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, LHS, RHS);
     return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, dl, LHSVT), CC);
   }
   return SDValue();
@@ -7370,18 +7404,32 @@
 /// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
 /// possible.
 SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
-  // Not FP, or using SPE? Not a fsel.
-  if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
-      !Op.getOperand(2).getValueType().isFloatingPoint() || Subtarget.hasSPE())
-    return Op;
-
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
-
   EVT ResVT = Op.getValueType();
   EVT CmpVT = Op.getOperand(0).getValueType();
   SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
-  SDValue TV  = Op.getOperand(2), FV  = Op.getOperand(3);
+  SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
   SDLoc dl(Op);
+
+  // PowerPC has no native instruction to compare fp128 values when Power9
+  // vector is not available, so transform the node as follows and let the
+  // setcc be expanded into a libcall:
+  //   select_cc lhs, rhs, tv, fv, cc ->
+  //     z = setcc lhs, rhs, cc
+  //     select_cc z, 0, tv, fv, NE
+  if (CmpVT == MVT::f128 && !Subtarget.hasP9Vector()) {
+    SDValue Z = DAG.getSetCC(
+        dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT),
+        LHS, RHS, CC);
+    SDValue Zero = DAG.getConstant(0, dl, Z.getValueType());
+    return DAG.getSelectCC(dl, Z, Zero, TV, FV, ISD::SETNE);
+  }
+
+  // Not FP, or using SPE? Not a fsel.
+  if (!CmpVT.isFloatingPoint() || !TV.getValueType().isFloatingPoint() ||
+      Subtarget.hasSPE())
+    return Op;
+
   SDNodeFlags Flags = Op.getNode()->getFlags();

   // We have xsmaxcdp/xsmincdp which are OK to emit even in the
@@ -10308,6 +10356,8 @@
   case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
   case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
   case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+  case ISD::STRICT_FSETCC:
+  case ISD::STRICT_FSETCCS:
   case ISD::SETCC:              return LowerSETCC(Op, DAG);
   case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
   case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
diff --git a/llvm/test/CodeGen/PowerPC/f128-compare.ll b/llvm/test/CodeGen/PowerPC/f128-compare.ll
--- a/llvm/test/CodeGen/PowerPC/f128-compare.ll
+++ b/llvm/test/CodeGen/PowerPC/f128-compare.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -mcpu=pwr9 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs \
 ; RUN:   -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | FileCheck %s
 ; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs \
-; RUN:   -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | FileCheck %s \
+; RUN:   -enable-soft-fp128 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | FileCheck %s \
 ; RUN:   -check-prefix=CHECK-P8

 @a_qp = common dso_local global fp128 0xL00000000000000000000000000000000, align 16
@@ -31,14 +31,12 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
 ; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r5, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    addi r6, r5, a_qp@toc@l
-; CHECK-P8-NEXT:    addi r7, r4, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r3, a_qp@toc@l(r5)
-; CHECK-P8-NEXT:    ld r5, b_qp@toc@l(r4)
-; CHECK-P8-NEXT:    ld r4, 8(r6)
-; CHECK-P8-NEXT:    ld r6, 8(r7)
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    lvx v3, 0, r4
 ; CHECK-P8-NEXT:    bl __gtkf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    extsw r3, r3
@@ -79,14 +77,12 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
 ; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r5, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    addi r6, r5, a_qp@toc@l
-; CHECK-P8-NEXT:    addi r7, r4, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r3, a_qp@toc@l(r5)
-; CHECK-P8-NEXT:    ld r5, b_qp@toc@l(r4)
-; CHECK-P8-NEXT:    ld r4, 8(r6)
-; CHECK-P8-NEXT:    ld r6, 8(r7)
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    lvx v3, 0, r4
 ; CHECK-P8-NEXT:    bl __ltkf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    rlwinm r3, r3, 1, 31, 31
@@ -125,14 +121,12 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
 ; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r5, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    addi r6, r5, a_qp@toc@l
-; CHECK-P8-NEXT:    addi r7, r4, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r3, a_qp@toc@l(r5)
-; CHECK-P8-NEXT:    ld r5, b_qp@toc@l(r4)
-; CHECK-P8-NEXT:    ld r4, 8(r6)
-; CHECK-P8-NEXT:    ld r6, 8(r7)
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    lvx v3, 0, r4
 ; CHECK-P8-NEXT:    bl __gekf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    rlwinm r3, r3, 1, 31, 31
@@ -172,14 +166,12 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
 ; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r5, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    addi r6, r5, a_qp@toc@l
-; CHECK-P8-NEXT:    addi r7, r4, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r3, a_qp@toc@l(r5)
-; CHECK-P8-NEXT:    ld r5, b_qp@toc@l(r4)
-; CHECK-P8-NEXT:    ld r4, 8(r6)
-; CHECK-P8-NEXT:    ld r6, 8(r7)
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    lvx v3, 0, r4
 ; CHECK-P8-NEXT:    bl __lekf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    extsw r3, r3
@@ -221,14 +213,12 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
 ; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r5, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    addi r6, r5, a_qp@toc@l
-; CHECK-P8-NEXT:    addi r7, r4, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r3, a_qp@toc@l(r5)
-; CHECK-P8-NEXT:    ld r5, b_qp@toc@l(r4)
-; CHECK-P8-NEXT:    ld r4, 8(r6)
-; CHECK-P8-NEXT:    ld r6, 8(r7)
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    lvx v3, 0, r4
 ; CHECK-P8-NEXT:    bl __eqkf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    cntlzw r3, r3
@@ -267,14 +257,12 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
 ; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r5, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    addi r6, r5, a_qp@toc@l
-; CHECK-P8-NEXT:    addi r7, r4, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r3, a_qp@toc@l(r5)
-; CHECK-P8-NEXT:    ld r5, b_qp@toc@l(r4)
-; CHECK-P8-NEXT:    ld r4, 8(r6)
-; CHECK-P8-NEXT:    ld r6, 8(r7)
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    lvx v3, 0, r4
 ; CHECK-P8-NEXT:    bl __gtkf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    extsw r3, r3
@@ -316,14 +304,12 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
 ; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r5, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    addi r6, r5, a_qp@toc@l
-; CHECK-P8-NEXT:    addi r7, r4, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r3, a_qp@toc@l(r5)
-; CHECK-P8-NEXT:    ld r5, b_qp@toc@l(r4)
-; CHECK-P8-NEXT:    ld r4, 8(r6)
-; CHECK-P8-NEXT:    ld r6, 8(r7)
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    lvx v3, 0, r4
 ; CHECK-P8-NEXT:    bl __ltkf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    rlwinm r3, r3, 1, 31, 31
@@ -364,14 +350,12 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
 ; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r5, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    addi r6, r5, a_qp@toc@l
-; CHECK-P8-NEXT:    addi r7, r4, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r3, a_qp@toc@l(r5)
-; CHECK-P8-NEXT:    ld r5, b_qp@toc@l(r4)
-; CHECK-P8-NEXT:    ld r4, 8(r6)
-; CHECK-P8-NEXT:    ld r6, 8(r7)
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    lvx v3, 0, r4
 ; CHECK-P8-NEXT:    bl __gekf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    rlwinm r3, r3, 1, 31, 31
@@ -411,14 +395,12 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
 ; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r5, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    addi r6, r5, a_qp@toc@l
-; CHECK-P8-NEXT:    addi r7, r4, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r3, a_qp@toc@l(r5)
-; CHECK-P8-NEXT:    ld r5, b_qp@toc@l(r4)
-; CHECK-P8-NEXT:    ld r4, 8(r6)
-; CHECK-P8-NEXT:    ld r6, 8(r7)
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    lvx v3, 0, r4
 ; CHECK-P8-NEXT:    bl __lekf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    extsw r3, r3
@@ -459,14 +441,12 @@
 ; CHECK-P8-NEXT:    stdu r1, -32(r1)
 ; CHECK-P8-NEXT:    .cfi_def_cfa_offset 32
 ; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
 ; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r5, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    addi r6, r5, a_qp@toc@l
-; CHECK-P8-NEXT:    addi r7, r4, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r3, a_qp@toc@l(r5)
-; CHECK-P8-NEXT:    ld r5, b_qp@toc@l(r4)
-; CHECK-P8-NEXT:    ld r4, 8(r6)
-; CHECK-P8-NEXT:    ld r6, 8(r7)
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    lvx v2, 0, r3
+; CHECK-P8-NEXT:    lvx v3, 0, r4
 ; CHECK-P8-NEXT:    bl __nekf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    cntlzw r3, r3
@@ -503,41 +483,38 @@
 ; CHECK-P8-LABEL: greater_sel_qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r27, -40
-; CHECK-P8-NEXT:    .cfi_offset r28, -32
-; CHECK-P8-NEXT:    .cfi_offset r29, -24
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
-; CHECK-P8-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -80(r1)
-; CHECK-P8-NEXT:    addis r3, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r4, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    ld r30, a_qp@toc@l(r4)
-; CHECK-P8-NEXT:    addi r4, r4, a_qp@toc@l
-; CHECK-P8-NEXT:    ld r29, b_qp@toc@l(r3)
-; CHECK-P8-NEXT:    addi r3, r3, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r28, 8(r4)
-; CHECK-P8-NEXT:    ld r27, 8(r3)
-; CHECK-P8-NEXT:    mr r3, r30
-; CHECK-P8-NEXT:    mr r5, r29
-; CHECK-P8-NEXT:    mr r4, r28
-; CHECK-P8-NEXT:    mr r6, r27
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    .cfi_offset v30, -32
+; CHECK-P8-NEXT:    .cfi_offset v31, -16
+; CHECK-P8-NEXT:    li r3, 48
+; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
+; CHECK-P8-NEXT:    stvx v30, r1, r3 # 16-byte Folded Spill
+; CHECK-P8-NEXT:    li r3, 64
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    stvx v31, r1, r3 # 16-byte Folded Spill
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
+; CHECK-P8-NEXT:    lvx v30, 0, r4
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    lvx v31, 0, r3
+; CHECK-P8-NEXT:    vmr v3, v30
+; CHECK-P8-NEXT:    vmr v2, v31
 ; CHECK-P8-NEXT:    bl __gtkf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    cmpwi r3, 0
-; CHECK-P8-NEXT:    iselgt r3, r30, r29
-; CHECK-P8-NEXT:    iselgt r4, r28, r27
+; CHECK-P8-NEXT:    bgt cr0, .LBB10_2
+; CHECK-P8-NEXT:  # %bb.1: # %entry
+; CHECK-P8-NEXT:    vmr v31, v30
+; CHECK-P8-NEXT:  .LBB10_2: # %entry
+; CHECK-P8-NEXT:    li r3, 64
+; CHECK-P8-NEXT:    vmr v2, v31
+; CHECK-P8-NEXT:    lvx v31, r1, r3 # 16-byte Folded Reload
+; CHECK-P8-NEXT:    li r3, 48
+; CHECK-P8-NEXT:    lvx v30, r1, r3 # 16-byte Folded Reload
 ; CHECK-P8-NEXT:    addi r1, r1, 80
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
-; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
@@ -567,41 +544,38 @@
 ; CHECK-P8-LABEL: less_sel_qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r27, -40
-; CHECK-P8-NEXT:    .cfi_offset r28, -32
-; CHECK-P8-NEXT:    .cfi_offset r29, -24
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
-; CHECK-P8-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -80(r1)
-; CHECK-P8-NEXT:    addis r3, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r4, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    ld r30, a_qp@toc@l(r4)
-; CHECK-P8-NEXT:    addi r4, r4, a_qp@toc@l
-; CHECK-P8-NEXT:    ld r29, b_qp@toc@l(r3)
-; CHECK-P8-NEXT:    addi r3, r3, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r28, 8(r4)
-; CHECK-P8-NEXT:    ld r27, 8(r3)
-; CHECK-P8-NEXT:    mr r3, r30
-; CHECK-P8-NEXT:    mr r5, r29
-; CHECK-P8-NEXT:    mr r4, r28
-; CHECK-P8-NEXT:    mr r6, r27
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    .cfi_offset v30, -32
+; CHECK-P8-NEXT:    .cfi_offset v31, -16
+; CHECK-P8-NEXT:    li r3, 48
+; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
+; CHECK-P8-NEXT:    stvx v30, r1, r3 # 16-byte Folded Spill
+; CHECK-P8-NEXT:    li r3, 64
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    stvx v31, r1, r3 # 16-byte Folded Spill
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
+; CHECK-P8-NEXT:    lvx v30, 0, r4
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    lvx v31, 0, r3
+; CHECK-P8-NEXT:    vmr v3, v30
+; CHECK-P8-NEXT:    vmr v2, v31
 ; CHECK-P8-NEXT:    bl __ltkf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    cmpwi r3, 0
-; CHECK-P8-NEXT:    isellt r3, r30, r29
-; CHECK-P8-NEXT:    isellt r4, r28, r27
+; CHECK-P8-NEXT:    blt cr0, .LBB11_2
+; CHECK-P8-NEXT:  # %bb.1: # %entry
+; CHECK-P8-NEXT:    vmr v31, v30
+; CHECK-P8-NEXT:  .LBB11_2: # %entry
+; CHECK-P8-NEXT:    li r3, 64
+; CHECK-P8-NEXT:    vmr v2, v31
+; CHECK-P8-NEXT:    lvx v31, r1, r3 # 16-byte Folded Reload
+; CHECK-P8-NEXT:    li r3, 48
+; CHECK-P8-NEXT:    lvx v30, r1, r3 # 16-byte Folded Reload
 ; CHECK-P8-NEXT:    addi r1, r1, 80
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
-; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
@@ -632,41 +606,38 @@
 ; CHECK-P8-LABEL: greater_eq_sel_qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r27, -40
-; CHECK-P8-NEXT:    .cfi_offset r28, -32
-; CHECK-P8-NEXT:    .cfi_offset r29, -24
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
-; CHECK-P8-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -80(r1)
-; CHECK-P8-NEXT:    addis r3, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r4, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    ld r30, a_qp@toc@l(r4)
-; CHECK-P8-NEXT:    addi r4, r4, a_qp@toc@l
-; CHECK-P8-NEXT:    ld r29, b_qp@toc@l(r3)
-; CHECK-P8-NEXT:    addi r3, r3, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r28, 8(r4)
-; CHECK-P8-NEXT:    ld r27, 8(r3)
-; CHECK-P8-NEXT:    mr r3, r30
-; CHECK-P8-NEXT:    mr r5, r29
-; CHECK-P8-NEXT:    mr r4, r28
-; CHECK-P8-NEXT:    mr r6, r27
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    .cfi_offset v30, -32
+; CHECK-P8-NEXT:    .cfi_offset v31, -16
+; CHECK-P8-NEXT:    li r3, 48
+; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
+; CHECK-P8-NEXT:    stvx v30, r1, r3 # 16-byte Folded Spill
+; CHECK-P8-NEXT:    li r3, 64
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    stvx v31, r1, r3 # 16-byte Folded Spill
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
+; CHECK-P8-NEXT:    lvx v30, 0, r4
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    lvx v31, 0, r3
+; CHECK-P8-NEXT:    vmr v3, v30
+; CHECK-P8-NEXT:    vmr v2, v31
 ; CHECK-P8-NEXT:    bl __gekf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    cmpwi r3, -1
-; CHECK-P8-NEXT:    iselgt r3, r30, r29
-; CHECK-P8-NEXT:    iselgt r4, r28, r27
+; CHECK-P8-NEXT:    bgt cr0, .LBB12_2
+; CHECK-P8-NEXT:  # %bb.1: # %entry
+; CHECK-P8-NEXT:    vmr v31, v30
+; CHECK-P8-NEXT:  .LBB12_2: # %entry
+; CHECK-P8-NEXT:    li r3, 64
+; CHECK-P8-NEXT:    vmr v2, v31
+; CHECK-P8-NEXT:    lvx v31, r1, r3 # 16-byte Folded Reload
+; CHECK-P8-NEXT:    li r3, 48
+; CHECK-P8-NEXT:    lvx v30, r1, r3 # 16-byte Folded Reload
 ; CHECK-P8-NEXT:    addi r1, r1, 80
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
-; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
@@ -697,41 +668,38 @@
 ; CHECK-P8-LABEL: less_eq_sel_qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r27, -40
-; CHECK-P8-NEXT:    .cfi_offset r28, -32
-; CHECK-P8-NEXT:    .cfi_offset r29, -24
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
-; CHECK-P8-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -80(r1)
-; CHECK-P8-NEXT:    addis r3, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r4, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    ld r30, a_qp@toc@l(r4)
-; CHECK-P8-NEXT:    addi r4, r4, a_qp@toc@l
-; CHECK-P8-NEXT:    ld r29, b_qp@toc@l(r3)
-; CHECK-P8-NEXT:    addi r3, r3, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r28, 8(r4)
-; CHECK-P8-NEXT:    ld r27, 8(r3)
-; CHECK-P8-NEXT:    mr r3, r30
-; CHECK-P8-NEXT:    mr r5, r29
-; CHECK-P8-NEXT:    mr r4, r28
-; CHECK-P8-NEXT:    mr r6, r27
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    .cfi_offset v30, -32
+; CHECK-P8-NEXT:    .cfi_offset v31, -16
+; CHECK-P8-NEXT:    li r3, 48
+; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
+; CHECK-P8-NEXT:    stvx v30, r1, r3 # 16-byte Folded Spill
+; CHECK-P8-NEXT:    li r3, 64
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    stvx v31, r1, r3 # 16-byte Folded Spill
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
+; CHECK-P8-NEXT:    lvx v30, 0, r4
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    lvx v31, 0, r3
+; CHECK-P8-NEXT:    vmr v3, v30
+; CHECK-P8-NEXT:    vmr v2, v31
 ; CHECK-P8-NEXT:    bl __lekf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    cmpwi r3, 1
-; CHECK-P8-NEXT:    isellt r3, r30, r29
-; CHECK-P8-NEXT:    isellt r4, r28, r27
+; CHECK-P8-NEXT:    blt cr0, .LBB13_2
+; CHECK-P8-NEXT:  # %bb.1: # %entry
+; CHECK-P8-NEXT:    vmr v31, v30
+; CHECK-P8-NEXT:  .LBB13_2: # %entry
+; CHECK-P8-NEXT:    li r3, 64
+; CHECK-P8-NEXT:    vmr v2, v31
+; CHECK-P8-NEXT:    lvx v31, r1, r3 # 16-byte Folded Reload
+; CHECK-P8-NEXT:    li r3, 48
+; CHECK-P8-NEXT:    lvx v30, r1, r3 # 16-byte Folded Reload
 ; CHECK-P8-NEXT:    addi r1, r1, 80
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
-; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
@@ -761,41 +729,38 @@
 ; CHECK-P8-LABEL: equal_sel_qp:
 ; CHECK-P8:       # %bb.0: # %entry
 ; CHECK-P8-NEXT:    mflr r0
-; CHECK-P8-NEXT:    .cfi_def_cfa_offset 80
-; CHECK-P8-NEXT:    .cfi_offset lr, 16
-; CHECK-P8-NEXT:    .cfi_offset r27, -40
-; CHECK-P8-NEXT:    .cfi_offset r28, -32
-; CHECK-P8-NEXT:    .cfi_offset r29, -24
-; CHECK-P8-NEXT:    .cfi_offset r30, -16
-; CHECK-P8-NEXT:    std r27, -40(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r28, -32(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
-; CHECK-P8-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
 ; CHECK-P8-NEXT:    std r0, 16(r1)
 ; CHECK-P8-NEXT:    stdu r1, -80(r1)
-; CHECK-P8-NEXT:    addis r3, r2, b_qp@toc@ha
-; CHECK-P8-NEXT:    addis r4, r2, a_qp@toc@ha
-; CHECK-P8-NEXT:    ld r30, a_qp@toc@l(r4)
-; CHECK-P8-NEXT:    addi r4, r4, a_qp@toc@l
-; CHECK-P8-NEXT:    ld r29, b_qp@toc@l(r3)
-; CHECK-P8-NEXT:    addi r3, r3, b_qp@toc@l
-; CHECK-P8-NEXT:    ld r28, 8(r4)
-; CHECK-P8-NEXT:    ld r27, 8(r3)
-; CHECK-P8-NEXT:    mr r3, r30
-; CHECK-P8-NEXT:    mr r5, r29
-; CHECK-P8-NEXT:    mr r4, r28
-; CHECK-P8-NEXT:    mr r6, r27
+; CHECK-P8-NEXT:    .cfi_def_cfa_offset 80
+; CHECK-P8-NEXT:    .cfi_offset lr, 16
+; CHECK-P8-NEXT:    .cfi_offset v30, -32
+; CHECK-P8-NEXT:    .cfi_offset v31, -16
+; CHECK-P8-NEXT:    li r3, 48
+; CHECK-P8-NEXT:    addis r4, r2, b_qp@toc@ha
+; CHECK-P8-NEXT:    stvx v30, r1, r3 # 16-byte Folded Spill
+; CHECK-P8-NEXT:    li r3, 64
+; CHECK-P8-NEXT:    addi r4, r4, b_qp@toc@l
+; CHECK-P8-NEXT:    stvx v31, r1, r3 # 16-byte Folded Spill
+; CHECK-P8-NEXT:    addis r3, r2, a_qp@toc@ha
+; CHECK-P8-NEXT:    lvx v30, 0, r4
+; CHECK-P8-NEXT:    addi r3, r3, a_qp@toc@l
+; CHECK-P8-NEXT:    lvx v31, 0, r3
+; CHECK-P8-NEXT:    vmr v3, v30
+; CHECK-P8-NEXT:    vmr v2, v31
 ; CHECK-P8-NEXT:    bl __eqkf2
 ; CHECK-P8-NEXT:    nop
 ; CHECK-P8-NEXT:    cmplwi r3, 0
-; CHECK-P8-NEXT:    iseleq r3, r30, r29
-; CHECK-P8-NEXT:    iseleq r4, r28, r27
+; CHECK-P8-NEXT:    beq cr0, .LBB14_2
+; CHECK-P8-NEXT:  # %bb.1: # %entry
+; CHECK-P8-NEXT:    vmr v31, v30
+; CHECK-P8-NEXT:  .LBB14_2: # %entry
+; CHECK-P8-NEXT:    li r3, 64
+; CHECK-P8-NEXT:    vmr v2, v31
+; CHECK-P8-NEXT:    lvx v31, r1, r3 # 16-byte Folded Reload
+; CHECK-P8-NEXT:    li r3, 48
+; CHECK-P8-NEXT:    lvx v30, r1, r3 # 16-byte Folded Reload
 ; CHECK-P8-NEXT:    addi r1, r1, 80
 ; CHECK-P8-NEXT:    ld r0, 16(r1)
-; CHECK-P8-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r28, -32(r1) # 8-byte Folded Reload
-; CHECK-P8-NEXT:    ld r27, -40(r1) # 8-byte Folded Reload
 ; CHECK-P8-NEXT:    mtlr r0
 ; CHECK-P8-NEXT:    blr
 entry:
diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-fcmp.ll b/llvm/test/CodeGen/PowerPC/fp-strict-fcmp.ll
--- a/llvm/test/CodeGen/PowerPC/fp-strict-fcmp.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-strict-fcmp.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
-; RUN:   < %s -mtriple=powerpc64-unknown-linux -mcpu=pwr8 | FileCheck %s \
+; RUN:   < %s -enable-soft-fp128 -mtriple=powerpc64-unknown-linux -mcpu=pwr8 | FileCheck %s \
 ; RUN:   -check-prefix=P8
 ; RUN: llc -verify-machineinstrs -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr \
 ; RUN:   < %s -mtriple=powerpc64le-unknown-linux -mcpu=pwr9 \
@@ -1701,36 +1701,32 @@
 ; P8:       # %bb.0:
 ; P8-NEXT:    mflr r0
 ; P8-NEXT:    std r0, 16(r1)
-; P8-NEXT:    stdu r1, -160(r1)
-; P8-NEXT:    std r26, 112(r1) # 8-byte Folded Spill
-; P8-NEXT:    std r27, 120(r1) # 8-byte Folded Spill
-; P8-NEXT:    std r28, 128(r1) # 8-byte Folded Spill
-; P8-NEXT:    std r29, 136(r1) # 8-byte Folded Spill
-; P8-NEXT:    mr r29, r5
-; P8-NEXT:    mr r28, r4
-; P8-NEXT:    mr r27, r3
-; P8-NEXT:    std r30, 144(r1) # 8-byte Folded Spill
-; P8-NEXT:    mr r30, r6
+; P8-NEXT:    stdu r1, -176(r1)
+; P8-NEXT:    li r3, 128
+; P8-NEXT:    std r30, 160(r1) # 8-byte Folded Spill
+; P8-NEXT:    stxvd2x v30, r1, r3 # 16-byte Folded Spill
+; P8-NEXT:    li r3, 144
+; P8-NEXT:    vmr v30, v2
+; P8-NEXT:    stxvd2x v31, r1, r3 # 16-byte Folded Spill
+; P8-NEXT:    vmr v31, v3
 ; P8-NEXT:    bl __unordkf2
 ; P8-NEXT:    nop
+; P8-NEXT:    vmr v2, v30
 ; P8-NEXT:    cntlzw r3, r3
-; P8-NEXT:    mr r4, r28
-; P8-NEXT:    mr r5, r29
-; P8-NEXT:    mr r6, r30
-; P8-NEXT:    srwi r26, r3, 5
-; P8-NEXT:    mr r3, r27
+; P8-NEXT:    vmr v3, v31
+; P8-NEXT:    srwi r30, r3, 5
 ; P8-NEXT:    bl __eqkf2
 ; P8-NEXT:    nop
 ; P8-NEXT:    cntlzw r3, r3
-; P8-NEXT:    ld r30, 144(r1) # 8-byte Folded Reload
-; P8-NEXT:    ld r29, 136(r1) # 8-byte Folded Reload
-; P8-NEXT:    ld r28, 128(r1) # 8-byte Folded Reload
-; P8-NEXT:    ld r27, 120(r1) # 8-byte Folded Reload
+; P8-NEXT:    li r4, 144
 ; P8-NEXT:    srwi r3, r3, 5
+; P8-NEXT:    lxvd2x v31, r1, r4 # 16-byte Folded Reload
+; P8-NEXT:    li r4, 128
 ; P8-NEXT:    xori r3, r3, 1
-; P8-NEXT:    and r3, r26, r3
-; P8-NEXT:    ld r26, 112(r1) # 8-byte Folded Reload
-; P8-NEXT:    addi r1, r1, 160
+; P8-NEXT:    lxvd2x v30, r1, r4 # 16-byte Folded Reload
+; P8-NEXT:    and r3, r30, r3
+; P8-NEXT:    ld r30, 160(r1) # 8-byte Folded Reload
+; P8-NEXT:    addi r1, r1, 176
 ; P8-NEXT:    ld r0, 16(r1)
 ; P8-NEXT:    mtlr r0
 ; P8-NEXT:    blr
@@ -1959,36 +1955,32 @@
 ; P8:       # %bb.0:
 ; P8-NEXT:    mflr r0
 ; P8-NEXT:    std r0, 16(r1)
-; P8-NEXT:    stdu r1, -160(r1)
-; P8-NEXT:    std r26, 112(r1) # 8-byte Folded Spill
-; P8-NEXT:    std r27, 120(r1) # 8-byte Folded Spill
-; P8-NEXT:    std r28, 128(r1) # 8-byte Folded Spill
-; P8-NEXT:    std r29, 136(r1) # 8-byte Folded Spill
-; P8-NEXT:    mr r29, r5
-; P8-NEXT:    mr r28, r4
-; P8-NEXT:    mr r27, r3
-; P8-NEXT:    std r30, 144(r1) # 8-byte Folded Spill
-; P8-NEXT:    mr r30, r6
+; P8-NEXT:    stdu r1, -176(r1)
+; P8-NEXT:    li r3, 128
+; P8-NEXT:    std r30, 160(r1) # 8-byte Folded Spill
+; P8-NEXT:    stxvd2x v30, r1, r3 # 16-byte Folded Spill
+; P8-NEXT:    li r3, 144
+; P8-NEXT:    vmr v30, v2
+; P8-NEXT:    stxvd2x v31, r1, r3 # 16-byte Folded Spill
+; P8-NEXT:    vmr v31, v3
 ; P8-NEXT:    bl __eqkf2
 ; P8-NEXT:    nop
+; P8-NEXT:    vmr v2, v30
 ; P8-NEXT:    cntlzw r3, r3
-; P8-NEXT:    mr r4, r28
-; P8-NEXT:    mr r5, r29
-; P8-NEXT:    mr r6, r30
-; P8-NEXT:    srwi r26, r3, 5
-; P8-NEXT:    mr r3, r27
+; P8-NEXT:    vmr v3, v31
+; P8-NEXT:    srwi r30, r3, 5
 ; P8-NEXT:    bl __unordkf2
 ; P8-NEXT:    nop
 ; P8-NEXT:    cntlzw r3, r3
-; P8-NEXT:    ld r30, 144(r1) # 8-byte Folded Reload
-; P8-NEXT:    ld r29, 136(r1) # 8-byte Folded Reload
-; P8-NEXT:    ld r28, 128(r1) # 8-byte Folded Reload
-; P8-NEXT:    ld r27, 120(r1) # 8-byte Folded Reload
+; P8-NEXT:    li r4, 144
 ; P8-NEXT:    srwi r3, r3, 5
+; P8-NEXT:    lxvd2x v31, r1, r4 # 16-byte Folded Reload
+; P8-NEXT:    li r4, 128
 ; P8-NEXT:    xori r3, r3, 1
-; P8-NEXT:    or r3, r3, r26
-; P8-NEXT:    ld r26, 112(r1) # 8-byte Folded Reload
-; P8-NEXT:    addi r1, r1, 160
+; P8-NEXT:    lxvd2x v30, r1, r4 # 16-byte Folded Reload
+; P8-NEXT:    or r3, r3, r30
+; P8-NEXT:    ld r30, 160(r1) # 8-byte Folded Reload
+; P8-NEXT:    addi r1, r1, 176
 ; P8-NEXT:    ld r0, 16(r1)
 ; P8-NEXT:    mtlr r0
 ; P8-NEXT:    blr
@@ -2305,36 +2297,32 @@
 ; P8:       # %bb.0:
 ; P8-NEXT:    mflr r0
 ; P8-NEXT:    std r0, 16(r1)
-; P8-NEXT:    stdu r1, -160(r1)
-; P8-NEXT:    std r26, 112(r1) # 8-byte Folded Spill
-; P8-NEXT:    std r27, 120(r1) # 8-byte Folded Spill
-; P8-NEXT:    std r28, 128(r1) # 8-byte Folded Spill
-; P8-NEXT:    std r29, 136(r1) # 8-byte Folded Spill
-; P8-NEXT:    mr r29, r5
-; P8-NEXT:    mr r28, r4
-; P8-NEXT:    mr r27, r3
-; P8-NEXT:    std r30, 144(r1) # 8-byte Folded Spill
-; P8-NEXT:    mr r30, r6
+; P8-NEXT:    stdu r1, -176(r1)
+; P8-NEXT:    li r3, 128
+; P8-NEXT:    std r30, 160(r1) # 8-byte Folded Spill
+; P8-NEXT:    stxvd2x v30, r1, r3 # 16-byte Folded Spill
+; P8-NEXT:    li r3, 144
+; P8-NEXT:    vmr v30, v2
+; P8-NEXT:    stxvd2x v31, r1, r3 # 16-byte Folded Spill
+; P8-NEXT:    vmr v31, v3
 ; P8-NEXT:    bl __unordkf2
 ; P8-NEXT:    nop
+; P8-NEXT:    vmr v2, v30
 ; P8-NEXT:    cntlzw r3, r3
-; P8-NEXT:    mr r4, r28
-; P8-NEXT:    mr r5, r29
-; P8-NEXT:    mr r6, r30
-; P8-NEXT:    srwi r26, r3, 5
-; P8-NEXT:    mr r3, r27
+; P8-NEXT:    vmr v3, v31
+; P8-NEXT:    srwi r30, r3, 5
 ; P8-NEXT:    bl __eqkf2
 ; P8-NEXT:    nop
 ; P8-NEXT:    cntlzw r3, r3
-; P8-NEXT:    ld r30, 144(r1) # 8-byte Folded Reload
-; P8-NEXT:    ld r29, 136(r1) # 8-byte Folded Reload
-; P8-NEXT:    ld r28, 128(r1) # 8-byte Folded Reload
-; P8-NEXT:    ld r27, 120(r1) # 8-byte Folded Reload
+; P8-NEXT:    li r4, 144
 ; P8-NEXT:    srwi r3, r3, 5
+; P8-NEXT:    lxvd2x v31, r1, r4 # 16-byte Folded Reload
+; P8-NEXT:    li r4, 128
 ; P8-NEXT:    xori r3, r3, 1
-; P8-NEXT:    and r3, r26, r3
-; P8-NEXT:    ld r26, 112(r1) # 8-byte Folded Reload
-; P8-NEXT:    addi r1, r1, 160
+; P8-NEXT:    lxvd2x v30, r1, r4 # 16-byte Folded Reload
+; P8-NEXT:    and r3, r30, r3
+; P8-NEXT:    ld r30, 160(r1) # 8-byte Folded Reload
+; P8-NEXT:    addi r1, r1, 176
 ; P8-NEXT:    ld r0, 16(r1)
 ; P8-NEXT:    mtlr r0
 ; P8-NEXT:    blr
@@ -2563,36 +2551,32 @@
 ; P8:       # %bb.0:
 ; P8-NEXT:    mflr r0
 ; P8-NEXT:    std r0, 16(r1)
-; P8-NEXT:    stdu r1, -160(r1)
-; P8-NEXT:    std r26, 112(r1) # 8-byte Folded Spill
-; P8-NEXT:    std r27, 120(r1) # 8-byte Folded Spill
-; P8-NEXT:    std r28, 128(r1) # 8-byte Folded Spill
-; P8-NEXT:    std r29, 136(r1) # 8-byte Folded Spill
-; P8-NEXT:    mr r29, r5
-; P8-NEXT:    mr r28, r4
-; P8-NEXT:    mr r27, r3
-; P8-NEXT:    std r30, 144(r1) # 8-byte Folded Spill
-; P8-NEXT:    mr r30, r6
+; P8-NEXT:    stdu r1, -176(r1)
+; P8-NEXT:    li r3, 128
+; P8-NEXT:    std r30, 160(r1) # 8-byte Folded Spill
+; P8-NEXT:    stxvd2x v30, r1, r3 # 16-byte Folded Spill
+; P8-NEXT:    li r3, 144
+; P8-NEXT:    vmr v30, v2
+; P8-NEXT:    stxvd2x v31, r1, r3 # 16-byte Folded Spill
+; P8-NEXT:    vmr v31, v3
 ; P8-NEXT:    bl __eqkf2
 ; P8-NEXT:    nop
+; P8-NEXT:    vmr v2, v30
 ; P8-NEXT:    cntlzw r3, r3
-; P8-NEXT:    mr r4, r28
-; P8-NEXT:    mr r5, r29
-; P8-NEXT:    mr r6, r30
-; P8-NEXT:    srwi r26, r3, 5
-; P8-NEXT:    mr r3, r27
+; P8-NEXT:    vmr v3, v31
+; P8-NEXT:    srwi r30, r3, 5
 ; P8-NEXT:    bl __unordkf2
 ; P8-NEXT:    nop
 ; P8-NEXT:    cntlzw r3, r3
-; P8-NEXT:    ld r30, 144(r1) # 8-byte Folded Reload
-; P8-NEXT:    ld r29, 136(r1) # 8-byte Folded Reload
-; P8-NEXT:    ld r28, 128(r1) # 8-byte Folded Reload
-; P8-NEXT:    ld r27, 120(r1) # 8-byte Folded Reload
+; P8-NEXT:    li r4, 144
 ; P8-NEXT:    srwi r3, r3, 5
+; P8-NEXT:    lxvd2x v31, r1, r4 # 16-byte Folded Reload
+; P8-NEXT:    li r4, 128
 ; P8-NEXT:    xori r3, r3, 1
-; P8-NEXT:    or r3, r3, r26
-; P8-NEXT:    ld r26, 112(r1) # 8-byte Folded Reload
-; P8-NEXT:    addi r1, r1, 160
+; P8-NEXT:    lxvd2x v30, r1, r4 # 16-byte Folded Reload
+; P8-NEXT:    or r3, r3, r30
+; P8-NEXT:    ld r30, 160(r1) # 8-byte Folded Reload
+; P8-NEXT:    addi r1, r1, 176
 ; P8-NEXT:    ld r0, 16(r1)
 ; P8-NEXT:    mtlr r0
 ; P8-NEXT:    blr
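
For reviewers: below is a minimal standalone reproducer, not part of the patch, that exercises both new lowering paths on a subtarget without Power9 vector support. The file name and llc invocation are illustrative; the option and libcall names come from the tests above. The fcmp is softened to a __ltkf2 libcall, and the select is expanded through the custom SELECT_CC lowering (on pwr9 the compare instead selects to xscmpuqp).

; repro.ll -- llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-unknown -enable-soft-fp128 repro.ll
define fp128 @select_min_qp(fp128 %a, fp128 %b) {
entry:
  ; Softened to a call to __ltkf2 on P8 (no native fp128 compare).
  %cmp = fcmp olt fp128 %a, %b
  ; SELECT is expanded to SELECT_CC, then lowered as
  ; select_cc (setcc %a, %b, olt), 0, %a, %b, ne.
  %sel = select i1 %cmp, fp128 %a, fp128 %b
  ret fp128 %sel
}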