diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h --- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h +++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h @@ -25,6 +25,10 @@ REASSOC_XA_BY, REASSOC_XA_YB, + // These are patterns matched by the PowerPC to reassociate FMA chains. + REASSOC_XY_AMM_BMM, + REASSOC_XMM_AMM_BMM, + // These are multiply-add patterns matched by the AArch64 machine combiner. MULADDW_OP1, MULADDW_OP2, diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -269,6 +269,8 @@ case MachineCombinerPattern::REASSOC_AX_YB: case MachineCombinerPattern::REASSOC_XA_BY: case MachineCombinerPattern::REASSOC_XA_YB: + case MachineCombinerPattern::REASSOC_XY_AMM_BMM: + case MachineCombinerPattern::REASSOC_XMM_AMM_BMM: return CombinerObjective::MustReduceDepth; default: return CombinerObjective::Default; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h @@ -229,6 +229,11 @@ unsigned getSpillTarget() const; const unsigned *getStoreOpcodesForSpillArray() const; const unsigned *getLoadOpcodesForSpillArray() const; + int16_t getFMAOpIdxInfo(unsigned Opcode) const; + void reassociateFMA(MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) const; virtual void anchor(); protected: @@ -308,6 +313,20 @@ return true; } + /// When getMachineCombinerPatterns() finds patterns, this function generates + /// the instructions that could replace the original code sequence + void genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) const 
override; + + /// Return true when there is potentially a faster code sequence for a fma + /// chain ending in \p Root. All potential patterns are output in the \p + /// P array. + bool getFMAPatterns(MachineInstr &Root, + SmallVectorImpl &P) const; + /// Return true when there is potentially a faster code sequence /// for an instruction chain ending in . All potential patterns are /// output in the array. @@ -317,6 +336,16 @@ bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; + /// On PowerPC, we try to reassociate FMA chain which will increase + /// instruction size. Set extension resource length limit to 1 for edge case. + /// Resource Length is calculated by scaled resource usage in getCycles(). + /// Because of the division in getCycles(), it returns different cycles due to + /// legacy scaled resource usage. So new resource length may be the same as + /// legacy or 1 bigger than legacy. + /// We need to exclude the 1 bigger case even if the resource length is not + /// preserved for more FMA chain reassociations on PowerPC. 
+ int getExtendResourceLenLimit() const override { return 1; } + void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2, MachineInstr &NewMI1, MachineInstr &NewMI2) const override; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -280,6 +280,144 @@ } } +#define InfoArrayIdxFMAInst 0 +#define InfoArrayIdxFAddInst 1 +#define InfoArrayIdxFMULInst 2 +#define InfoArrayIdxAddOpIdx 3 +#define InfoArrayIdxMULOpIdx 4 +// Array keeps info for FMA instructions: +// Index 0(InfoArrayIdxFMAInst): FMA instruction; +// Index 1(InfoArrayIdxFAddInst): ADD instruction associated with FMA; +// Index 2(InfoArrayIdxFMULInst): MUL instruction associated with FMA; +// Index 3(InfoArrayIdxAddOpIdx): ADD operand index in the FMA operand list; +// Index 4(InfoArrayIdxMULOpIdx): first MUL operand index in the FMA operand +// list; +// second MUL operand index is plus 1. +static const uint16_t FMAOpIdxInfo[][5] = { + // FIXME: add more FMA instructions like XSNMADDADP and so on. + {PPC::XSMADDADP, PPC::XSADDDP, PPC::XSMULDP, 1, 2}, + {PPC::XSMADDASP, PPC::XSADDSP, PPC::XSMULSP, 1, 2}, + {PPC::XVMADDADP, PPC::XVADDDP, PPC::XVMULDP, 1, 2}, + {PPC::XVMADDASP, PPC::XVADDSP, PPC::XVMULSP, 1, 2}, + {PPC::FMADD, PPC::FADD, PPC::FMUL, 3, 1}, + {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1}, + {PPC::QVFMADDSs, PPC::QVFADDSs, PPC::QVFMULSs, 3, 1}, + {PPC::QVFMADD, PPC::QVFADD, PPC::QVFMUL, 3, 1}}; + +// Check if an opcode is a FMA instruction. If it is, return the index in array +// FMAOpIdxInfo. Otherwise, return -1. 
+int16_t PPCInstrInfo::getFMAOpIdxInfo(unsigned Opcode) const { + for (unsigned I = 0; I < array_lengthof(FMAOpIdxInfo); I++) + if (FMAOpIdxInfo[I][InfoArrayIdxFMAInst] == Opcode) + return I; + return -1; +} + +// Try to reassociate FMA chains like below: +// +// Pattern 1: +// A = FADD X, Y (Leaf) +// B = FMA A, M21, M22 (Prev) +// C = FMA B, M31, M32 (Root) +// --> +// A = FMA X, M21, M22 +// B = FMA Y, M31, M32 +// C = FADD A, B +// +// Pattern 2: +// A = FMA X, M11, M12 (Leaf) +// B = FMA A, M21, M22 (Prev) +// C = FMA B, M31, M32 (Root) +// --> +// A = FMUL M11, M12 +// B = FMA X, M21, M22 +// D = FMA A, M31, M32 +// C = FADD B, D +// +// breaking the dependency between A and B, allowing FMA to be executed in +// parallel (or back-to-back in a pipeline) instead of depending on each other. +bool PPCInstrInfo::getFMAPatterns( + MachineInstr &Root, + SmallVectorImpl &Patterns) const { + MachineBasicBlock *MBB = Root.getParent(); + const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + + auto IsAllOpsVirtualReg = [](const MachineInstr &Instr) { + for (const auto &MO : Instr.explicit_operands()) + if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg()))) + return false; + return true; + }; + + auto IsReassociable = [&](const MachineInstr &Instr, int16_t &AddOpIdx, + bool IsLeaf, bool IsAdd) { + int16_t Idx = -1; + if (!IsAdd) { + Idx = getFMAOpIdxInfo(Instr.getOpcode()); + if (Idx < 0) + return false; + } else if (Instr.getOpcode() != + FMAOpIdxInfo[getFMAOpIdxInfo(Root.getOpcode())] + [InfoArrayIdxFAddInst]) + return false; + + // Instruction can be reassociated. + // fast math flags may prohibit reassociation. + if (!(Instr.getFlag(MachineInstr::MIFlag::FmReassoc) && + Instr.getFlag(MachineInstr::MIFlag::FmNsz))) + return false; + + // Instruction operands are virtual registers for reassociating. 
+ if (!IsAllOpsVirtualReg(Instr)) + return false; + + if (IsAdd && IsLeaf) + return true; + + AddOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxAddOpIdx]; + + const MachineOperand &OpAdd = Instr.getOperand(AddOpIdx); + MachineInstr *MIAdd = MRI.getUniqueVRegDef(OpAdd.getReg()); + // If 'add' operand's def is not in current block, don't do ILP related opt. + if (!MIAdd || MIAdd->getParent() != MBB) + return false; + + // If this is not Leaf FMA Instr, its 'add' operand should only have one use + // as this fma will be changed later. + return IsLeaf ? true : MRI.hasOneNonDBGUse(OpAdd.getReg()); + }; + + int16_t AddOpIdx = -1; + // Root must be a valid FMA like instruction. + if (!IsReassociable(Root, AddOpIdx, false, false)) + return false; + + assert((AddOpIdx >= 0) && "add operand index not right!"); + + Register RegB = Root.getOperand(AddOpIdx).getReg(); + MachineInstr *Prev = MRI.getUniqueVRegDef(RegB); + + // Prev must be a valid FMA like instruction. + AddOpIdx = -1; + if (!IsReassociable(*Prev, AddOpIdx, false, false)) + return false; + + assert((AddOpIdx >= 0) && "add operand index not right!"); + + Register RegA = Prev->getOperand(AddOpIdx).getReg(); + MachineInstr *Leaf = MRI.getUniqueVRegDef(RegA); + AddOpIdx = -1; + if (IsReassociable(*Leaf, AddOpIdx, true, false)) { + Patterns.push_back(MachineCombinerPattern::REASSOC_XMM_AMM_BMM); + return true; + } + if (IsReassociable(*Leaf, AddOpIdx, true, true)) { + Patterns.push_back(MachineCombinerPattern::REASSOC_XY_AMM_BMM); + return true; + } + return false; +} + bool PPCInstrInfo::getMachineCombinerPatterns( MachineInstr &Root, SmallVectorImpl &Patterns) const { @@ -288,9 +426,198 @@ if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive) return false; + if (getFMAPatterns(Root, Patterns)) + return true; + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns); } +void PPCInstrInfo::genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl 
&InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) const { + switch (Pattern) { + case MachineCombinerPattern::REASSOC_XY_AMM_BMM: + case MachineCombinerPattern::REASSOC_XMM_AMM_BMM: + reassociateFMA(Root, Pattern, InsInstrs, DelInstrs, InstrIdxForVirtReg); + break; + default: + // Reassociate default patterns. + TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, + DelInstrs, InstrIdxForVirtReg); + break; + } +} + +// Currently, only handle two patterns REASSOC_XY_AMM_BMM and +// REASSOC_XMM_AMM_BMM. See comments for getFMAPatterns. +void PPCInstrInfo::reassociateFMA( + MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) const { + MachineFunction *MF = Root.getMF(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineOperand &OpC = Root.getOperand(0); + Register RegC = OpC.getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(RegC); + MRI.constrainRegClass(RegC, RC); + + unsigned FmaOp = Root.getOpcode(); + int16_t Idx = getFMAOpIdxInfo(FmaOp); + assert(Idx >= 0 && "Root must be a FMA instruction"); + + uint16_t AddOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxAddOpIdx]; + uint16_t FirstMulOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxMULOpIdx]; + MachineInstr *Prev = MRI.getUniqueVRegDef(Root.getOperand(AddOpIdx).getReg()); + MachineInstr *Leaf = + MRI.getUniqueVRegDef(Prev->getOperand(AddOpIdx).getReg()); + uint16_t IntersectedFlags = + Root.getFlags() & Prev->getFlags() & Leaf->getFlags(); + + auto GetOperandInfo = [&](const MachineOperand &Operand, Register &Reg, + bool &KillFlag) { + Reg = Operand.getReg(); + MRI.constrainRegClass(Reg, RC); + KillFlag = Operand.isKill(); + }; + + auto GetFMAInstrInfo = [&](const MachineInstr &Instr, Register &MulOp1, + Register &MulOp2, bool &MulOp1KillFlag, + bool &MulOp2KillFlag) { + GetOperandInfo(Instr.getOperand(FirstMulOpIdx), MulOp1, MulOp1KillFlag); + 
GetOperandInfo(Instr.getOperand(FirstMulOpIdx + 1), MulOp2, MulOp2KillFlag); + }; + + Register RegM11, RegM12, RegX, RegY, RegM21, RegM22, RegM31, RegM32; + bool KillX = false, KillY = false, KillM11 = false, KillM12 = false, + KillM21 = false, KillM22 = false, KillM31 = false, KillM32 = false; + + GetFMAInstrInfo(Root, RegM31, RegM32, KillM31, KillM32); + GetFMAInstrInfo(*Prev, RegM21, RegM22, KillM21, KillM22); + + if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) { + GetFMAInstrInfo(*Leaf, RegM11, RegM12, KillM11, KillM12); + GetOperandInfo(Leaf->getOperand(AddOpIdx), RegX, KillX); + } else if (Pattern == MachineCombinerPattern::REASSOC_XY_AMM_BMM) { + GetOperandInfo(Leaf->getOperand(1), RegX, KillX); + GetOperandInfo(Leaf->getOperand(2), RegY, KillY); + } + + // Create new virtual registers for the new results instead of + // recycling legacy ones because the MachineCombiner's computation of the + // critical path requires a new register definition rather than an existing + // one. 
+ Register NewVRA = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(NewVRA, 0)); + + Register NewVRB = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(NewVRB, 1)); + + Register NewVRD = 0; + if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) { + NewVRD = MRI.createVirtualRegister(RC); + InstrIdxForVirtReg.insert(std::make_pair(NewVRD, 2)); + } + + auto AdjustOperandOrder = [&](MachineInstr *MI, Register RegAdd, bool KillAdd, + Register RegMul1, bool KillRegMul1, + Register RegMul2, bool KillRegMul2) { + MI->getOperand(AddOpIdx).setReg(RegAdd); + MI->getOperand(AddOpIdx).setIsKill(KillAdd); + MI->getOperand(FirstMulOpIdx).setReg(RegMul1); + MI->getOperand(FirstMulOpIdx).setIsKill(KillRegMul1); + MI->getOperand(FirstMulOpIdx + 1).setReg(RegMul2); + MI->getOperand(FirstMulOpIdx + 1).setIsKill(KillRegMul2); + }; + + if (Pattern == MachineCombinerPattern::REASSOC_XY_AMM_BMM) { + // Create new instructions for insertion. + MachineInstrBuilder MINewB = + BuildMI(*MF, Prev->getDebugLoc(), get(FmaOp), NewVRB) + .addReg(RegX, getKillRegState(KillX)) + .addReg(RegM21, getKillRegState(KillM21)) + .addReg(RegM22, getKillRegState(KillM22)); + MachineInstrBuilder MINewA = + BuildMI(*MF, Root.getDebugLoc(), get(FmaOp), NewVRA) + .addReg(RegY, getKillRegState(KillY)) + .addReg(RegM31, getKillRegState(KillM31)) + .addReg(RegM32, getKillRegState(KillM32)); + // if AddOpIdx is not 1, adjust the order. + if (AddOpIdx != 1) { + AdjustOperandOrder(MINewB, RegX, KillX, RegM21, KillM21, RegM22, KillM22); + AdjustOperandOrder(MINewA, RegY, KillY, RegM31, KillM31, RegM32, KillM32); + } + + MachineInstrBuilder MINewC = + BuildMI(*MF, Root.getDebugLoc(), + get(FMAOpIdxInfo[Idx][InfoArrayIdxFAddInst]), RegC) + .addReg(NewVRB, getKillRegState(true)) + .addReg(NewVRA, getKillRegState(true)); + + // update flags for new created instructions. 
+ setSpecialOperandAttr(*MINewA, IntersectedFlags); + setSpecialOperandAttr(*MINewB, IntersectedFlags); + setSpecialOperandAttr(*MINewC, IntersectedFlags); + + // Record new instructions for insertion. + InsInstrs.push_back(MINewA); + InsInstrs.push_back(MINewB); + InsInstrs.push_back(MINewC); + } else if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) { + assert(NewVRD && "new FMA register not created!"); + // Create new instructions for insertion. + MachineInstrBuilder MINewA = + BuildMI(*MF, Leaf->getDebugLoc(), + get(FMAOpIdxInfo[Idx][InfoArrayIdxFMULInst]), NewVRA) + .addReg(RegM11, getKillRegState(KillM11)) + .addReg(RegM12, getKillRegState(KillM12)); + MachineInstrBuilder MINewB = + BuildMI(*MF, Prev->getDebugLoc(), get(FmaOp), NewVRB) + .addReg(RegX, getKillRegState(KillX)) + .addReg(RegM21, getKillRegState(KillM21)) + .addReg(RegM22, getKillRegState(KillM22)); + MachineInstrBuilder MINewD = + BuildMI(*MF, Root.getDebugLoc(), get(FmaOp), NewVRD) + .addReg(NewVRA, getKillRegState(true)) + .addReg(RegM31, getKillRegState(KillM31)) + .addReg(RegM32, getKillRegState(KillM32)); + // If AddOpIdx is not 1, adjust the order. + if (AddOpIdx != 1) { + AdjustOperandOrder(MINewB, RegX, KillX, RegM21, KillM21, RegM22, KillM22); + AdjustOperandOrder(MINewD, NewVRA, true, RegM31, KillM31, RegM32, + KillM32); + } + + MachineInstrBuilder MINewC = + BuildMI(*MF, Root.getDebugLoc(), + get(FMAOpIdxInfo[Idx][InfoArrayIdxFAddInst]), RegC) + .addReg(NewVRB, getKillRegState(true)) + .addReg(NewVRD, getKillRegState(true)); + + // update flags for new created instructions. + setSpecialOperandAttr(*MINewA, IntersectedFlags); + setSpecialOperandAttr(*MINewB, IntersectedFlags); + setSpecialOperandAttr(*MINewD, IntersectedFlags); + setSpecialOperandAttr(*MINewC, IntersectedFlags); + + // Record new instructions for insertion. 
+ InsInstrs.push_back(MINewA); + InsInstrs.push_back(MINewB); + InsInstrs.push_back(MINewD); + InsInstrs.push_back(MINewC); + } + + assert(!InsInstrs.empty() && + "Insertion instructions set should not be empty!"); + + // Record old instructions for deletion. + DelInstrs.push_back(Leaf); + DelInstrs.push_back(Prev); + DelInstrs.push_back(&Root); +} + // Detect 32 -> 64-bit extensions where we may reuse the low sub-register. bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg, Register &DstReg, diff --git a/llvm/test/CodeGen/PowerPC/machine-combiner.ll b/llvm/test/CodeGen/PowerPC/machine-combiner.ll --- a/llvm/test/CodeGen/PowerPC/machine-combiner.ll +++ b/llvm/test/CodeGen/PowerPC/machine-combiner.ll @@ -217,12 +217,12 @@ define double @reassociate_mamaa_double(double %0, double %1, double %2, double %3, double %4, double %5) { ; CHECK-LABEL: reassociate_mamaa_double: ; CHECK: # %bb.0: -; CHECK-QPX: fadd 0, 2, 1 -; CHECK-QPX: fmadd 0, 4, 3, 0 -; CHECK-QPX: fmadd 1, 6, 5, 0 -; CHECK-PWR: xsadddp 1, 2, 1 -; CHECK-PWR: xsmaddadp 1, 4, 3 -; CHECK-PWR: xsmaddadp 1, 6, 5 +; CHECK-QPX-DAG: fmadd [[REG0:[0-9]+]], 4, 3, 2 +; CHECK-QPX-DAG: fmadd [[REG1:[0-9]+]], 6, 5, 1 +; CHECK-QPX: fadd 1, [[REG0]], [[REG1]] +; CHECK-PWR-DAG: xsmaddadp 1, 6, 5 +; CHECK-PWR-DAG: xsmaddadp 2, 4, 3 +; CHECK-PWR: xsadddp 1, 2, 1 ; CHECK-NEXT: blr %7 = fmul reassoc nsz double %3, %2 %8 = fmul reassoc nsz double %5, %4 @@ -235,10 +235,10 @@ ; FIXME: should use xsmaddasp instead of fmadds for pwr7 arch. 
define float @reassociate_mamaa_float(float %0, float %1, float %2, float %3, float %4, float %5) { ; CHECK-LABEL: reassociate_mamaa_float: -; CHECK: # %bb.0: -; CHECK: fadds 0, 2, 1 -; CHECK: fmadds 0, 4, 3, 0 -; CHECK: fmadds 1, 6, 5, 0 +; CHECK: # %bb.0: +; CHECK-DAG: fmadds [[REG0:[0-9]+]], 4, 3, 2 +; CHECK-DAG: fmadds [[REG1:[0-9]+]], 6, 5, 1 +; CHECK: fadds 1, [[REG0]], [[REG1]] ; CHECK-NEXT: blr %7 = fmul reassoc nsz float %3, %2 %8 = fmul reassoc nsz float %5, %4 @@ -251,12 +251,12 @@ define <4 x float> @reassociate_mamaa_vec(<4 x float> %0, <4 x float> %1, <4 x float> %2, <4 x float> %3, <4 x float> %4, <4 x float> %5) { ; CHECK-LABEL: reassociate_mamaa_vec: ; CHECK: # %bb.0: -; CHECK-QPX: qvfadds 0, 2, 1 -; CHECK-QPX: qvfmadds 0, 4, 3, 0 -; CHECK-QPX: qvfmadds 1, 6, 5, 0 -; CHECK-PWR: xvaddsp 34, 35, 34 -; CHECK-PWR: xvmaddasp 34, 37, 36 -; CHECK-PWR: xvmaddasp 34, 39, 38 +; CHECK-QPX-DAG: qvfmadds [[REG0:[0-9]+]], 4, 3, 2 +; CHECK-QPX-DAG: qvfmadds [[REG1:[0-9]+]], 6, 5, 1 +; CHECK-QPX: qvfadds 1, [[REG0]], [[REG1]] +; CHECK-PWR-DAG: xvmaddasp [[REG0:[0-9]+]], 39, 38 +; CHECK-PWR-DAG: xvmaddasp [[REG1:[0-9]+]], 37, 36 +; CHECK-PWR: xvaddsp 34, [[REG1]], [[REG0]] ; CHECK-NEXT: blr %7 = fmul reassoc nsz <4 x float> %3, %2 %8 = fmul reassoc nsz <4 x float> %5, %4 @@ -269,15 +269,16 @@ define double @reassociate_mamama_double(double %0, double %1, double %2, double %3, double %4, double %5, double %6, double %7, double %8) { ; CHECK-LABEL: reassociate_mamama_double: ; CHECK: # %bb.0: -; CHECK-QPX: fmadd 0, 2, 1, 7 -; CHECK-QPX-DAG: fmadd 0, 4, 3, 0 -; CHECK-QPX-DAG: fmadd 0, 6, 5, 0 -; CHECK-QPX: fmadd 1, 9, 8, 0 +; CHECK-QPX: fmadd [[REG0:[0-9]+]], 2, 1, 7 +; CHECK-QPX-DAG: fmul [[REG1:[0-9]+]], 4, 3 +; CHECK-QPX-DAG: fmadd [[REG2:[0-9]+]], 6, 5, [[REG0]] +; CHECK-QPX-DAG: fmadd [[REG3:[0-9]+]], 9, 8, [[REG1]] +; CHECK-QPX: fadd 1, [[REG2]], [[REG3]] ; CHECK-PWR: xsmaddadp 7, 2, 1 -; CHECK-PWR-DAG: xsmaddadp 7, 4, 3 +; CHECK-PWR-DAG: xsmuldp 
[[REG0:[0-9]+]], 4, 3 ; CHECK-PWR-DAG: xsmaddadp 7, 6, 5 -; CHECK-PWR-DAG: xsmaddadp 7, 9, 8 -; CHECK-PWR: fmr 1, 7 +; CHECK-PWR-DAG: xsmaddadp [[REG0]], 9, 8 +; CHECK-PWR: xsadddp 1, 7, [[REG0]] ; CHECK-NEXT: blr %10 = fmul reassoc nsz double %1, %0 %11 = fmul reassoc nsz double %3, %2 @@ -295,16 +296,18 @@ float %9, float %10, float %11, float %12, float %13, float %14, float %15, float %16) { ; CHECK-LABEL: reassociate_mamama_8: ; CHECK: # %bb.0: -; CHECK: fmadds [[REG0:[0-9]+]], 3, 2, 1 -; CHECK-DAG: fmadds [[REG0]], 5, 4, [[REG0]] -; CHECK-DAG: fmadds [[REG0]], 7, 6, [[REG0]] -; CHECK-DAG: fmadds [[REG0]], 9, 8, [[REG0]] -; CHECK-DAG: fmadds [[REG0]], 13, 12, [[REG0]] -; CHECK-DAG: fmadds [[REG0]], 11, 10, [[REG0]] +; CHECK-DAG: fmadds [[REG0:[0-9]+]], 3, 2, 1 +; CHECK-DAG: fmuls [[REG1:[0-9]+]], 5, 4 +; CHECK-DAG: fmadds [[REG2:[0-9]+]], 7, 6, [[REG0]] +; CHECK-DAG: fmadds [[REG3:[0-9]+]], 9, 8, [[REG1]] ; -; CHECK: fmadds [[REG0]], -; CHECK: fmadds 1, -; CHECK-NEXT: blr +; CHECK-DAG: fmadds [[REG4:[0-9]+]], 13, 12, [[REG3]] +; CHECK-DAG: fmadds [[REG5:[0-9]+]], 11, 10, [[REG2]] +; +; CHECK-DAG: fmadds [[REG6:[0-9]+]], 3, 2, [[REG4]] +; CHECK-DAG: fmadds [[REG7:[0-9]+]], 5, 4, [[REG5]] +; CHECK: fadds 1, [[REG7]], [[REG6]] +; CHECK-NEXT: blr %18 = fmul reassoc nsz float %2, %1 %19 = fadd reassoc nsz float %18, %0 %20 = fmul reassoc nsz float %4, %3