Index: llvm/include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- llvm/include/llvm/CodeGen/MachineCombinerPattern.h
+++ llvm/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -176,6 +176,16 @@
   FMSUB,
   FNMSUB,
 
+  // SystemZ FMA patterns (experimental).
+  REASSOC_FMA1Add_L,
+  REASSOC_FMA1Add_R,
+  REASSOC_FMA2Add,  // PPC REASSOC_XY_AMM_BMM
+  REASSOC_FMA2,
+  REASSOC_FMA3,
+  REASSOC_FMA3_Ch,  // PPC REASSOC_XMM_AMM_BMM
+  REASSOC_FMA4,
+  REASSOC_FMA4_Ch,
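+  // Naming: the number gives the length of the FMA chain handled; the "_Ch"
+  // variants keep the incoming accumulator chained into the first FMA, and
+  // the "FMA1Add"/"FMA2Add" patterns cover chains ending in a plain add.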
+
   // X86 VNNI
   DPWSSD,
Index: llvm/include/llvm/CodeGen/TargetInstrInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1634,7 +1634,7 @@
   /// instruction that defines FoldAsLoadDefReg, and the function returns
   /// the machine instruction generated due to folding.
   virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI,
-                                          const MachineRegisterInfo *MRI,
+                                          MachineRegisterInfo *MRI,
                                           Register &FoldAsLoadDefReg,
                                           MachineInstr *&DefMI) const {
     return nullptr;
Index: llvm/lib/CodeGen/MachineCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/MachineCombiner.cpp
+++ llvm/lib/CodeGen/MachineCombiner.cpp
@@ -305,6 +305,9 @@
   Default // The critical path must not be lengthened.
 };
 
+// EXPERIMENTAL
+static cl::opt<bool> FMA4_EQDEPTH("fma4-eqdepth", cl::init(false));
+
 static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
   // TODO: If C++ ever gets a real enum class, make this part of the
   // MachineCombinerPattern class.
@@ -321,10 +324,23 @@
   case MachineCombinerPattern::FMADD_XA:
   case MachineCombinerPattern::FMSUB:
   case MachineCombinerPattern::FNMSUB:
+
+  case MachineCombinerPattern::REASSOC_FMA1Add_L:
+  case MachineCombinerPattern::REASSOC_FMA1Add_R:
+  case MachineCombinerPattern::REASSOC_FMA2:
+  case MachineCombinerPattern::REASSOC_FMA3:
+  case MachineCombinerPattern::REASSOC_FMA4_Ch:
     return CombinerObjective::MustReduceDepth;
+
+  case MachineCombinerPattern::REASSOC_FMA4:
+    return FMA4_EQDEPTH ? CombinerObjective::Default
+                        : CombinerObjective::MustReduceDepth;
+
   case MachineCombinerPattern::REASSOC_XY_BCA:
   case MachineCombinerPattern::REASSOC_XY_BAC:
     return CombinerObjective::MustReduceRegisterPressure;
+
   default:
     return CombinerObjective::Default;
   }
Index: llvm/lib/Target/SystemZ/CMakeLists.txt
===================================================================
--- llvm/lib/Target/SystemZ/CMakeLists.txt
+++ llvm/lib/Target/SystemZ/CMakeLists.txt
@@ -20,6 +20,7 @@
   SystemZConstantPoolValue.cpp
   SystemZCopyPhysRegs.cpp
   SystemZElimCompare.cpp
+  SystemZFinalizeReassociation.cpp
   SystemZFrameLowering.cpp
   SystemZHazardRecognizer.cpp
   SystemZISelDAGToDAG.cpp
@@ -34,6 +35,7 @@
   SystemZRegisterInfo.cpp
   SystemZSelectionDAGInfo.cpp
   SystemZShortenInst.cpp
+  SystemZReassocAdditions.cpp
   SystemZSubtarget.cpp
   SystemZTargetMachine.cpp
   SystemZTargetTransformInfo.cpp
Index: llvm/lib/Target/SystemZ/SystemZ.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZ.h
+++ llvm/lib/Target/SystemZ/SystemZ.h
@@ -195,16 +195,20 @@
 FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZCopyPhysRegsPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZReassocAdditionsPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZFinalizeReassociationPass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZPostRewritePass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZTDCPass();
 
 void initializeSystemZCopyPhysRegsPass(PassRegistry &);
 void initializeSystemZDAGToDAGISelPass(PassRegistry &);
 void initializeSystemZElimComparePass(PassRegistry &);
+void initializeSystemZFinalizeReassociationPass(PassRegistry &);
 void initializeSystemZLDCleanupPass(PassRegistry &);
 void initializeSystemZLongBranchPass(PassRegistry &);
 void initializeSystemZPostRewritePass(PassRegistry &);
 void initializeSystemZShortenInstPass(PassRegistry &);
+void initializeSystemZReassocAdditionsPass(PassRegistry &);
 void initializeSystemZTDCPassPass(PassRegistry &);
 
 } // end namespace llvm
Index: llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp
@@ -0,0 +1,127 @@
+//===--- SystemZFinalizeReassociation.cpp - Finalize FP reassociation ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is the last step of the process of enabling reassociation with
+// the MachineCombiner. These are the steps involved:
+//
+// 1. Instruction selection: Disable reg/mem folding for any operations that
+//    are reassociable, since MachineCombiner will not succeed
+//    otherwise. Instead select a reg/reg pseudo that pretends to clobber CC.
+//
+// 2. MachineCombiner: Performs reassociation with the reg/reg instructions.
+//
+// 3. PeepholeOptimizer: Folds loads into reg/mem instructions after
+//    reassociation. The reg/mem opcode sets CC, which is why the special
+//    reg/reg pseudo is needed.
+//
+// 4. Convert any remaining pseudos into the target opcodes that do not
+//    clobber CC (this pass).
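+//
+// As a rough illustration (hypothetical MIR, register names invented), an
+// fadd selected in step 1 as
+//   %sum:vr64bit = WFADB_CCPseudo %x, %y, implicit-def dead $cc
+// either gets a load folded into it in step 3,
+//   %sum:fp64bit = ADB %x, <mem>, implicit-def dead $cc
+// or, if no fold was possible, becomes the plain opcode here in step 4:
+//   %sum:vr64bit = WFADB %x, %y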
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+class SystemZFinalizeReassociation : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZFinalizeReassociation()
+      : MachineFunctionPass(ID), TII(nullptr), MRI(nullptr) {
+    initializeSystemZFinalizeReassociationPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  bool visitMBB(MachineBasicBlock &MBB);
+
+  const SystemZInstrInfo *TII;
+  MachineRegisterInfo *MRI;
+};
+
+char SystemZFinalizeReassociation::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(SystemZFinalizeReassociation, "systemz-finalize-reassoc",
+                "SystemZ Finalize Reassociation", false, false)
+
+FunctionPass *llvm::
+createSystemZFinalizeReassociationPass(SystemZTargetMachine &TM) {
+  return new SystemZFinalizeReassociation();
+}
+
+void SystemZFinalizeReassociation::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool SystemZFinalizeReassociation::visitMBB(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+    unsigned PseudoOpcode = MI.getOpcode();
+    unsigned TargetOpcode =
+        PseudoOpcode == SystemZ::WFADB_CCPseudo    ? SystemZ::WFADB
+        : PseudoOpcode == SystemZ::WFASB_CCPseudo  ? SystemZ::WFASB
+        : PseudoOpcode == SystemZ::WFSDB_CCPseudo  ? SystemZ::WFSDB
+        : PseudoOpcode == SystemZ::WFSSB_CCPseudo  ? SystemZ::WFSSB
+        : PseudoOpcode == SystemZ::WFMDB_CCPseudo  ? SystemZ::WFMDB
+        : PseudoOpcode == SystemZ::WFMSB_CCPseudo  ? SystemZ::WFMSB
+        : PseudoOpcode == SystemZ::WFMADB_CCPseudo ? SystemZ::WFMADB
+        : PseudoOpcode == SystemZ::WFMASB_CCPseudo ? SystemZ::WFMASB
+        : 0;
+    if (TargetOpcode) {
+      // PeepholeOptimizer does not fold loads across basic blocks, but doing
+      // so seems beneficial, so handle that case here:
+      bool Folded = false;
+      for (unsigned Op = 1; Op <= 2; ++Op) {
+        Register Reg = MI.getOperand(Op).getReg();
+        if (MachineInstr *DefMI = MRI->getVRegDef(Reg))
+          if (TII->optimizeLoadInstr(MI, MRI, Reg, DefMI)) {
+            MI.eraseFromParent();
+            DefMI->eraseFromParent();
+            MRI->markUsesInDebugValueAsUndef(Reg);
+            Folded = true;
+            break;
+          }
+      }
+      if (!Folded) {
+        MI.setDesc(TII->get(TargetOpcode));
+        int CCIdx = MI.findRegisterDefOperandIdx(SystemZ::CC);
+        MI.removeOperand(CCIdx);
+      }
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+bool SystemZFinalizeReassociation::runOnMachineFunction(MachineFunction &F) {
+  TII = F.getSubtarget<SystemZSubtarget>().getInstrInfo();
+  MRI = &F.getRegInfo();
+
+  bool Modified = false;
+  for (auto &MBB : F)
+    Modified |= visitMBB(MBB);
+
+  return Modified;
+}
Index: llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -345,6 +345,12 @@
   // Try to expand a boolean SELECT_CCMASK using an IPM sequence.
   SDValue expandSelectBoolean(SDNode *Node);
 
+  bool doReassociation(SDNode *N) const {
+    return N->getFlags().hasAllowReassociation() &&
+           N->getFlags().hasNoSignedZeros() &&
+           Subtarget->hasVector();
+  }
+
 public:
   static char ID;
Index: llvm/lib/Target/SystemZ/SystemZInstrFP.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -439,8 +439,10 @@
     def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64, FP64>;
     def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>;
   }
-  defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, load, 4>;
-  defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, load, 8>;
+  defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, z_any_fadd_noreassoc, FP32,
+                                load, 4>;
+  defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, z_any_fadd_noreassoc, FP64,
+                                load, 8>;
 }
 
 // Subtraction.
@@ -450,8 +452,10 @@
   def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64, FP64>;
   def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>;
 
-  defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, any_fsub, FP32, load, 4>;
-  defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, any_fsub, FP64, load, 8>;
+  defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, z_any_fsub_noreassoc, FP32,
+                                load, 4>;
+  defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, z_any_fsub_noreassoc, FP64,
+                                load, 8>;
 }
 
 // Multiplication.
@@ -461,8 +465,10 @@
     def MDBR : BinaryRRE<"mdbr", 0xB31C, any_fmul, FP64, FP64>;
     def MXBR : BinaryRRE<"mxbr", 0xB34C, any_fmul, FP128, FP128>;
   }
-  defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, load, 4>;
-  defm MDB  : BinaryRXEAndPseudo<"mdb", 0xED1C, any_fmul, FP64, load, 8>;
+  defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, z_any_fmul_noreassoc, FP32,
+                                 load, 4>;
+  defm MDB  : BinaryRXEAndPseudo<"mdb", 0xED1C, z_any_fmul_noreassoc, FP64,
+                                 load, 8>;
 }
 
 // f64 multiplication of two FP32 registers.
@@ -504,8 +510,10 @@
   def MAEBR : TernaryRRD<"maebr", 0xB30E, z_any_fma, FP32, FP32>;
   def MADBR : TernaryRRD<"madbr", 0xB31E, z_any_fma, FP64, FP64>;
 
-  defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, load, 4>;
-  defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, load, 8>;
+  defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma_noreassoc, FP32,
+                                  FP32, load, 4>;
+  defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma_noreassoc, FP64,
+                                  FP64, load, 8>;
 }
 
 // Fused multiply-subtract.
Index: llvm/lib/Target/SystemZ/SystemZInstrFormats.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5388,3 +5388,33 @@
                  [(set GR64:$end, (operator GR64:$start1, GR64:$start2,
                                             GR32:$char))]>;
 }
+
+multiclass BinaryVRRcAndCCPseudo<string mnemonic, bits<16> opcode,
+                                 SDPatternOperator operator,
+                                 SDPatternOperator reassoc_operator,
+                                 TypedReg tr1, TypedReg tr2, bits<4> type = 0,
+                                 bits<4> m5 = 0, bits<4> m6 = 0,
+                                 string fp_mnemonic = ""> {
+  def "" : BinaryVRRc<mnemonic, opcode, operator, tr1, tr2, type, m5, m6,
+                      fp_mnemonic>;
+  let Defs = [CC], AddedComplexity = 1 in // Win over "".
+    def _CCPseudo : Pseudo<(outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
+                           [(set (tr1.vt tr1.op:$V1),
+                                 (reassoc_operator (tr2.vt tr2.op:$V2),
+                                                   (tr2.vt tr2.op:$V3)))]>;
+}
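+
+// For example, "defm WFADB : BinaryVRRcAndCCPseudo<...>" (SystemZInstrVector.td)
+// creates both the real WFADB instruction and a WFADB_CCPseudo that matches
+// only the reassociable fadd fragment (z_fadd_reassoc) and pretends to
+// clobber CC.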
+
+multiclass TernaryVRReAndCCPseudo<string mnemonic, bits<16> opcode,
+                                  SDPatternOperator operator,
+                                  SDPatternOperator reassoc_operator,
+                                  TypedReg tr1, TypedReg tr2, bits<4> m5 = 0,
+                                  bits<4> type = 0, string fp_mnemonic = ""> {
+  def "" : TernaryVRRe<mnemonic, opcode, operator, tr1, tr2, m5, type,
+                       fp_mnemonic>;
+  let Defs = [CC], AddedComplexity = 1 in // Win over "".
+    def _CCPseudo : Pseudo<(outs tr1.op:$V1),
+                           (ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
+                           [(set (tr1.vt tr1.op:$V1),
+                                 (reassoc_operator (tr2.vt tr2.op:$V2),
+                                                   (tr2.vt tr2.op:$V3),
+                                                   (tr1.vt tr1.op:$V4)))]>;
+}
Index: llvm/lib/Target/SystemZ/SystemZInstrInfo.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -243,8 +243,15 @@
                     const DebugLoc &DL, Register DstReg,
                     ArrayRef<MachineOperand> Cond, Register TrueReg,
                     Register FalseReg) const override;
+  void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
+                      MachineInstr::MIFlag Flag) const;
+  MachineInstr *optimizeLoadInstr(MachineInstr &MI,
+                                  MachineRegisterInfo *MRI,
+                                  Register &FoldAsLoadDefReg,
+                                  MachineInstr *&DefMI) const override;
   bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
                      MachineRegisterInfo *MRI) const override;
+
   bool isPredicable(const MachineInstr &MI) const override;
   bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
                            unsigned ExtraPredCycles,
@@ -274,6 +281,37 @@
                        Register VReg) const override;
   MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
                                       LiveIntervals *LIS) const override;
+
+  bool useMachineCombiner() const override { return true; }
+  bool IsReassociableFMA(const MachineInstr *MI) const;
+  bool IsReassociableAdd(const MachineInstr *MI) const;
+  bool getFMAPatterns(MachineInstr &Root,
+                      SmallVectorImpl<MachineCombinerPattern> &P,
+                      bool DoRegPressureReduce) const;
+  bool getMachineCombinerPatterns(MachineInstr &Root,
+                                  SmallVectorImpl<MachineCombinerPattern> &P,
+                                  bool DoRegPressureReduce) const override;
+  void
+  finalizeInsInstrs(MachineInstr &Root, MachineCombinerPattern &P,
+                    SmallVectorImpl<MachineInstr *> &InsInstrs) const override;
+  bool isAssociativeAndCommutative(const MachineInstr &Inst,
+                                   bool Invert) const override;
+  std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
+  void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern Pattern,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const override;
+  void reassociateFMA(
+      MachineInstr &Root, MachineCombinerPattern Pattern,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;
+  bool accumulateInstrSeqToRootLatency(MachineInstr &Root) const override;
+  int getExtendResourceLenLimit() const override { return 0; } // EXPERIMENTAL
+  void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
+                             MachineInstr &NewMI1,
+                             MachineInstr &NewMI2) const override;
+
   MachineInstr *
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
Index: llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/LiveVariables.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -610,6 +611,98 @@
       .addImm(CCValid).addImm(CCMask);
 }
 
+static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
+  if (OldMI->registerDefIsDead(SystemZ::CC)) {
+    MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
+    if (CCDef != nullptr)
+      CCDef->setIsDead(true);
+  }
+}
+
+void SystemZInstrInfo::transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
+                                      MachineInstr::MIFlag Flag) const {
+  if (OldMI->getFlag(Flag))
+    NewMI->setFlag(Flag);
+}
+
+MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
+                                                  MachineRegisterInfo *MRI,
+                                                  Register &FoldAsLoadDefReg,
+                                                  MachineInstr *&DefMI) const {
+  const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+
+  // Check whether we can move DefMI here.
+  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+  assert(DefMI);
+  bool SawStore = false;
+  if (!DefMI->isSafeToMove(nullptr, SawStore))
+    return nullptr;
+
+  // For reassociable FP operations, any loads have been purposefully left
+  // unfolded so that MachineCombiner can do its work on reg/reg
+  // opcodes. After that, fold as many loads as possible here.
+  unsigned LoadOpcode = 0;
+  unsigned RegMemOpcode = 0;
+  const TargetRegisterClass *FPRC = nullptr;
+  RegMemOpcode = MI.getOpcode() == SystemZ::WFADB_CCPseudo    ? SystemZ::ADB
+                 : MI.getOpcode() == SystemZ::WFSDB_CCPseudo  ? SystemZ::SDB
+                 : MI.getOpcode() == SystemZ::WFMDB_CCPseudo  ? SystemZ::MDB
+                 : MI.getOpcode() == SystemZ::WFMADB_CCPseudo ? SystemZ::MADB
+                 : 0;
+  if (RegMemOpcode) {
+    LoadOpcode = SystemZ::VL64;
+    FPRC = &SystemZ::FP64BitRegClass;
+  } else {
+    RegMemOpcode = MI.getOpcode() == SystemZ::WFASB_CCPseudo    ? SystemZ::AEB
+                   : MI.getOpcode() == SystemZ::WFSSB_CCPseudo  ? SystemZ::SEB
+                   : MI.getOpcode() == SystemZ::WFMSB_CCPseudo  ? SystemZ::MEEB
+                   : MI.getOpcode() == SystemZ::WFMASB_CCPseudo ? SystemZ::MAEB
+                   : 0;
+    if (RegMemOpcode) {
+      LoadOpcode = SystemZ::VL32;
+      FPRC = &SystemZ::FP32BitRegClass;
+    }
+  }
+  if (!RegMemOpcode)
+    return nullptr;
+
+  if (DefMI->getOpcode() == LoadOpcode &&
+      MRI->hasOneNonDBGUse(FoldAsLoadDefReg)) {
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand LHS = MI.getOperand(1);
+    MachineOperand RHS = MI.getOperand(2);
+    bool MemInRHS = RHS.getReg() == FoldAsLoadDefReg;
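+    // Subtraction is not commutable, so the load can only be folded when it
+    // is the RHS operand.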
+    if (!MemInRHS &&
+        (RegMemOpcode == SystemZ::SDB || RegMemOpcode == SystemZ::SEB))
+      return nullptr;
+    bool IsTernary =
+        (RegMemOpcode == SystemZ::MADB || RegMemOpcode == SystemZ::MAEB);
+    MachineOperand *AccMO = IsTernary ? &MI.getOperand(3) : nullptr;
+    MachineOperand &RegMO = MemInRHS ? LHS : RHS;
+    MachineInstrBuilder MIB =
+        BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(RegMemOpcode),
+                DstReg);
+    if (IsTernary)
+      MIB.add(*AccMO);
+    MIB.add(RegMO)
+        .add(DefMI->getOperand(1))
+        .add(DefMI->getOperand(2))
+        .add(DefMI->getOperand(3))
+        .addMemOperand(*DefMI->memoperands_begin());
+    transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
+    MIB->addRegisterDead(SystemZ::CC, TRI);
+    if (MIB->getOperand(2 + IsTernary).isReg())
+      MIB->getOperand(2 + IsTernary).setIsKill(false);
+    MIB->getOperand(4 + IsTernary).setIsKill(false);
+    MRI->setRegClass(RegMO.getReg(), FPRC);
+    MRI->setRegClass(DstReg, FPRC);
+    if (IsTernary)
+      MRI->setRegClass(AccMO->getReg(), FPRC);
+    return MIB;
+  }
+
+  return nullptr;
+}
+
 bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                      Register Reg,
                                      MachineRegisterInfo *MRI) const {
@@ -937,20 +1030,6 @@
   }
 }
 
-static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
-  if (OldMI->registerDefIsDead(SystemZ::CC)) {
-    MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
-    if (CCDef != nullptr)
-      CCDef->setIsDead(true);
-  }
-}
-
-static void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
-                           MachineInstr::MIFlag Flag) {
-  if (OldMI->getFlag(Flag))
-    NewMI->setFlag(Flag);
-}
-
 MachineInstr *
 SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
                                         LiveIntervals *LIS) const {
@@ -1003,6 +1082,585 @@
   return nullptr;
 }
 
+static bool hasReassocFlags(const MachineInstr *MI) {
+  return (MI->getFlag(MachineInstr::MIFlag::FmReassoc) &&
+          MI->getFlag(MachineInstr::MIFlag::FmNsz));
+}
+
+bool SystemZInstrInfo::IsReassociableFMA(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case SystemZ::VFMADB:
+  case SystemZ::VFMASB:
+  case SystemZ::WFMAXB:
+    return hasReassocFlags(MI);
+  case SystemZ::WFMADB_CCPseudo:
+  case SystemZ::WFMASB_CCPseudo:
+    return hasReassocFlags(MI) &&
+           MI->findRegisterDefOperandIdx(SystemZ::CC, true /*isDead*/) != -1;
+  default:
+    break;
+  }
+  return false;
+}
+
+bool SystemZInstrInfo::IsReassociableAdd(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case SystemZ::VFADB:
+  case SystemZ::VFASB:
+  case SystemZ::WFAXB:
+    return hasReassocFlags(MI);
+  case SystemZ::WFADB_CCPseudo:
+  case SystemZ::WFASB_CCPseudo:
+    return hasReassocFlags(MI) &&
+           MI->findRegisterDefOperandIdx(SystemZ::CC, true /*isDead*/) != -1;
+  default:
+    break;
+  }
+  return false;
+}
+
+// EXPERIMENTAL
+static cl::opt<bool> FMA1ADD("fma1add", cl::init(false));
+static cl::opt<bool> FMA2ADD("fma2add", cl::init(false));
+static cl::opt<bool> FMA2("fma2", cl::init(false));
+static cl::opt<bool> FMA3("fma3", cl::init(false));
+static cl::opt<bool> FMA3Ch("fma3-ch", cl::init(false));
+static cl::opt<bool> FMA4("fma4", cl::init(false));
+static cl::opt<bool> FMA4Ch("fma4-ch", cl::init(false));
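+
+// An FMA chain is followed "upwards" through the accumulator operands below:
+// the root FMA's accumulator may be defined by another FMA, and so on,
+// possibly ending in a plain reassociable add (TopAdd).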
+bool SystemZInstrInfo::getFMAPatterns(
+    MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
+    bool DoRegPressureReduce) const {
+  assert(Patterns.empty());
+  MachineBasicBlock *MBB = Root.getParent();
+  const MachineRegisterInfo *MRI = &MBB->getParent()->getRegInfo();
+
+  if (!IsReassociableFMA(&Root))
+    return false;
+  const TargetRegisterClass *RC =
+      MRI->getRegClass(Root.getOperand(0).getReg());
+
+  auto AllOpsOK = [&MRI, &RC](const MachineInstr &Instr) {
+    for (const auto &MO : Instr.explicit_operands())
+      if (!(MO.isReg() && MO.getReg().isVirtual() && !MO.getSubReg()))
+        return false;
+    if (MRI->getRegClass(Instr.getOperand(0).getReg()) != RC)
+      return false;
+    return true;
+  };
+
+  MachineInstr *TopAdd = nullptr;
+  std::vector<MachineInstr *> UpChain;
+  UpChain.push_back(&Root);
+  Register Acc = Root.getOperand(3).getReg();
+  while (MachineInstr *Prev = MRI->getUniqueVRegDef(Acc)) {
+    if (Prev->getParent() != MBB || !MRI->hasOneNonDBGUse(Acc))
+      break;
+    if (IsReassociableFMA(Prev)) {
+      UpChain.push_back(Prev);
+      Acc = Prev->getOperand(3).getReg();
+      continue;
+    }
+    if (IsReassociableAdd(Prev))
+      TopAdd = Prev;
+    break;
+  }
+  for (auto *Node : UpChain)
+    assert(AllOpsOK(*Node));
+  assert(!TopAdd || AllOpsOK(*TopAdd));
+
+  // Try to combine e.g. FMA2 and FMA4 intelligently..?
+  // std::vector<MachineInstr *> DownChain;
+  // MachineInstr *Curr = &Root;
+  // while (true) {
+  //   if (Curr != &Root)
+  //     DownChain.push_back(Curr);
+  //   Register Reg = Curr->getOperand(0).getReg();
+  //   if (!MRI->hasOneNonDBGUse(Reg))
+  //     break;
+  //   Curr = &*MRI->use_instr_nodbg_begin(Reg);
+  //   unsigned UseIdx = Curr->findRegisterUseOperandIdx(Reg);
+  //   if (Curr->getParent() != MBB ||
+  //       !(IsReassociableFMA(Curr) && UseIdx == 3))
+  //     break;
+  // }
+  // for (auto *Node : DownChain)
+  //   assert(AllOpsOK(*Node));
+
+  // LLVM_DEBUG(dbgs() << "Analyzing " << Root;);
+  // LLVM_DEBUG(dbgs() << "Chain upwards: \n";);
+  // for (auto *MI : make_range(UpChain.rbegin(), UpChain.rend()))
+  //   LLVM_DEBUG(dbgs() << *MI;);
+  // LLVM_DEBUG(dbgs() << "Chain downwards: \n";);
+  // for (auto *Node : DownChain)
+  //   LLVM_DEBUG(dbgs() << *Node;);
+  // LLVM_DEBUG(dbgs() << "\n";);
+
+  if (UpChain.size() >= 4) {
+    if (FMA4) {
+      // First try getting the multiplications off the critical path.
+      // TODO: Possible to reassociate resulting adds of multiple FMA2s
+      // instead?
+      Patterns.push_back(MachineCombinerPattern::REASSOC_FMA4);
+      LLVM_DEBUG(dbgs() << "add pattern REASSOC_FMA4\n");
+    }
+    if (FMA4Ch) {
+      // Try chaining the incoming accumulator: only two multiplies in
+      // parallel.
+      Patterns.push_back(MachineCombinerPattern::REASSOC_FMA4_Ch);
+      LLVM_DEBUG(dbgs() << "add pattern REASSOC_FMA4_Ch\n");
+    }
+  }
+  if (UpChain.size() >= 3) {
+    if (FMA3) {
+      // First try getting the multiplications off the critical path.
+      Patterns.push_back(MachineCombinerPattern::REASSOC_FMA3);
+      LLVM_DEBUG(dbgs() << "add pattern REASSOC_FMA3\n");
+    }
+    if (FMA3Ch) {
+      // Try chaining the incoming accumulator: only two multiplies in
+      // parallel.
+      Patterns.push_back(MachineCombinerPattern::REASSOC_FMA3_Ch);
+      LLVM_DEBUG(dbgs() << "add pattern REASSOC_FMA3_Ch\n");
+    }
+  }
+  if (UpChain.size() >= 2 && FMA2) {
+    // First try getting the multiplications off the critical path.
+    Patterns.push_back(MachineCombinerPattern::REASSOC_FMA2);
+    LLVM_DEBUG(dbgs() << "add pattern REASSOC_FMA2\n");
+  }
+  if (UpChain.size() == 2 && TopAdd && FMA2ADD) {
+    // Try reassociating by pushing the FMAs up into the Add.
+    Patterns.push_back(MachineCombinerPattern::REASSOC_FMA2Add);
+    LLVM_DEBUG(dbgs() << "add pattern REASSOC_FMA2Add\n");
+  }
+  if (UpChain.size() == 1 && TopAdd && FMA1ADD) {
+    // The latency of the FMA could potentially be hidden above the add: Try
+    // both sides of the add and let MachineCombiner decide on profitability.
+    // TODO: This could be done with other patterns as well, although it may
+    // be too much work. Perhaps if the BlockTrace was passed here,
+    // reassociateFMA() could make the best of it in one try. Alternatively
+    // pre-sort the chains (SystemZReassocAdditions).
+    Patterns.push_back(MachineCombinerPattern::REASSOC_FMA1Add_L);
+    LLVM_DEBUG(dbgs() << "add pattern REASSOC_FMA1Add_L\n");
+    Patterns.push_back(MachineCombinerPattern::REASSOC_FMA1Add_R);
+    LLVM_DEBUG(dbgs() << "add pattern REASSOC_FMA1Add_R\n");
+  }
+
+  return Patterns.size() > 0;
+}
+
+bool SystemZInstrInfo::getMachineCombinerPatterns(
+    MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
+    bool DoRegPressureReduce) const {
+
+  if (getFMAPatterns(Root, Patterns, DoRegPressureReduce))
+    return true;
+
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+                                                     DoRegPressureReduce);
+}
+
+void SystemZInstrInfo::finalizeInsInstrs(
+    MachineInstr &Root, MachineCombinerPattern &P,
+    SmallVectorImpl<MachineInstr *> &InsInstrs) const {
+  const TargetRegisterInfo *TRI =
+      Root.getParent()->getParent()->getSubtarget().getRegisterInfo();
+  for (auto *Inst : InsInstrs) {
+    switch (Inst->getOpcode()) {
+    case SystemZ::WFADB_CCPseudo:
+    case SystemZ::WFASB_CCPseudo:
+    case SystemZ::WFMDB_CCPseudo:
+    case SystemZ::WFMSB_CCPseudo:
+    case SystemZ::WFSDB_CCPseudo:
+    case SystemZ::WFSSB_CCPseudo:
+    case SystemZ::WFMADB_CCPseudo:
+    case SystemZ::WFMASB_CCPseudo:
+      Inst->addRegisterDead(SystemZ::CC, TRI);
+      break;
+    default: break;
+    }
+  }
+}
+
+bool SystemZInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
+                                                   bool Invert) const {
+  unsigned Opc = Inst.getOpcode();
+  if (Invert) {
+    auto InverseOpcode = getInverseOpcode(Opc);
+    if (!InverseOpcode)
+      return false;
+    Opc = *InverseOpcode;
+  }
+
+  switch (Opc) {
+  default:
+    break;
+  case SystemZ::VFADB:
+  case SystemZ::VFASB:
+  case SystemZ::WFAXB:
+  case SystemZ::WFADB_CCPseudo:
+  case SystemZ::WFASB_CCPseudo:
+  case SystemZ::VFMDB:
+  case SystemZ::VFMSB:
+  case SystemZ::WFMXB:
+  case SystemZ::WFMDB_CCPseudo:
+  case SystemZ::WFMSB_CCPseudo:
+    return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+           Inst.getFlag(MachineInstr::MIFlag::FmNsz);
+  }
+
+  return false;
+}
+
+std::optional<unsigned>
+SystemZInstrInfo::getInverseOpcode(unsigned Opcode) const {
+  switch (Opcode) {
+  case SystemZ::VFADB:          return SystemZ::VFSDB;
+  case SystemZ::VFASB:          return SystemZ::VFSSB;
+  case SystemZ::WFAXB:          return SystemZ::WFSXB;
+  case SystemZ::WFADB_CCPseudo: return SystemZ::WFSDB_CCPseudo;
+  case SystemZ::WFASB_CCPseudo: return SystemZ::WFSSB_CCPseudo;
+  case SystemZ::VFSDB:          return SystemZ::VFADB;
+  case SystemZ::VFSSB:          return SystemZ::VFASB;
+  case SystemZ::WFSXB:          return SystemZ::WFAXB;
+  case SystemZ::WFSDB_CCPseudo: return SystemZ::WFADB_CCPseudo;
+  case SystemZ::WFSSB_CCPseudo: return SystemZ::WFASB_CCPseudo;
+  default:                      return std::nullopt;
+  }
+}
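+
+// Source-level intuition for the simplest case (illustrative only): with
+// reassociation allowed,
+//   s = a + x0*y0 + x1*y1;   // naively two FMAs chained through 'a'
+// REASSOC_FMA2 computes p = x0*y0 as a plain multiply, folds x1*y1 into it
+// with a single FMA, and adds 'a' last, so the multiplies no longer wait
+// for 'a'.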
+void SystemZInstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root, MachineCombinerPattern Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+  switch (Pattern) {
+  case MachineCombinerPattern::REASSOC_FMA1Add_L:
+  case MachineCombinerPattern::REASSOC_FMA1Add_R:
+  case MachineCombinerPattern::REASSOC_FMA2Add:
+  case MachineCombinerPattern::REASSOC_FMA2:
+  case MachineCombinerPattern::REASSOC_FMA3:
+  case MachineCombinerPattern::REASSOC_FMA3_Ch:
+  case MachineCombinerPattern::REASSOC_FMA4:
+  case MachineCombinerPattern::REASSOC_FMA4_Ch:
+    reassociateFMA(Root, Pattern, InsInstrs, DelInstrs, InstrIdxForVirtReg);
+    break;
+  default:
+    // Reassociate default patterns.
+    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+                                                DelInstrs, InstrIdxForVirtReg);
+    break;
+  }
+}
+
+static void getSplitFMAOpcodes(unsigned FMAOpc, unsigned &AddOpc,
+                               unsigned &MulOpc) {
+  switch (FMAOpc) {
+  case SystemZ::VFMADB: AddOpc = SystemZ::VFADB; MulOpc = SystemZ::VFMDB; break;
+  case SystemZ::VFMASB: AddOpc = SystemZ::VFASB; MulOpc = SystemZ::VFMSB; break;
+  case SystemZ::WFMAXB: AddOpc = SystemZ::WFAXB; MulOpc = SystemZ::WFMXB; break;
+  case SystemZ::WFMADB_CCPseudo:
+    AddOpc = SystemZ::WFADB_CCPseudo; MulOpc = SystemZ::WFMDB_CCPseudo; break;
+  case SystemZ::WFMASB_CCPseudo:
+    AddOpc = SystemZ::WFASB_CCPseudo; MulOpc = SystemZ::WFMSB_CCPseudo; break;
+  default:
+    llvm_unreachable("Expected FMA opcode.");
+  }
+}
+
+void SystemZInstrInfo::reassociateFMA(
+    MachineInstr &Root, MachineCombinerPattern Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+  MachineFunction *MF = Root.getMF();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+  assert(IsReassociableFMA(&Root));
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, this, TRI);
+  Register DstReg = Root.getOperand(0).getReg();
+  unsigned FMAOpc = Root.getOpcode();
+  unsigned AddOpc, MulOpc;
+  getSplitFMAOpcodes(FMAOpc, AddOpc, MulOpc);
+  std::vector<MachineInstr *> Chain;
+  Chain.push_back(&Root);
+
+#ifndef NDEBUG
+  auto IsAllFMA = [&Chain, &FMAOpc]() {
+    for (auto *MI : Chain)
+      if (MI->getOpcode() != FMAOpc)
+        return false;
+    return true;
+  };
+#endif
+
+  uint16_t IntersectedFlags = Root.getFlags();
+  auto getIntersectedFlags = [&]() {
+    for (auto *MI : Chain)
+      IntersectedFlags &= MI->getFlags();
+  };
+
+  auto createNewVReg = [&](unsigned NewInsIdx) -> Register {
+    Register NewReg = MRI.createVirtualRegister(RC);
+    InstrIdxForVirtReg.insert(std::make_pair(NewReg, NewInsIdx));
+    return NewReg;
+  };
+
+  auto finalizeNewMIs = [&](ArrayRef<MachineInstr *> NewMIs) {
+    for (auto *MI : NewMIs) {
+      MI->setFlags(IntersectedFlags);
+      MI->clearFlag(MachineInstr::MIFlag::NoSWrap);
+      MI->clearFlag(MachineInstr::MIFlag::NoUWrap);
+      MI->clearFlag(MachineInstr::MIFlag::IsExact);
+      MI->addRegisterDead(SystemZ::CC, TRI);
+      InsInstrs.push_back(MI);
+    }
+  };
+
+  switch (Pattern) {
+  case MachineCombinerPattern::REASSOC_FMA4_Ch: {
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    assert(IsAllFMA());
+    getIntersectedFlags();
+    Register NewVRA = createNewVReg(0);
+    Register NewVRB = createNewVReg(1);
+    Register NewVRC = createNewVReg(2);
+    Register NewVRD = createNewVReg(3);
+    MachineInstr *MINewA =
+        BuildMI(*MF, Chain[3]->getDebugLoc(), get(FMAOpc), NewVRA)
+            .add(Chain[3]->getOperand(1))
+            .add(Chain[3]->getOperand(2))
+            .add(Chain[3]->getOperand(3));
+    MachineInstr *MINewB =
+        BuildMI(*MF, Chain[2]->getDebugLoc(), get(MulOpc), NewVRB)
+            .add(Chain[2]->getOperand(1))
+            .add(Chain[2]->getOperand(2));
+    MachineInstr *MINewC =
+        BuildMI(*MF, Chain[1]->getDebugLoc(), get(FMAOpc), NewVRC)
+            .add(Chain[1]->getOperand(1))
+            .add(Chain[1]->getOperand(2))
+            .addReg(NewVRA);
+    MachineInstr *MINewD =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRD)
+            .add(Chain[0]->getOperand(1))
+            .add(Chain[0]->getOperand(2))
+            .addReg(NewVRB);
+    MachineInstr *MINewE =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(AddOpc), DstReg)
+            .addReg(NewVRC)
+            .addReg(NewVRD);
+    finalizeNewMIs({MINewA, MINewB, MINewC, MINewD, MINewE});
+    break;
+  }
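+  // REASSOC_FMA4: with p0..p3 denoting the products (p0 in Root, p3 the
+  // oldest), the serial acc+p3+p2+p1+p0 chain becomes
+  // acc + ((p3 + p1) + (p2 + p0)): two plain multiplies feed two FMAs, and
+  // the incoming accumulator is added last.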
+  case MachineCombinerPattern::REASSOC_FMA4: {
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    assert(IsAllFMA());
+    getIntersectedFlags();
+    Register NewVRA = createNewVReg(0);
+    Register NewVRB = createNewVReg(1);
+    Register NewVRC = createNewVReg(2);
+    Register NewVRD = createNewVReg(3);
+    Register NewVRE = createNewVReg(4);
+    MachineInstr *MINewA =
+        BuildMI(*MF, Chain[3]->getDebugLoc(), get(MulOpc), NewVRA)
+            .add(Chain[3]->getOperand(1))
+            .add(Chain[3]->getOperand(2));
+    MachineInstr *MINewB =
+        BuildMI(*MF, Chain[2]->getDebugLoc(), get(MulOpc), NewVRB)
+            .add(Chain[2]->getOperand(1))
+            .add(Chain[2]->getOperand(2));
+    MachineInstr *MINewC =
+        BuildMI(*MF, Chain[1]->getDebugLoc(), get(FMAOpc), NewVRC)
+            .add(Chain[1]->getOperand(1))
+            .add(Chain[1]->getOperand(2))
+            .addReg(NewVRA);
+    MachineInstr *MINewD =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRD)
+            .add(Chain[0]->getOperand(1))
+            .add(Chain[0]->getOperand(2))
+            .addReg(NewVRB);
+    MachineInstr *MINewE =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(AddOpc), NewVRE)
+            .addReg(NewVRC)
+            .addReg(NewVRD);
+    MachineInstr *MINewF =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(AddOpc), DstReg)
+            .add(Chain[3]->getOperand(3))
+            .addReg(NewVRE);
+    finalizeNewMIs({MINewA, MINewB, MINewC, MINewD, MINewE, MINewF});
+    break;
+  }
+  case MachineCombinerPattern::REASSOC_FMA3_Ch: {
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    assert(IsAllFMA());
+    getIntersectedFlags();
+    Register NewVRA = createNewVReg(0);
+    Register NewVRB = createNewVReg(1);
+    Register NewVRC = createNewVReg(2);
+    MachineInstr *MINewA =
+        BuildMI(*MF, Chain[2]->getDebugLoc(), get(MulOpc), NewVRA)
+            .add(Chain[2]->getOperand(1))
+            .add(Chain[2]->getOperand(2));
+    MachineInstr *MINewB =
+        BuildMI(*MF, Chain[1]->getDebugLoc(), get(FMAOpc), NewVRB)
+            .add(Chain[1]->getOperand(1))
+            .add(Chain[1]->getOperand(2))
+            .add(Chain[2]->getOperand(3));
+    MachineInstr *MINewC =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRC)
+            .add(Chain[0]->getOperand(1))
+            .add(Chain[0]->getOperand(2))
+            .addReg(NewVRA);
+    MachineInstr *MINewD =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(AddOpc), DstReg)
+            .addReg(NewVRB)
+            .addReg(NewVRC);
+    finalizeNewMIs({MINewA, MINewB, MINewC, MINewD});
+    break;
+  }
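+  // REASSOC_FMA3: likewise acc+p2+p1+p0 becomes acc + (p2 + (p1 + p0)),
+  // with p2 and p1 computed as independent multiplies.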
+  case MachineCombinerPattern::REASSOC_FMA3: {
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    assert(IsAllFMA());
+    getIntersectedFlags();
+    Register NewVRA = createNewVReg(0);
+    Register NewVRB = createNewVReg(1);
+    Register NewVRC = createNewVReg(2);
+    Register NewVRD = createNewVReg(3);
+    MachineInstr *MINewA =
+        BuildMI(*MF, Chain[2]->getDebugLoc(), get(MulOpc), NewVRA)
+            .add(Chain[2]->getOperand(1))
+            .add(Chain[2]->getOperand(2));
+    MachineInstr *MINewB =
+        BuildMI(*MF, Chain[1]->getDebugLoc(), get(MulOpc), NewVRB)
+            .add(Chain[1]->getOperand(1))
+            .add(Chain[1]->getOperand(2));
+    MachineInstr *MINewC =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRC)
+            .add(Chain[0]->getOperand(1))
+            .add(Chain[0]->getOperand(2))
+            .addReg(NewVRB);
+    MachineInstr *MINewD =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(AddOpc), NewVRD)
+            .addReg(NewVRA)
+            .addReg(NewVRC);
+    MachineInstr *MINewE =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(AddOpc), DstReg)
+            .add(Chain[2]->getOperand(3))
+            .addReg(NewVRD);
+    finalizeNewMIs({MINewA, MINewB, MINewC, MINewD, MINewE});
+    break;
+  }
+  case MachineCombinerPattern::REASSOC_FMA2Add: {
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    assert(IsAllFMA());
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    assert(Chain.back()->getOpcode() == AddOpc && "Expected matching Add");
+    getIntersectedFlags();
+    Register NewVRA = createNewVReg(0);
+    Register NewVRB = createNewVReg(1);
+    MachineInstr *MINewA =
+        BuildMI(*MF, Chain[1]->getDebugLoc(), get(FMAOpc), NewVRA)
+            .add(Chain[1]->getOperand(1))
+            .add(Chain[1]->getOperand(2))
+            .add(Chain[2]->getOperand(1));
+    MachineInstr *MINewB =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRB)
+            .add(Chain[0]->getOperand(1))
+            .add(Chain[0]->getOperand(2))
+            .add(Chain[2]->getOperand(2));
+    MachineInstr *MINewC =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(AddOpc), DstReg)
+            .addReg(NewVRA)
+            .addReg(NewVRB);
+    finalizeNewMIs({MINewA, MINewB, MINewC});
+    break;
+  }
+  case MachineCombinerPattern::REASSOC_FMA2: {
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    assert(IsAllFMA());
+    getIntersectedFlags();
+    Register NewVRA = createNewVReg(0);
+    Register NewVRB = createNewVReg(1);
+    MachineInstr *MINewA =
+        BuildMI(*MF, Chain[1]->getDebugLoc(), get(MulOpc), NewVRA)
+            .add(Chain[1]->getOperand(1))
+            .add(Chain[1]->getOperand(2));
+    MachineInstr *MINewB =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRB)
+            .add(Chain[0]->getOperand(1))
+            .add(Chain[0]->getOperand(2))
+            .addReg(NewVRA);
+    MachineInstr *MINewC =
+        BuildMI(*MF, Chain[1]->getDebugLoc(), get(AddOpc), DstReg)
+            .add(Chain[1]->getOperand(3))
+            .addReg(NewVRB);
+    finalizeNewMIs({MINewA, MINewB, MINewC});
+    break;
+  }
+  case MachineCombinerPattern::REASSOC_FMA1Add_L:
+  case MachineCombinerPattern::REASSOC_FMA1Add_R: {
+    assert(IsAllFMA());
+    Chain.push_back(MRI.getUniqueVRegDef(Chain.back()->getOperand(3).getReg()));
+    assert(Chain.back()->getOpcode() == AddOpc && "Expected matching Add");
+    getIntersectedFlags();
+    unsigned Op = Pattern == MachineCombinerPattern::REASSOC_FMA1Add_L ? 1 : 2;
+    unsigned OtherOp = Op == 1 ? 2 : 1;
+    Register NewVRA = createNewVReg(0);
+    MachineInstr *MINewA =
+        BuildMI(*MF, Chain[0]->getDebugLoc(), get(FMAOpc), NewVRA)
+            .add(Chain[0]->getOperand(1))
+            .add(Chain[0]->getOperand(2))
+            .add(Chain[1]->getOperand(Op));
+    MachineInstr *MINewB =
+        BuildMI(*MF, Chain[1]->getDebugLoc(), get(AddOpc), DstReg)
+            .addReg(NewVRA)
+            .add(Chain[1]->getOperand(OtherOp));
+    finalizeNewMIs({MINewA, MINewB});
+    break;
+  }
+
+  default:
+    llvm_unreachable("not recognized pattern!");
+  }
+
+  assert(!InsInstrs.empty() &&
+         "Insertion instructions set should not be empty!");
+
+  // Record old instructions for deletion.
+  for (auto *MI : make_range(Chain.rbegin(), Chain.rend()))
+    DelInstrs.push_back(MI);
+}
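+
+// Returning false makes MachineCombiner use only the latency of the new root
+// instruction, rather than the accumulated latency of the whole new sequence,
+// when comparing against the old critical path.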
+bool
+SystemZInstrInfo::accumulateInstrSeqToRootLatency(MachineInstr &Root) const {
+  // This doesn't make much sense for FMA patterns as they typically use an
+  // extra Add to do things in parallel.
+  if (IsReassociableFMA(&Root))
+    return false;
+
+  return true;
+}
+
+void SystemZInstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
+                                             MachineInstr &OldMI2,
+                                             MachineInstr &NewMI1,
+                                             MachineInstr &NewMI2) const {
+  // Propagate FP flags from the original instructions.
+  // But clear poison-generating flags because those may not be valid now.
+  uint16_t IntersectedFlags = OldMI1.getFlags() & OldMI2.getFlags();
+  NewMI1.setFlags(IntersectedFlags);
+  NewMI1.clearFlag(MachineInstr::MIFlag::NoSWrap);
+  NewMI1.clearFlag(MachineInstr::MIFlag::NoUWrap);
+  NewMI1.clearFlag(MachineInstr::MIFlag::IsExact);
+
+  NewMI2.setFlags(IntersectedFlags);
+  NewMI2.clearFlag(MachineInstr::MIFlag::NoSWrap);
+  NewMI2.clearFlag(MachineInstr::MIFlag::NoUWrap);
+  NewMI2.clearFlag(MachineInstr::MIFlag::IsExact);
+}
+
 MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, int FrameIndex,
Index: llvm/lib/Target/SystemZ/SystemZInstrVector.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -139,7 +139,7 @@
   // LEY and LDY offer full 20-bit displacement fields.  It's often better
   // to use those instructions rather than force a 20-bit displacement
   // into a GPR temporary.
-  let mayLoad = 1 in {
+  let mayLoad = 1, canFoldAsLoad = 1 in {
     def VL32 : UnaryAliasVRX<load, f32, bdxaddr12pair>;
     def VL64 : UnaryAliasVRX<load, f64, bdxaddr12pair>;
   }
@@ -1045,15 +1045,15 @@
 let Predicates = [FeatureVector] in {
   // Add.
   let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
-    def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
-    def VFADB : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
-    def WFADB : BinaryVRRc<"wfadb", 0xE7E3, any_fadd, v64db, v64db, 3, 8, 0,
-                           "adbr">;
+    def VFA    : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
+    def VFADB  : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
+    defm WFADB : BinaryVRRcAndCCPseudo<"wfadb", 0xE7E3, any_fadd,
+                                       z_fadd_reassoc, v64db, v64db, 3, 8, 0,
+                                       "adbr">;
     let Predicates = [FeatureVectorEnhancements1] in {
-      def VFASB : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
-      def WFASB : BinaryVRRc<"wfasb", 0xE7E3, any_fadd, v32sb, v32sb, 2, 8, 0,
-                             "aebr">;
-      def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>;
+      def VFASB  : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
+      defm WFASB : BinaryVRRcAndCCPseudo<"wfasb", 0xE7E3, any_fadd,
+                                         z_fadd_reassoc, v32sb, v32sb, 2, 8, 0,
+                                         "aebr">;
+      def WFAXB  : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>;
     }
   }
@@ -1258,29 +1258,29 @@
   // Multiply.
   let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
-    def VFM : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>;
-    def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, any_fmul, v128db, v128db, 3, 0>;
-    def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, any_fmul, v64db, v64db, 3, 8, 0,
-                           "mdbr">;
+    def VFM    : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>;
+    def VFMDB  : BinaryVRRc<"vfmdb", 0xE7E7, any_fmul, v128db, v128db, 3, 0>;
+    defm WFMDB : BinaryVRRcAndCCPseudo<"wfmdb", 0xE7E7, any_fmul,
+                                       z_fmul_reassoc, v64db, v64db, 3, 8, 0,
+                                       "mdbr">;
     let Predicates = [FeatureVectorEnhancements1] in {
-      def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, any_fmul, v128sb, v128sb, 2, 0>;
-      def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, any_fmul, v32sb, v32sb, 2, 8, 0,
-                             "meebr">;
-      def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, any_fmul, v128xb, v128xb, 4, 8>;
+      def VFMSB  : BinaryVRRc<"vfmsb", 0xE7E7, any_fmul, v128sb, v128sb, 2, 0>;
+      defm WFMSB : BinaryVRRcAndCCPseudo<"wfmsb", 0xE7E7, any_fmul,
+                                         z_fmul_reassoc, v32sb, v32sb, 2, 8, 0,
+                                         "meebr">;
+      def WFMXB  : BinaryVRRc<"wfmxb", 0xE7E7, any_fmul, v128xb, v128xb, 4, 8>;
     }
   }
 
   // Multiply and add.
   let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
-    def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>;
-    def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, any_fma, v128db, v128db, 0, 3>;
-    def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, any_fma, v64db, v64db, 8, 3,
-                             "madbr">;
+    def VFMA    : TernaryVRReFloatGeneric<"vfma", 0xE78F>;
+    def VFMADB  : TernaryVRRe<"vfmadb", 0xE78F, any_fma, v128db, v128db, 0, 3>;
+    defm WFMADB : TernaryVRReAndCCPseudo<"wfmadb", 0xE78F, any_fma,
+                                         z_fma_reassoc, v64db, v64db, 8, 3,
+                                         "madbr">;
     let Predicates = [FeatureVectorEnhancements1] in {
-      def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, any_fma, v128sb, v128sb, 0, 2>;
-      def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, any_fma, v32sb, v32sb, 8, 2,
-                               "maebr">;
-      def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, any_fma, v128xb, v128xb, 8, 4>;
+      def VFMASB  : TernaryVRRe<"vfmasb", 0xE78F, any_fma, v128sb, v128sb, 0, 2>;
+      defm WFMASB : TernaryVRReAndCCPseudo<"wfmasb", 0xE78F, any_fma,
+                                           z_fma_reassoc, v32sb, v32sb, 8, 2,
+                                           "maebr">;
+      def WFMAXB  : TernaryVRRe<"wfmaxb", 0xE78F, any_fma, v128xb, v128xb, 8, 4>;
     }
   }
@@ -1373,15 +1373,15 @@
 
   // Subtract.
   let Uses = [FPC], mayRaiseFPException = 1 in {
-    def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
-    def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>;
-    def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, any_fsub, v64db, v64db, 3, 8, 0,
-                           "sdbr">;
+    def VFS    : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
+    def VFSDB  : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>;
+    defm WFSDB : BinaryVRRcAndCCPseudo<"wfsdb", 0xE7E2, any_fsub,
+                                       z_fsub_reassoc, v64db, v64db, 3, 8, 0,
+                                       "sdbr">;
     let Predicates = [FeatureVectorEnhancements1] in {
-      def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>;
-      def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, any_fsub, v32sb, v32sb, 2, 8, 0,
-                             "sebr">;
-      def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>;
+      def VFSSB  : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>;
+      defm WFSSB : BinaryVRRcAndCCPseudo<"wfssb", 0xE7E2, any_fsub,
+                                         z_fsub_reassoc, v32sb, v32sb, 2, 8, 0,
+                                         "sebr">;
+      def WFSXB  : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>;
     }
   }
Index: llvm/lib/Target/SystemZ/SystemZOperators.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZOperators.td
+++ llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -700,6 +700,34 @@
 // Floating-point negative absolute.
 def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
 
+// Floating-point operations which are not marked as reassociable.
+def z_any_fadd_noreassoc : PatFrag<(ops node:$src1, node:$src2),
+                                   (any_fadd node:$src1, node:$src2),
+                                   [{ return !doReassociation(N); }]>;
+def z_any_fsub_noreassoc : PatFrag<(ops node:$src1, node:$src2),
+                                   (any_fsub node:$src1, node:$src2),
+                                   [{ return !doReassociation(N); }]>;
+def z_any_fmul_noreassoc : PatFrag<(ops node:$src1, node:$src2),
+                                   (any_fmul node:$src1, node:$src2),
+                                   [{ return !doReassociation(N); }]>;
+def z_any_fma_noreassoc : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                                  (any_fma node:$src2, node:$src3, node:$src1),
+                                  [{ return !doReassociation(N); }]>;
+
+// Floating-point operations which are reassociable.
+def z_fadd_reassoc : PatFrag<(ops node:$src1, node:$src2),
+                             (fadd node:$src1, node:$src2),
+                             [{ return doReassociation(N); }]>;
+def z_fsub_reassoc : PatFrag<(ops node:$src1, node:$src2),
+                             (fsub node:$src1, node:$src2),
+                             [{ return doReassociation(N); }]>;
+def z_fmul_reassoc : PatFrag<(ops node:$src1, node:$src2),
+                             (fmul node:$src1, node:$src2),
+                             [{ return doReassociation(N); }]>;
+def z_fma_reassoc : PatFrag<(ops node:$src1, node:$src2, node:$src3),
+                            (fma node:$src1, node:$src2, node:$src3),
+                            [{ return doReassociation(N); }]>;
+
 // Strict floating-point fragments.
 def z_any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
                           [(z_strict_fcmp node:$lhs, node:$rhs),
Index: llvm/lib/Target/SystemZ/SystemZReassocAdditions.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/SystemZ/SystemZReassocAdditions.cpp
@@ -0,0 +1,494 @@
+//===------ SystemZReassocAdditions.cpp - Reassociate FP additions -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass handles chains of FMA/Add instructions by reordering additions
+// to minimize stalls. For example:
+//
+//   A1 = FMA F(10), F(20), A0(5)   // Stall 20 cycles waiting for F(20).
+//   A2 = FMA F(5), F(5), A1
+//
+// =>
+//
+//   A1 = FMA F(5), F(5), A0(5)     // Reduced stall.
+//   A2 = FMA F(10), F(20), A1
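+//
+// (Notation: F(n) / A0(n) denote operands whose values become available in
+// cycle n.)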
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include <iomanip>
+#include <set>
+#include <sstream>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-reassoc-additions"
+
+namespace {
+
+class SystemZReassocAdditions : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZReassocAdditions()
+      : MachineFunctionPass(ID), TII(nullptr), MRI(nullptr) {
+    initializeSystemZReassocAdditionsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  TargetSchedModel TSchedModel;
+  MachineTraceMetrics *Traces = nullptr;
+  MachineTraceMetrics::Ensemble *TraceEnsemble = nullptr;
+
+  bool IsReassociableFMA(const MachineInstr *MI) const;
+  bool IsReassociableAdd(const MachineInstr *MI) const;
+  unsigned getOpReadyCycle(MachineInstr *MI, unsigned OpIdx);
+  bool reassociate(MachineInstr *MI, std::set<MachineInstr *> &Seen);
+  bool visitMBB(MachineBasicBlock *MBB);
+
+  const SystemZInstrInfo *TII;
+  MachineRegisterInfo *MRI;
+};
+
+char SystemZReassocAdditions::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(SystemZReassocAdditions, DEBUG_TYPE,
+                      "SystemZ Reassociate Additions Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(SystemZReassocAdditions, DEBUG_TYPE,
+                    "SystemZ Reassociate Additions Pass", false, false)
+
+FunctionPass *llvm::
+createSystemZReassocAdditionsPass(SystemZTargetMachine &TM) {
+  return new SystemZReassocAdditions();
+}
+
+void SystemZReassocAdditions::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  AU.addRequired<MachineTraceMetrics>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+static bool hasRequiredFlags(const MachineInstr *MI) {
+  return MI->getFlag(MachineInstr::MIFlag::FmContract);
+}
+
+bool SystemZReassocAdditions::IsReassociableFMA(const MachineInstr *MI) const {
+  return TII->IsReassociableFMA(MI) && hasRequiredFlags(MI);
+}
+
+bool SystemZReassocAdditions::IsReassociableAdd(const MachineInstr *MI) const {
+  return TII->IsReassociableAdd(MI) && hasRequiredFlags(MI);
+}
+
+unsigned SystemZReassocAdditions::getOpReadyCycle(MachineInstr *MI,
+                                                  unsigned OpIdx) {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineTraceMetrics::Trace BlockTrace = TraceEnsemble->getTrace(MBB);
+  if (MachineInstr *DefMI = MRI->getVRegDef(MI->getOperand(OpIdx).getReg())) {
+    unsigned DefDepth = BlockTrace.getInstrCycles(*DefMI).Depth;
+    return DefDepth + TSchedModel.computeInstrLatency(DefMI);
+  }
+  return 0;
+}
+}; + +struct ChainNode { + MachineInstr *MI; + unsigned ChainedOp; + bool IsFMA; + unsigned ReadyCycles[4]; // PRD => FMA max(LHS, RHS) + bool DoneOps[4]; // PRD => FMA (LHS && RHS) + bool Keep; + ChainNode(MachineInstr *MI, unsigned UseIdx, bool IsFMA) + : MI(MI), ChainedOp(UseIdx), IsFMA(IsFMA), ReadyCycles{0, 0, 0, 0}, + DoneOps{false, false, false, false}, Keep(false) {} + + unsigned getUnchainedOpIdx() { + assert(!IsFMA && ChainedOp != 0); + return ChainedOp == 1 ? 2 : 1; + } + + Register useReg(unsigned OpIdx) { + DoneOps[OpIdx] = true; + return MI->getOperand(OpIdx).getReg(); + } + + bool isFullyIncluded() { + if (IsFMA) + return DoneOps[PRD] && DoneOps[ACC]; + else + return DoneOps[LHS] && DoneOps[RHS]; + } + + unsigned getReadyCycleUnused(unsigned OpIdx) { + return !DoneOps[OpIdx] ? ReadyCycles[OpIdx] : UINT32_MAX; + } + + unsigned getBestCycle() { + assert(!isFullyIncluded()); + if (IsFMA) + return std::min(getReadyCycleUnused(PRD), getReadyCycleUnused(ACC)); + else + return std::min(getReadyCycleUnused(LHS), getReadyCycleUnused(RHS)); + } + + // Return the ready cycle of the instruction, excluding the chain. + unsigned getInsnReadyCycle() { + if (IsFMA) + return std::max(!DoneOps[PRD] ? ReadyCycles[PRD] : 0, + !DoneOps[ACC] ? ReadyCycles[ACC] : 0); + else + return std::max(!DoneOps[LHS] ? ReadyCycles[LHS] : 0, + !DoneOps[RHS] ? ReadyCycles[RHS] : 0); + } +}; + +struct AddChain : std::vector { + MachineRegisterInfo *MRI; + unsigned FMAOpcode; + unsigned AddOpcode; + const TargetRegisterClass *RC; + + AddChain(MachineRegisterInfo *MRI) + : MRI(MRI), FMAOpcode(0), AddOpcode(0), RC(nullptr) {} + + void push_back(MachineInstr *MI, unsigned UseIdx, bool IsFMA) { + std::vector::push_back(ChainNode(MI, UseIdx, IsFMA)); + if (IsFMA) { + if (!FMAOpcode) + FMAOpcode = MI->getOpcode(); + assert(MI->getOpcode() == FMAOpcode && "Different FMA opcodes?"); + } else { + if (!AddOpcode) + AddOpcode = MI->getOpcode(); + assert(MI->getOpcode() == AddOpcode && "Different Add opcodes?"); + } + if (RC == nullptr) + RC = MRI->getRegClass(MI->getOperand(0).getReg()); + assert(RC == MRI->getRegClass(MI->getOperand(0).getReg()) && "Unhandled RC"); + + // Make sure we don't miss subregs anywhere. + for (auto &MO : MI->operands()) + assert(!MO.isReg() || !MO.getSubReg()); + } +}; + +bool SystemZReassocAdditions::reassociate(MachineInstr *MI, + std::set &Seen) { + bool Modified = false; + const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + MachineBasicBlock *MBB = MI->getParent(); + + // Find the chain of instructions. + AddChain Chain(MRI); + MachineInstr *Curr = MI; + unsigned UseIdx = 0; + while (true) { + Seen.insert(Curr); + Chain.push_back(Curr, UseIdx, IsReassociableFMA(Curr)); + Register Reg = Curr->getOperand(0).getReg(); + if (!MRI->hasOneNonDBGUse(Reg)) + break; + Curr = &*MRI->use_instr_nodbg_begin(Reg); + UseIdx = Curr->findRegisterUseOperandIdx(Reg); + if (Curr->getParent() != MBB || Seen.count(Curr) || + !((IsReassociableFMA(Curr) && UseIdx == 3) || IsReassociableAdd(Curr))) + break; + } + if (Chain.size() <= 1) + return false; + + // Find the operand "ready" cycle(s) for each instruction. + for (unsigned N = 0; N < Chain.size(); ++N) { + if (Chain[N].IsFMA) { + Chain[N].ReadyCycles[PRD] = std::max(getOpReadyCycle(Chain[N].MI, LHS), + getOpReadyCycle(Chain[N].MI, RHS)); + if (N == 0) + Chain[N].ReadyCycles[ACC] = getOpReadyCycle(Chain[N].MI, ACC); + else + Chain[N].DoneOps[ACC] = true; // No unchained add. 
+bool SystemZReassocAdditions::reassociate(MachineInstr *MI,
+                                          std::set<MachineInstr *> &Seen) {
+  bool Modified = false;
+  const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // Find the chain of instructions.
+  AddChain Chain(MRI);
+  MachineInstr *Curr = MI;
+  unsigned UseIdx = 0;
+  while (true) {
+    Seen.insert(Curr);
+    Chain.push_back(Curr, UseIdx, IsReassociableFMA(Curr));
+    Register Reg = Curr->getOperand(0).getReg();
+    if (!MRI->hasOneNonDBGUse(Reg))
+      break;
+    Curr = &*MRI->use_instr_nodbg_begin(Reg);
+    UseIdx = Curr->findRegisterUseOperandIdx(Reg);
+    if (Curr->getParent() != MBB || Seen.count(Curr) ||
+        !((IsReassociableFMA(Curr) && UseIdx == 3) || IsReassociableAdd(Curr)))
+      break;
+  }
+  if (Chain.size() <= 1)
+    return false;
+
+  // Find the operand "ready" cycle(s) for each instruction.
+  for (unsigned N = 0; N < Chain.size(); ++N) {
+    if (Chain[N].IsFMA) {
+      Chain[N].ReadyCycles[PRD] = std::max(getOpReadyCycle(Chain[N].MI, LHS),
+                                           getOpReadyCycle(Chain[N].MI, RHS));
+      if (N == 0)
+        Chain[N].ReadyCycles[ACC] = getOpReadyCycle(Chain[N].MI, ACC);
+      else
+        Chain[N].DoneOps[ACC] = true; // Chained operand.
+    } else {
+      assert(!Chain[N].IsFMA);
+      if (N == 0) {
+        Chain[N].ReadyCycles[LHS] = getOpReadyCycle(Chain[N].MI, LHS);
+        Chain[N].ReadyCycles[RHS] = getOpReadyCycle(Chain[N].MI, RHS);
+      } else {
+        unsigned OpIdx = Chain[N].getUnchainedOpIdx();
+        Chain[N].ReadyCycles[OpIdx] = getOpReadyCycle(Chain[N].MI, OpIdx);
+        Chain[N].DoneOps[Chain[N].ChainedOp] = true; // Chained operand.
+      }
+    }
+  }
+
+#ifndef NDEBUG
+  LLVM_DEBUG(dbgs() << "Found chain:\n");
+  unsigned CurrC = 1;
+  for (unsigned N = 0; N < Chain.size(); ++N) {
+    std::ostringstream SS;
+    SS << "On cycle " << std::left << std::setw(6) << CurrC << " ";
+    if (Chain[N].IsFMA) {
+      SS << "FMA P:" << std::left << std::setw(6) << Chain[N].ReadyCycles[PRD];
+      if (!Chain[N].DoneOps[ACC])
+        SS << " T:" << std::left << std::setw(6) << Chain[N].ReadyCycles[ACC];
+      else
+        SS << " chained  ";
+    } else {
+      SS << "ADD ";
+      if (!Chain[N].DoneOps[LHS])
+        SS << " T:" << std::left << std::setw(6) << Chain[N].ReadyCycles[LHS];
+      else
+        SS << " chained  ";
+      if (!Chain[N].DoneOps[RHS])
+        SS << " T:" << std::left << std::setw(6) << Chain[N].ReadyCycles[RHS];
+      else
+        SS << " chained  ";
+    }
+    SS << (Chain[N].getInsnReadyCycle() > CurrC ? " stall " : "       ");
+    CurrC = std::max(CurrC, Chain[N].getInsnReadyCycle());
+    CurrC += TSchedModel.computeInstrLatency(Chain[N].MI);
+    LLVM_DEBUG(dbgs() << SS.str() << *Chain[N].MI);
+  }
+  LLVM_DEBUG(dbgs() << "Ends on " << CurrC << "\n\n");
+#endif
+
+  // Build the new chain to minimize stalls between instructions.
+  Register ChainReg;
+  unsigned CurrCycle;
+  unsigned LowestNodeUsed = 0;
+
+  auto processNextMI = [&](MachineInstr *NextMI, unsigned N, unsigned OpR) {
+    if (NextMI != Chain[N].MI) {
+      // TODO: Propagate NoFPExcept flag?
+      NextMI->setFlags(MachineInstr::MIFlag::FmReassoc |
+                       MachineInstr::MIFlag::FmNsz |
+                       MachineInstr::MIFlag::FmContract);
+      NextMI->addRegisterDead(SystemZ::CC, TRI);
+      Seen.insert(NextMI);
+    } else
+      Chain[N].Keep = true;
+    ChainReg = NextMI->getOperand(0).getReg();
+    CurrCycle = std::max(CurrCycle, Chain[N].ReadyCycles[OpR]);
+    LLVM_DEBUG(std::ostringstream SS;
+               SS << "On cycle " << std::left << std::setw(6) << CurrCycle;
+               dbgs() << SS.str() << "\t" << *NextMI);
+    CurrCycle += TSchedModel.computeInstrLatency(NextMI);
+  };
+
+  auto buildFMA = [&](unsigned N) {
+    Register LHSFac = Chain[N].useReg(LHS);
+    Register RHSFac = Chain[N].useReg(RHS);
+    LowestNodeUsed = std::max(LowestNodeUsed, N);
+    bool IsIdentical =
+        LowestNodeUsed == N && Chain[N].MI->getOperand(3).getReg() == ChainReg;
+    MachineInstr *NextMI;
+    if (!IsIdentical) {
+      Register NewChainReg = MRI->createVirtualRegister(Chain.RC);
+      NextMI =
+          BuildMI(*MBB, Chain[LowestNodeUsed].MI, Chain[N].MI->getDebugLoc(),
+                  TII->get(Chain.FMAOpcode), NewChainReg)
+              .addReg(LHSFac)
+              .addReg(RHSFac)
+              .addReg(ChainReg);
+    } else
+      NextMI = Chain[N].MI;
+    processNextMI(NextMI, N, 1);
+  };
+  auto buildFMA = [&](unsigned N) {
+    Register LHSFac = Chain[N].useReg(LHS);
+    Register RHSFac = Chain[N].useReg(RHS);
+    LowestNodeUsed = std::max(LowestNodeUsed, N);
+    bool IsIdentical =
+        LowestNodeUsed == N && Chain[N].MI->getOperand(3).getReg() == ChainReg;
+    MachineInstr *NextMI;
+    if (!IsIdentical) {
+      Register NewChainReg = MRI->createVirtualRegister(Chain.RC);
+      NextMI =
+          BuildMI(*MBB, Chain[LowestNodeUsed].MI, Chain[N].MI->getDebugLoc(),
+                  TII->get(Chain.FMAOpcode), NewChainReg)
+              .addReg(LHSFac)
+              .addReg(RHSFac)
+              .addReg(ChainReg);
+    } else
+      NextMI = Chain[N].MI;
+    processNextMI(NextMI, N, 1);
+  };
+
+  auto buildAdd = [&](unsigned N, unsigned OpIdx) {
+    Register Term = Chain[N].useReg(OpIdx);
+    LowestNodeUsed = std::max(LowestNodeUsed, N);
+    bool IsIdentical =
+        LowestNodeUsed == N && Chain[N].MI->getOpcode() == Chain.AddOpcode &&
+        ((Chain[N].MI->getOperand(1).getReg() == ChainReg &&
+          Chain[N].MI->getOperand(2).getReg() == Term) ||
+         (Chain[N].MI->getOperand(2).getReg() == ChainReg &&
+          Chain[N].MI->getOperand(1).getReg() == Term));
+    MachineInstr *NextMI;
+    if (!IsIdentical) {
+      Register NewChainReg = MRI->createVirtualRegister(Chain.RC);
+      NextMI =
+          BuildMI(*MBB, Chain[LowestNodeUsed].MI, Chain[N].MI->getDebugLoc(),
+                  TII->get(Chain.AddOpcode), NewChainReg)
+              .addReg(ChainReg)
+              .addReg(Term);
+    } else
+      NextMI = Chain[N].MI;
+    processNextMI(NextMI, N, OpIdx);
+  };
+
+  // Find the operands for a new top of chain, to build if it is better
+  // (ready earlier). The first node is special as it can be either an Add
+  // (two terms) or an FMA (a term and a product).
+  struct OpRef {
+    unsigned Idx;
+    unsigned Op; // 0 => Unused
+    OpRef() : Idx(0), Op(0) {}
+    OpRef(unsigned CI, unsigned OI) : Idx(CI), Op(OI) {}
+  };
+  OpRef T1, T2, P1;
+  if (Chain[0].IsFMA) {
+    P1 = OpRef(0, PRD);
+    T1 = OpRef(0, ACC);
+  } else {
+    T1 = OpRef(0, LHS);
+    T2 = OpRef(0, RHS);
+    if (Chain[0].ReadyCycles[RHS] < Chain[0].ReadyCycles[LHS])
+      std::swap(T1, T2);
+  }
+  for (unsigned N = 1; N < Chain.size(); ++N) {
+    if (Chain[N].IsFMA) {
+      if (!P1.Op || Chain[N].ReadyCycles[PRD] < Chain[P1.Idx].ReadyCycles[PRD])
+        P1 = OpRef(N, PRD);
+    } else {
+      unsigned OpIdx = Chain[N].getUnchainedOpIdx();
+      if (!T2.Op ||
+          Chain[N].ReadyCycles[OpIdx] < Chain[T2.Idx].ReadyCycles[T2.Op])
+        T2 = OpRef(N, OpIdx);
+      if (Chain[T2.Idx].ReadyCycles[T2.Op] < Chain[T1.Idx].ReadyCycles[T1.Op])
+        std::swap(T1, T2);
+    }
+  }
+  unsigned BestTopFMA = !P1.Op ? UINT32_MAX
+                               : std::max(Chain[T1.Idx].ReadyCycles[T1.Op],
+                                          Chain[P1.Idx].ReadyCycles[PRD]);
+  unsigned BestTopAdd = !T2.Op ? UINT32_MAX : Chain[T2.Idx].ReadyCycles[T2.Op];
+  LLVM_DEBUG(dbgs() << "Transforming:\n";);
+  if (std::min(BestTopFMA, BestTopAdd) < Chain[0].getInsnReadyCycle()) {
+    // Build a new top of the chain.
+    ChainReg = Chain[T1.Idx].useReg(T1.Op);
+    LowestNodeUsed = T1.Idx;
+    CurrCycle = Chain[T1.Idx].ReadyCycles[T1.Op];
+    if (BestTopFMA <= BestTopAdd)
+      buildFMA(P1.Idx);        // Begin with an FMA.
+    else
+      buildAdd(T2.Idx, T2.Op); // Begin with an Add.
+  } else {
+    // Keep original top of chain.
+    if (Chain[0].IsFMA) {
+      ChainReg = Chain[0].useReg(ACC);
+      CurrCycle = Chain[0].ReadyCycles[ACC];
+      buildFMA(0);
+    } else {
+      ChainReg = Chain[0].useReg(LHS);
+      CurrCycle = Chain[0].ReadyCycles[LHS];
+      buildAdd(0, RHS);
+    }
+  }
+
+  // Build the rest, chaining each new instruction after the other. Only try
+  // to move things around when there is a stall.
+  // TODO: Try to put FMAs / Adds next to each other to help MachineCombiner?
+  for (unsigned N = 0; N < Chain.size();) {
+    if (Chain[N].isFullyIncluded()) {
+      ++N;
+      continue;
+    }
+
+    // Minimize the motion of operands by looking down the chain for the next
+    // term or product that is ready.
+    unsigned NextIdx = N;
+    unsigned BestIdx = N;
+    for (NextIdx = N; NextIdx < Chain.size(); NextIdx++) {
+      if (Chain[NextIdx].isFullyIncluded())
+        continue;
+      if (Chain[NextIdx].getBestCycle() <= CurrCycle)
+        break;
+      if (Chain[NextIdx].getBestCycle() < Chain[BestIdx].getBestCycle())
+        BestIdx = NextIdx;
+    }
+    if (NextIdx == Chain.size())
+      NextIdx = BestIdx;
+
+    if (Chain[NextIdx].IsFMA) {
+      unsigned FactorsReadyC = Chain[NextIdx].getReadyCycleUnused(PRD);
+      unsigned TermReadyC = Chain[NextIdx].getReadyCycleUnused(ACC);
+      assert(FactorsReadyC != UINT32_MAX || TermReadyC != UINT32_MAX);
+      if (FactorsReadyC < TermReadyC)
+        buildFMA(NextIdx);
+      else
+        buildAdd(NextIdx, ACC);
+    } else {
+      unsigned LHSReadyC = Chain[NextIdx].getReadyCycleUnused(LHS);
+      unsigned RHSReadyC = Chain[NextIdx].getReadyCycleUnused(RHS);
+      assert(LHSReadyC != UINT32_MAX || RHSReadyC != UINT32_MAX);
+      unsigned BestOpIdx = LHSReadyC < RHSReadyC ? 1 : 2;
+      buildAdd(NextIdx, BestOpIdx);
+    }
+  }
+
+  // Update users of the chain result.
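+  // If the original last instruction was not reused (Keep), rewrite all uses
+  // of its result to the final register of the rebuilt chain.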
+  if (!Chain.back().Keep) {
+    Register OrigResultReg = Chain.back().MI->getOperand(0).getReg();
+    MRI->replaceRegWith(OrigResultReg, ChainReg);
+  }
+
+  // Erase original instructions.
+  for (unsigned N = 0; N < Chain.size(); ++N) {
+    assert(Chain[N].isFullyIncluded() && "Missed operand?");
+    if (!Chain[N].Keep) {
+      Chain[N].MI->eraseFromParent();
+      Modified = true;
+    }
+  }
+  LLVM_DEBUG(dbgs() << "Ends on " << CurrCycle
+                    << (Modified ? "\n\n" : "\n(unchanged)\n\n"););
+  return Modified;
+}
+
+bool SystemZReassocAdditions::visitMBB(MachineBasicBlock *MBB) {
+  bool Modified = false;
+
+  std::set<MachineInstr *> Seen;
+  for (MachineBasicBlock::iterator I = MBB->begin(); I != MBB->end(); ++I) {
+    bool FirstInMBB = I == MBB->begin();
+    MachineInstr *MI = &*I--;
+    if (!Seen.count(MI) && (IsReassociableFMA(MI) || IsReassociableAdd(MI)))
+      Modified |= reassociate(MI, Seen);
+    I = FirstInMBB ? MBB->begin() : ++I;
+  }
+
+  return Modified;
+}
+
+bool SystemZReassocAdditions::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getSubtarget().getInstrInfo();
+  MRI = &MF.getRegInfo();
+
+  const TargetSubtargetInfo *STI = &MF.getSubtarget();
+  TSchedModel.init(STI);
+
+  Traces = &getAnalysis<MachineTraceMetrics>();
+  TraceEnsemble = Traces->getEnsemble(TII->getMachineCombinerTraceStrategy());
+
+  LLVM_DEBUG(dbgs() << getPassName() << ": " << MF.getName() << "\n\n");
+  bool Modified = false;
+  for (auto &MBB : MF)
+    Modified |= visitMBB(&MBB);
+
+  return Modified;
+}
Index: llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -1346,15 +1346,15 @@
 // Add / subtract
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
 
 // Multiply / multiply-and-add/subtract
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMDB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFMDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFMDB(_CCPseudo)?$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(A|S)DB(_CCPseudo)?$")>;
 
 // Divide / square root
 def : InstRW<[WLat30, VecFPd, NormalGr], (instregex "VFD$")>;
Index: llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -1390,22 +1390,22 @@
 // Add / subtract
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
 def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)SB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>;
 def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
 
 // Multiply / multiply-and-add/subtract
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM$")>;
(instregex "VFM$")>; def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VFMDB$")>; -def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(D|S)B$")>; +def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(D|S)B(_CCPseudo)?$")>; def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMSB$")>; def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>; def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(N)?M(A|S)$")>; def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>; -def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>; +def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB(_CCPseudo)?$")>; def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(N)?M(A|S)SB$")>; -def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>; +def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB(_CCPseudo)?$")>; def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>; // Divide / square root Index: llvm/lib/Target/SystemZ/SystemZScheduleZ15.td =================================================================== --- llvm/lib/Target/SystemZ/SystemZScheduleZ15.td +++ llvm/lib/Target/SystemZ/SystemZScheduleZ15.td @@ -1433,21 +1433,21 @@ // Add / subtract def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>; def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>; def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>; def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>; // Multiply / multiply-and-add/subtract def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFM(DB)?$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B(_CCPseudo)?$")>; def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>; def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>; def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>; def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB(_CCPseudo)?$")>; def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)SB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB(_CCPseudo)?$")>; def : InstRW<[WLat30, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>; // Divide / square root Index: llvm/lib/Target/SystemZ/SystemZScheduleZ16.td =================================================================== --- llvm/lib/Target/SystemZ/SystemZScheduleZ16.td +++ llvm/lib/Target/SystemZ/SystemZScheduleZ16.td @@ -1439,21 +1439,21 @@ // Add / subtract def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>; def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>; def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>; -def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>; +def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>; def : 
 
 // Multiply / multiply-and-add/subtract
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFM(DB)?$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B(_CCPseudo)?$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>;
 def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)DB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)DB(_CCPseudo)?$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)SB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(N)?M(A|S)SB(_CCPseudo)?$")>;
 def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WF(N)?M(A|S)XB$")>;
 
 // Divide / square root
Index: llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -30,6 +30,15 @@
 
 using namespace llvm;
 
+static cl::opt<bool>
+EnableReassocAdditions("reassoc-additions", cl::init(false),
+                       cl::Hidden);
+
+static cl::opt<bool>
+EnableMachineCombinerPass("systemz-machine-combiner",
+                          cl::desc("Enable the machine combiner pass"),
+                          cl::init(true), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
   // Register the target.
   RegisterTargetMachine<SystemZTargetMachine> X(getTheSystemZTarget());
@@ -240,11 +249,20 @@
 
 bool SystemZPassConfig::addILPOpts() {
   addPass(&EarlyIfConverterID);
+
+  if (getOptLevel() == CodeGenOpt::Aggressive) {
+    if (EnableReassocAdditions)
+      addPass(createSystemZReassocAdditionsPass(getSystemZTargetMachine()));
+    if (EnableMachineCombinerPass)
+      addPass(&MachineCombinerID);
+  }
+
   return true;
 }
 
 void SystemZPassConfig::addPreRegAlloc() {
   addPass(createSystemZCopyPhysRegsPass(getSystemZTargetMachine()));
+  addPass(createSystemZFinalizeReassociationPass(getSystemZTargetMachine()));
 }
 
 void SystemZPassConfig::addPostRewrite() {
Index: llvm/lib/Target/X86/X86InstrInfo.h
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.h
+++ llvm/lib/Target/X86/X86InstrInfo.h
@@ -543,7 +543,7 @@
   /// instruction that defines FoldAsLoadDefReg, and the function returns
   /// the machine instruction generated due to folding.
   MachineInstr *optimizeLoadInstr(MachineInstr &MI,
-                                  const MachineRegisterInfo *MRI,
+                                  MachineRegisterInfo *MRI,
                                   Register &FoldAsLoadDefReg,
                                   MachineInstr *&DefMI) const override;
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4695,7 +4695,7 @@
 /// register, the virtual register is used once in the same BB, and the
 /// instructions in-between do not load or store, and have no side effects.
 MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
-                                              const MachineRegisterInfo *MRI,
+                                              MachineRegisterInfo *MRI,
                                               Register &FoldAsLoadDefReg,
                                               MachineInstr *&DefMI) const {
   // Check whether we can move DefMI here.
Index: llvm/test/CodeGen/SystemZ/fp-add-02.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/fp-add-02.ll
+++ llvm/test/CodeGen/SystemZ/fp-add-02.ll
@@ -118,3 +118,17 @@
 
   ret double %add10
 }
+
+; Check that reassociation flags do not get in the way of adb.
+define double @f8(ptr %x) {
+; CHECK-LABEL: f8:
+; CHECK: ld %f0
+; CHECK: adb %f0
+; CHECK: br %r14
+entry:
+  %0 = load double, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %1 = load double, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn double %1, %0
+  ret double %add
+}
Index: llvm/test/CodeGen/SystemZ/fp-mul-02.ll
===================================================================
--- llvm/test/CodeGen/SystemZ/fp-mul-02.ll
+++ llvm/test/CodeGen/SystemZ/fp-mul-02.ll
@@ -1,6 +1,6 @@
 ; Test multiplication of two f32s, producing an f64 result.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s
 
 declare float @foo()
 
@@ -201,3 +201,13 @@
 
   ret float %trunc9
 }
+
+; Check that reassociation flags do not get in the way of mdebr.
+define double @f8(float %Src) {
+; CHECK-LABEL: f8:
+; CHECK: mdebr %f0, %f0
+; CHECK: br %r14
+  %D = fpext float %Src to double
+  %res = fmul reassoc nsz arcp contract afn double %D, %D
+  ret double %res
+}
Index: llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-01.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-01.ll
@@ -0,0 +1,690 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
+; RUN:   | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -stop-before=processimpdefs \
+; RUN:   -O3 | FileCheck %s --check-prefix=PASSOUTPUT
+
+; Test reassociation of fp add, subtract and multiply.
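+;
+; As a rough sketch of what the checks below expect, a serial chain such as
+;   x0+x1+x2+x3+x4+x5+x6+x7
+; is reassociated into something like
+;   ((x0+x1) + (x2+x3)) + ((x4+x5)+x6) + x7
+; i.e. independent sub-sums that shorten the critical path (the exact
+; operand order may differ per function below).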
+
+define double @fun0_fadd(ptr %x) {
+; CHECK-LABEL: fun0_fadd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    ld %f0, 0(%r2)
+; CHECK-NEXT:    adb %f0, 8(%r2)
+; CHECK-NEXT:    ld %f1, 24(%r2)
+; CHECK-NEXT:    adb %f1, 16(%r2)
+; CHECK-NEXT:    adbr %f0, %f1
+; CHECK-NEXT:    ld %f1, 40(%r2)
+; CHECK-NEXT:    adb %f1, 32(%r2)
+; CHECK-NEXT:    adb %f1, 48(%r2)
+; CHECK-NEXT:    adbr %f0, %f1
+; CHECK-NEXT:    adb %f0, 56(%r2)
+; CHECK-NEXT:    br %r14
+
+; PASSOUTPUT: name: fun0_fadd
+; PASSOUTPUT-NOT: WFADB
+; PASSOUTPUT: WFADB killed %3, killed %18, implicit $fpc
+; PASSOUTPUT-NOT: WFADB {{.*}}$cc
+; PASSOUTPUT-NOT: WFADB_CCPseudo
+entry:
+  %0 = load double, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %1 = load double, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn double %1, %0
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %2 = load double, ptr %arrayidx2, align 8
+  %add3 = fadd reassoc nsz arcp contract afn double %add, %2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+  %3 = load double, ptr %arrayidx4, align 8
+  %add5 = fadd reassoc nsz arcp contract afn double %add3, %3
+  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+  %4 = load double, ptr %arrayidx6, align 8
+  %add7 = fadd reassoc nsz arcp contract afn double %add5, %4
+  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+  %5 = load double, ptr %arrayidx8, align 8
+  %add9 = fadd reassoc nsz arcp contract afn double %add7, %5
+  %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+  %6 = load double, ptr %arrayidx10, align 8
+  %add11 = fadd reassoc nsz arcp contract afn double %add9, %6
+  %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+  %7 = load double, ptr %arrayidx12, align 8
+  %add13 = fadd reassoc nsz arcp contract afn double %add11, %7
+  ret double %add13
+}
+
+define float @fun1_fadd(ptr %x) {
+; CHECK-LABEL: fun1_fadd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lde %f0, 0(%r2)
+; CHECK-NEXT:    aeb %f0, 4(%r2)
+; CHECK-NEXT:    lde %f1, 12(%r2)
+; CHECK-NEXT:    aeb %f1, 8(%r2)
+; CHECK-NEXT:    aebr %f0, %f1
+; CHECK-NEXT:    lde %f1, 20(%r2)
+; CHECK-NEXT:    aeb %f1, 16(%r2)
+; CHECK-NEXT:    aeb %f1, 24(%r2)
+; CHECK-NEXT:    aebr %f0, %f1
+; CHECK-NEXT:    aeb %f0, 28(%r2)
+; CHECK-NEXT:    br %r14
+
+; PASSOUTPUT: name: fun1_fadd
+; PASSOUTPUT-NOT: WFASB
+; PASSOUTPUT: WFASB killed %3, killed %18, implicit $fpc
+; PASSOUTPUT-NOT: WFASB {{.*}}$cc
+; PASSOUTPUT-NOT: WFASB_CCPseudo
+entry:
+  %0 = load float, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+  %1 = load float, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn float %1, %0
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2
+  %2 = load float, ptr %arrayidx2, align 8
+  %add3 = fadd reassoc nsz arcp contract afn float %add, %2
+  %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3
+  %3 = load float, ptr %arrayidx4, align 8
+  %add5 = fadd reassoc nsz arcp contract afn float %add3, %3
+  %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4
+  %4 = load float, ptr %arrayidx6, align 8
+  %add7 = fadd reassoc nsz arcp contract afn float %add5, %4
+  %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5
+  %5 = load float, ptr %arrayidx8, align 8
+  %add9 = fadd reassoc nsz arcp contract afn float %add7, %5
+  %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6
+  %6 = load float, ptr %arrayidx10, align 8
+  %add11 = fadd reassoc nsz arcp contract afn float %add9, %6
+  %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7
+  %7 = load float, ptr %arrayidx12, align 8
+  %add13 = fadd reassoc nsz arcp contract afn float %add11, %7
+  ret float %add13
+}
+
+define fp128 @fun2_fadd(ptr %x) {
+; CHECK-LABEL: fun2_fadd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 16(%r3), 3
+; CHECK-NEXT:    wfaxb %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 32(%r3), 3
+; CHECK-NEXT:    vl %v2, 48(%r3), 3
+; CHECK-NEXT:    wfaxb %v1, %v1, %v2
+; CHECK-NEXT:    wfaxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r3), 3
+; CHECK-NEXT:    vl %v2, 80(%r3), 3
+; CHECK-NEXT:    wfaxb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r3), 3
+; CHECK-NEXT:    wfaxb %v1, %v1, %v2
+; CHECK-NEXT:    wfaxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r3), 3
+; CHECK-NEXT:    wfaxb %v0, %v0, %v1
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load fp128, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1
+  %1 = load fp128, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn fp128 %1, %0
+  %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2
+  %2 = load fp128, ptr %arrayidx2, align 8
+  %add3 = fadd reassoc nsz arcp contract afn fp128 %add, %2
+  %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3
+  %3 = load fp128, ptr %arrayidx4, align 8
+  %add5 = fadd reassoc nsz arcp contract afn fp128 %add3, %3
+  %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4
+  %4 = load fp128, ptr %arrayidx6, align 8
+  %add7 = fadd reassoc nsz arcp contract afn fp128 %add5, %4
+  %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5
+  %5 = load fp128, ptr %arrayidx8, align 8
+  %add9 = fadd reassoc nsz arcp contract afn fp128 %add7, %5
+  %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6
+  %6 = load fp128, ptr %arrayidx10, align 8
+  %add11 = fadd reassoc nsz arcp contract afn fp128 %add9, %6
+  %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7
+  %7 = load fp128, ptr %arrayidx12, align 8
+  %add13 = fadd reassoc nsz arcp contract afn fp128 %add11, %7
+  ret fp128 %add13
+}
+
+define <2 x double> @fun3_fadd(ptr %x) {
+; CHECK-LABEL: fun3_fadd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r2), 3
+; CHECK-NEXT:    vl %v1, 16(%r2), 3
+; CHECK-NEXT:    vfadb %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 32(%r2), 3
+; CHECK-NEXT:    vl %v2, 48(%r2), 3
+; CHECK-NEXT:    vfadb %v1, %v1, %v2
+; CHECK-NEXT:    vfadb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r2), 3
+; CHECK-NEXT:    vl %v2, 80(%r2), 3
+; CHECK-NEXT:    vfadb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r2), 3
+; CHECK-NEXT:    vfadb %v1, %v1, %v2
+; CHECK-NEXT:    vfadb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r2), 3
+; CHECK-NEXT:    vfadb %v24, %v0, %v1
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load <2 x double>, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1
+  %1 = load <2 x double>, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn <2 x double> %1, %0
+  %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2
+  %2 = load <2 x double>, ptr %arrayidx2, align 8
+  %add3 = fadd reassoc nsz arcp contract afn <2 x double> %add, %2
+  %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3
+  %3 = load <2 x double>, ptr %arrayidx4, align 8
+  %add5 = fadd reassoc nsz arcp contract afn <2 x double> %add3, %3
+  %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4
+  %4 = load <2 x double>, ptr %arrayidx6, align 8
+  %add7 = fadd reassoc nsz arcp contract afn <2 x double> %add5, %4
+  %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5
+  %5 = load <2 x double>, ptr %arrayidx8, align 8
+  %add9 = fadd reassoc nsz arcp contract afn <2 x double> %add7, %5
+  %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6
+  %6 = load <2 x double>, ptr %arrayidx10, align 8
+  %add11 = fadd reassoc nsz arcp contract afn <2 x double> %add9, %6
+  %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7
+  %7 = load <2 x double>, ptr %arrayidx12, align 8
+  %add13 = fadd reassoc nsz arcp contract afn <2 x double> %add11, %7
+  ret <2 x double> %add13
+}
+
+define <4 x float> @fun4_fadd(ptr %x) {
+; CHECK-LABEL: fun4_fadd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r2), 3
+; CHECK-NEXT:    vl %v1, 16(%r2), 3
+; CHECK-NEXT:    vfasb %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 32(%r2), 3
+; CHECK-NEXT:    vl %v2, 48(%r2), 3
+; CHECK-NEXT:    vfasb %v1, %v1, %v2
+; CHECK-NEXT:    vfasb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r2), 3
+; CHECK-NEXT:    vl %v2, 80(%r2), 3
+; CHECK-NEXT:    vfasb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r2), 3
+; CHECK-NEXT:    vfasb %v1, %v1, %v2
+; CHECK-NEXT:    vfasb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r2), 3
+; CHECK-NEXT:    vfasb %v24, %v0, %v1
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load <4 x float>, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1
+  %1 = load <4 x float>, ptr %arrayidx1, align 8
+  %add = fadd reassoc nsz arcp contract afn <4 x float> %1, %0
+  %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2
+  %2 = load <4 x float>, ptr %arrayidx2, align 8
+  %add3 = fadd reassoc nsz arcp contract afn <4 x float> %add, %2
+  %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3
+  %3 = load <4 x float>, ptr %arrayidx4, align 8
+  %add5 = fadd reassoc nsz arcp contract afn <4 x float> %add3, %3
+  %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4
+  %4 = load <4 x float>, ptr %arrayidx6, align 8
+  %add7 = fadd reassoc nsz arcp contract afn <4 x float> %add5, %4
+  %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5
+  %5 = load <4 x float>, ptr %arrayidx8, align 8
+  %add9 = fadd reassoc nsz arcp contract afn <4 x float> %add7, %5
+  %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6
+  %6 = load <4 x float>, ptr %arrayidx10, align 8
+  %add11 = fadd reassoc nsz arcp contract afn <4 x float> %add9, %6
+  %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7
+  %7 = load <4 x float>, ptr %arrayidx12, align 8
+  %add13 = fadd reassoc nsz arcp contract afn <4 x float> %add11, %7
+  ret <4 x float> %add13
+}
+
+define double @fun5_fsub(ptr %x) {
+; CHECK-LABEL: fun5_fsub:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    ld %f0, 0(%r2)
+; CHECK-NEXT:    sdb %f0, 8(%r2)
+; CHECK-NEXT:    ld %f1, 24(%r2)
+; CHECK-NEXT:    adb %f1, 16(%r2)
+; CHECK-NEXT:    sdbr %f0, %f1
+; CHECK-NEXT:    ld %f1, 40(%r2)
+; CHECK-NEXT:    adb %f1, 32(%r2)
+; CHECK-NEXT:    adb %f1, 48(%r2)
+; CHECK-NEXT:    sdbr %f0, %f1
+; CHECK-NEXT:    sdb %f0, 56(%r2)
+; CHECK-NEXT:    br %r14
+
+; PASSOUTPUT: name: fun5_fsub
+; PASSOUTPUT-NOT: WFSDB
+; PASSOUTPUT: WFSDB killed %3, killed %18, implicit $fpc
+; PASSOUTPUT-NOT: WFSDB {{.*}}$cc
+; PASSOUTPUT-NOT: WFSDB_CCPseudo
+entry:
+  %0 = load double, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %1 = load double, ptr %arrayidx1, align 8
+  %sub = fsub reassoc nsz arcp contract afn double %0, %1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %2 = load double, ptr %arrayidx2, align 8
+  %sub3 = fsub reassoc nsz arcp contract afn double %sub, %2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+  %3 = load double, ptr %arrayidx4, align 8
+  %sub5 = fsub reassoc nsz arcp contract afn double %sub3, %3
+  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+  %4 = load double, ptr %arrayidx6, align 8
+  %sub7 = fsub reassoc nsz arcp contract afn double %sub5, %4
+  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+  %5 = load double, ptr %arrayidx8, align 8
+  %sub9 = fsub reassoc nsz arcp contract afn double %sub7, %5
+  %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+  %6 = load double, ptr %arrayidx10, align 8
+  %sub11 = fsub reassoc nsz arcp contract afn double %sub9, %6
+  %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+  %7 = load double, ptr %arrayidx12, align 8
+  %sub13 = fsub reassoc nsz arcp contract afn double %sub11, %7
+  ret double %sub13
+}
+
+define float @fun6_fsub(ptr %x) {
+; CHECK-LABEL: fun6_fsub:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lde %f0, 0(%r2)
+; CHECK-NEXT:    seb %f0, 4(%r2)
+; CHECK-NEXT:    lde %f1, 12(%r2)
+; CHECK-NEXT:    aeb %f1, 8(%r2)
+; CHECK-NEXT:    sebr %f0, %f1
+; CHECK-NEXT:    lde %f1, 20(%r2)
+; CHECK-NEXT:    aeb %f1, 16(%r2)
+; CHECK-NEXT:    aeb %f1, 24(%r2)
+; CHECK-NEXT:    sebr %f0, %f1
+; CHECK-NEXT:    seb %f0, 28(%r2)
+; CHECK-NEXT:    br %r14
+
+; PASSOUTPUT: name: fun6_fsub
+; PASSOUTPUT-NOT: WFSSB
+; PASSOUTPUT: WFSSB killed %3, killed %18, implicit $fpc
+; PASSOUTPUT-NOT: WFSSB {{.*}}$cc
+; PASSOUTPUT-NOT: WFSSB_CCPseudo
+entry:
+  %0 = load float, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+  %1 = load float, ptr %arrayidx1, align 8
+  %sub = fsub reassoc nsz arcp contract afn float %0, %1
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2
+  %2 = load float, ptr %arrayidx2, align 8
+  %sub3 = fsub reassoc nsz arcp contract afn float %sub, %2
+  %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3
+  %3 = load float, ptr %arrayidx4, align 8
+  %sub5 = fsub reassoc nsz arcp contract afn float %sub3, %3
+  %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4
+  %4 = load float, ptr %arrayidx6, align 8
+  %sub7 = fsub reassoc nsz arcp contract afn float %sub5, %4
+  %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5
+  %5 = load float, ptr %arrayidx8, align 8
+  %sub9 = fsub reassoc nsz arcp contract afn float %sub7, %5
+  %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6
+  %6 = load float, ptr %arrayidx10, align 8
+  %sub11 = fsub reassoc nsz arcp contract afn float %sub9, %6
+  %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7
+  %7 = load float, ptr %arrayidx12, align 8
+  %sub13 = fsub reassoc nsz arcp contract afn float %sub11, %7
+  ret float %sub13
+}
+
+define fp128 @fun7_fsub(ptr %x) {
+; CHECK-LABEL: fun7_fsub:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 16(%r3), 3
+; CHECK-NEXT:    wfsxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 32(%r3), 3
+; CHECK-NEXT:    vl %v2, 48(%r3), 3
+; CHECK-NEXT:    wfaxb %v1, %v1, %v2
+; CHECK-NEXT:    wfsxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r3), 3
+; CHECK-NEXT:    vl %v2, 80(%r3), 3
+; CHECK-NEXT:    wfaxb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r3), 3
+; CHECK-NEXT:    wfaxb %v1, %v1, %v2
+; CHECK-NEXT:    wfsxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r3), 3
+; CHECK-NEXT:    wfsxb %v0, %v0, %v1
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load fp128, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1
+  %1 = load fp128, ptr %arrayidx1, align 8
+  %sub = fsub reassoc nsz arcp contract afn fp128 %0, %1
+  %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2
+  %2 = load fp128, ptr %arrayidx2, align 8
+  %sub3 = fsub reassoc nsz arcp contract afn fp128 %sub, %2
+  %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3
+  %3 = load fp128, ptr %arrayidx4, align 8
+  %sub5 = fsub reassoc nsz arcp contract afn fp128 %sub3, %3
+  %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4
+  %4 = load fp128, ptr %arrayidx6, align 8
+  %sub7 = fsub reassoc nsz arcp contract afn fp128 %sub5, %4
+  %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5
+  %5 = load fp128, ptr %arrayidx8, align 8
+  %sub9 = fsub reassoc nsz arcp contract afn fp128 %sub7, %5
+  %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6
+  %6 = load fp128, ptr %arrayidx10, align 8
+  %sub11 = fsub reassoc nsz arcp contract afn fp128 %sub9, %6
+  %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7
+  %7 = load fp128, ptr %arrayidx12, align 8
+  %sub13 = fsub reassoc nsz arcp contract afn fp128 %sub11, %7
+  ret fp128 %sub13
+}
+
+define <2 x double> @fun8_fsub(ptr %x) {
+; CHECK-LABEL: fun8_fsub:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r2), 3
+; CHECK-NEXT:    vl %v1, 16(%r2), 3
+; CHECK-NEXT:    vfsdb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 32(%r2), 3
+; CHECK-NEXT:    vl %v2, 48(%r2), 3
+; CHECK-NEXT:    vfadb %v1, %v1, %v2
+; CHECK-NEXT:    vfsdb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r2), 3
+; CHECK-NEXT:    vl %v2, 80(%r2), 3
+; CHECK-NEXT:    vfadb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r2), 3
+; CHECK-NEXT:    vfadb %v1, %v1, %v2
+; CHECK-NEXT:    vfsdb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r2), 3
+; CHECK-NEXT:    vfsdb %v24, %v0, %v1
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load <2 x double>, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1
+  %1 = load <2 x double>, ptr %arrayidx1, align 8
+  %sub = fsub reassoc nsz arcp contract afn <2 x double> %0, %1
+  %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2
+  %2 = load <2 x double>, ptr %arrayidx2, align 8
+  %sub3 = fsub reassoc nsz arcp contract afn <2 x double> %sub, %2
+  %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3
+  %3 = load <2 x double>, ptr %arrayidx4, align 8
+  %sub5 = fsub reassoc nsz arcp contract afn <2 x double> %sub3, %3
+  %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4
+  %4 = load <2 x double>, ptr %arrayidx6, align 8
+  %sub7 = fsub reassoc nsz arcp contract afn <2 x double> %sub5, %4
+  %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5
+  %5 = load <2 x double>, ptr %arrayidx8, align 8
+  %sub9 = fsub reassoc nsz arcp contract afn <2 x double> %sub7, %5
+  %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6
+  %6 = load <2 x double>, ptr %arrayidx10, align 8
+  %sub11 = fsub reassoc nsz arcp contract afn <2 x double> %sub9, %6
+  %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7
+  %7 = load <2 x double>, ptr %arrayidx12, align 8
+  %sub13 = fsub reassoc nsz arcp contract afn <2 x double> %sub11, %7
+  ret <2 x double> %sub13
+}
+
+define <4 x float> @fun9_fsub(ptr %x) {
+; CHECK-LABEL: fun9_fsub:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r2), 3
+; CHECK-NEXT:    vl %v1, 16(%r2), 3
+; CHECK-NEXT:    vfssb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 32(%r2), 3
+; CHECK-NEXT:    vl %v2, 48(%r2), 3
+; CHECK-NEXT:    vfasb %v1, %v1, %v2
+; CHECK-NEXT:    vfssb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r2), 3
+; CHECK-NEXT:    vl %v2, 80(%r2), 3
+; CHECK-NEXT:    vfasb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r2), 3
+; CHECK-NEXT:    vfasb %v1, %v1, %v2
+; CHECK-NEXT:    vfssb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r2), 3
+; CHECK-NEXT:    vfssb %v24, %v0, %v1
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load <4 x float>, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1
+  %1 = load <4 x float>, ptr %arrayidx1, align 8
+  %sub = fsub reassoc nsz arcp contract afn <4 x float> %0, %1
+  %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2
+  %2 = load <4 x float>, ptr %arrayidx2, align 8
+  %sub3 = fsub reassoc nsz arcp contract afn <4 x float> %sub, %2
+  %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3
+  %3 = load <4 x float>, ptr %arrayidx4, align 8
+  %sub5 = fsub reassoc nsz arcp contract afn <4 x float> %sub3, %3
+  %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4
+  %4 = load <4 x float>, ptr %arrayidx6, align 8
+  %sub7 = fsub reassoc nsz arcp contract afn <4 x float> %sub5, %4
+  %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5
+  %5 = load <4 x float>, ptr %arrayidx8, align 8
+  %sub9 = fsub reassoc nsz arcp contract afn <4 x float> %sub7, %5
+  %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6
+  %6 = load <4 x float>, ptr %arrayidx10, align 8
+  %sub11 = fsub reassoc nsz arcp contract afn <4 x float> %sub9, %6
+  %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7
+  %7 = load <4 x float>, ptr %arrayidx12, align 8
+  %sub13 = fsub reassoc nsz arcp contract afn <4 x float> %sub11, %7
+  ret <4 x float> %sub13
+}
+
+define double @fun10_fmul(ptr %x) {
+; CHECK-LABEL: fun10_fmul:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    ld %f0, 8(%r2)
+; CHECK-NEXT:    mdb %f0, 0(%r2)
+; CHECK-NEXT:    ld %f1, 24(%r2)
+; CHECK-NEXT:    mdb %f1, 16(%r2)
+; CHECK-NEXT:    mdbr %f0, %f1
+; CHECK-NEXT:    ld %f1, 40(%r2)
+; CHECK-NEXT:    mdb %f1, 32(%r2)
+; CHECK-NEXT:    mdb %f1, 48(%r2)
+; CHECK-NEXT:    mdbr %f0, %f1
+; CHECK-NEXT:    mdb %f0, 56(%r2)
+; CHECK-NEXT:    br %r14
+
+; PASSOUTPUT: name: fun10_fmul
+; PASSOUTPUT-NOT: WFMDB
+; PASSOUTPUT: WFMDB killed %3, killed %18, implicit $fpc
+; PASSOUTPUT-NOT: WFMDB {{.*}}$cc
+; PASSOUTPUT-NOT: WFMDB_CCPseudo
+entry:
+  %0 = load double, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %1 = load double, ptr %arrayidx1, align 8
+  %mul = fmul reassoc nsz arcp contract afn double %0, %1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %2 = load double, ptr %arrayidx2, align 8
+  %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+  %3 = load double, ptr %arrayidx4, align 8
+  %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3
+  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+  %4 = load double, ptr %arrayidx6, align 8
+  %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4
+  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+  %5 = load double, ptr %arrayidx8, align 8
+  %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5
+  %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+  %6 = load double, ptr %arrayidx10, align 8
+  %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6
+  %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+  %7 = load double, ptr %arrayidx12, align 8
+  %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7
+  ret double %mul13
+}
+
+define float @fun11_fmul(ptr %x) {
+; CHECK-LABEL: fun11_fmul:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lde %f0, 4(%r2)
+; CHECK-NEXT:    meeb %f0, 0(%r2)
+; CHECK-NEXT:    lde %f1, 12(%r2)
+; CHECK-NEXT:    meeb %f1, 8(%r2)
+; CHECK-NEXT:    meebr %f0, %f1
+; CHECK-NEXT:    lde %f1, 20(%r2)
+; CHECK-NEXT:    meeb %f1, 16(%r2)
+; CHECK-NEXT:    meeb %f1, 24(%r2)
+; CHECK-NEXT:    meebr %f0, %f1
+; CHECK-NEXT:    meeb %f0, 28(%r2)
+; CHECK-NEXT:    br %r14
+
+; PASSOUTPUT: name: fun11_fmul
+; PASSOUTPUT-NOT: WFMSB
+; PASSOUTPUT: WFMSB killed %3, killed %18, implicit $fpc
+; PASSOUTPUT-NOT: WFMSB {{.*}}$cc
+; PASSOUTPUT-NOT: WFMSB_CCPseudo
+entry:
+  %0 = load float, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1
+  %1 = load float, ptr %arrayidx1, align 8
+  %mul = fmul reassoc nsz arcp contract afn float %0, %1
+  %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2
+  %2 = load float, ptr %arrayidx2, align 8
+  %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2
+  %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3
+  %3 = load float, ptr %arrayidx4, align 8
+  %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3
+  %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4
+  %4 = load float, ptr %arrayidx6, align 8
+  %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4
+  %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5
+  %5 = load float, ptr %arrayidx8, align 8
+  %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5
+  %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6
+  %6 = load float, ptr %arrayidx10, align 8
+  %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6
+  %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7
+  %7 = load float, ptr %arrayidx12, align 8
+  %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7
+  ret float %mul13
+}
+
+define fp128 @fun12_fmul(ptr %x) {
+; CHECK-LABEL: fun12_fmul:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r3), 3
+; CHECK-NEXT:    vl %v1, 16(%r3), 3
+; CHECK-NEXT:    wfmxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 32(%r3), 3
+; CHECK-NEXT:    vl %v2, 48(%r3), 3
+; CHECK-NEXT:    wfmxb %v1, %v1, %v2
+; CHECK-NEXT:    wfmxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r3), 3
+; CHECK-NEXT:    vl %v2, 80(%r3), 3
+; CHECK-NEXT:    wfmxb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r3), 3
+; CHECK-NEXT:    wfmxb %v1, %v1, %v2
+; CHECK-NEXT:    wfmxb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r3), 3
+; CHECK-NEXT:    wfmxb %v0, %v0, %v1
+; CHECK-NEXT:    vst %v0, 0(%r2), 3
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load fp128, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1
+  %1 = load fp128, ptr %arrayidx1, align 8
+  %mul = fmul reassoc nsz arcp contract afn fp128 %0, %1
+  %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2
+  %2 = load fp128, ptr %arrayidx2, align 8
+  %mul3 = fmul reassoc nsz arcp contract afn fp128 %mul, %2
+  %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3
+  %3 = load fp128, ptr %arrayidx4, align 8
+  %mul5 = fmul reassoc nsz arcp contract afn fp128 %mul3, %3
+  %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4
+  %4 = load fp128, ptr %arrayidx6, align 8
+  %mul7 = fmul reassoc nsz arcp contract afn fp128 %mul5, %4
+  %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5
+  %5 = load fp128, ptr %arrayidx8, align 8
+  %mul9 = fmul reassoc nsz arcp contract afn fp128 %mul7, %5
+  %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6
+  %6 = load fp128, ptr %arrayidx10, align 8
+  %mul11 = fmul reassoc nsz arcp contract afn fp128 %mul9, %6
+  %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7
+  %7 = load fp128, ptr %arrayidx12, align 8
+  %mul13 = fmul reassoc nsz arcp contract afn fp128 %mul11, %7
+  ret fp128 %mul13
+}
+
+define <2 x double> @fun13_fmul(ptr %x) {
+; CHECK-LABEL: fun13_fmul:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r2), 3
+; CHECK-NEXT:    vl %v1, 16(%r2), 3
+; CHECK-NEXT:    vfmdb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 32(%r2), 3
+; CHECK-NEXT:    vl %v2, 48(%r2), 3
+; CHECK-NEXT:    vfmdb %v1, %v1, %v2
+; CHECK-NEXT:    vfmdb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r2), 3
+; CHECK-NEXT:    vl %v2, 80(%r2), 3
+; CHECK-NEXT:    vfmdb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r2), 3
+; CHECK-NEXT:    vfmdb %v1, %v1, %v2
+; CHECK-NEXT:    vfmdb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r2), 3
+; CHECK-NEXT:    vfmdb %v24, %v0, %v1
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load <2 x double>, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1
+  %1 = load <2 x double>, ptr %arrayidx1, align 8
+  %mul = fmul reassoc nsz arcp contract afn <2 x double> %0, %1
+  %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2
+  %2 = load <2 x double>, ptr %arrayidx2, align 8
+  %mul3 = fmul reassoc nsz arcp contract afn <2 x double> %mul, %2
+  %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3
+  %3 = load <2 x double>, ptr %arrayidx4, align 8
+  %mul5 = fmul reassoc nsz arcp contract afn <2 x double> %mul3, %3
+  %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4
+  %4 = load <2 x double>, ptr %arrayidx6, align 8
+  %mul7 = fmul reassoc nsz arcp contract afn <2 x double> %mul5, %4
+  %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5
+  %5 = load <2 x double>, ptr %arrayidx8, align 8
+  %mul9 = fmul reassoc nsz arcp contract afn <2 x double> %mul7, %5
+  %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6
+  %6 = load <2 x double>, ptr %arrayidx10, align 8
+  %mul11 = fmul reassoc nsz arcp contract afn <2 x double> %mul9, %6
+  %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7
+  %7 = load <2 x double>, ptr %arrayidx12, align 8
+  %mul13 = fmul reassoc nsz arcp contract afn <2 x double> %mul11, %7
+  ret <2 x double> %mul13
+}
+
+define <4 x float> @fun14_fmul(ptr %x) {
+; CHECK-LABEL: fun14_fmul:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vl %v0, 0(%r2), 3
+; CHECK-NEXT:    vl %v1, 16(%r2), 3
+; CHECK-NEXT:    vfmsb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 32(%r2), 3
+; CHECK-NEXT:    vl %v2, 48(%r2), 3
+; CHECK-NEXT:    vfmsb %v1, %v1, %v2
+; CHECK-NEXT:    vfmsb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 64(%r2), 3
+; CHECK-NEXT:    vl %v2, 80(%r2), 3
+; CHECK-NEXT:    vfmsb %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 96(%r2), 3
+; CHECK-NEXT:    vfmsb %v1, %v1, %v2
+; CHECK-NEXT:    vfmsb %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 112(%r2), 3
+; CHECK-NEXT:    vfmsb %v24, %v0, %v1
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = load <4 x float>, ptr %x, align 8
+  %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1
+  %1 = load <4 x float>, ptr %arrayidx1, align 8
+  %mul = fmul reassoc nsz arcp contract afn <4 x float> %0, %1
+  %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2
+  %2 = load <4 x float>, ptr %arrayidx2, align 8
+  %mul3 = fmul reassoc nsz arcp contract afn <4 x float> %mul, %2
+  %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3
+  %3 = load <4 x float>, ptr %arrayidx4, align 8
+  %mul5 = fmul reassoc nsz arcp contract afn <4 x float> %mul3, %3
+  %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4
+  %4 = load <4 x float>, ptr %arrayidx6, align 8
+  %mul7 = fmul reassoc nsz arcp contract afn <4 x float> %mul5, %4
+  %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5
+  %5 = load <4 x float>, ptr %arrayidx8, align 8
+  %mul9 = fmul reassoc nsz arcp contract afn <4 x float> %mul7, %5
+  %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6
+  %6 = load <4 x float>, ptr %arrayidx10, align 8
+  %mul11 = fmul reassoc nsz arcp contract afn <4 x float> %mul9, %6
+  %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7
+  %7 = load <4 x float>, ptr %arrayidx12, align 8
+  %mul13 = fmul reassoc nsz arcp contract afn <4 x float> %mul11, %7
+  ret <4 x float> %mul13
+}
Index: llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-02.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-02.ll
@@ -0,0 +1,188 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
+; RUN:   -print-before=machine-combiner 2>&1 | FileCheck %s
+
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -O3 \
+; RUN:   -print-before=machine-combiner -print-after=machine-combiner -fma3-ch \
+; RUN:   2>&1 | FileCheck %s --check-prefix=FMA3-CH
+
+; REQUIRES: asserts
+
+define double @fun0_fma4(ptr %x, double %A) {
+; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun0_fma4: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d, $f0d
+; CHECK-NEXT: %1:fp64bit = COPY $f0d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx6)
+; CHECK-NEXT: %7:vr64bit = VL64 %0:addr64bit, 40, $noreg :: (load (s64) from %ir.arrayidx8)
+; CHECK-NEXT: %8:vr64bit = VL64 %0:addr64bit, 48, $noreg :: (load (s64) from %ir.arrayidx10)
+; CHECK-NEXT: %9:vr64bit = VL64 %0:addr64bit, 56, $noreg :: (load (s64) from %ir.arrayidx12)
+; CHECK-NEXT: %10:vr64bit = nsz contract reassoc nofpexcept WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit, implicit-def dead $cc, implicit $fpc
+; CHECK-NEXT: %11:vr64bit = nsz contract reassoc nofpexcept WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, killed %10:vr64bit, implicit-def dead $cc, implicit $fpc
+; CHECK-NEXT: %12:vr64bit = nsz contract reassoc nofpexcept WFMADB_CCPseudo killed %6:vr64bit, killed %7:vr64bit, killed %11:vr64bit, implicit-def dead $cc, implicit $fpc
+; CHECK-NEXT: %13:vr64bit = nsz contract reassoc nofpexcept WFMADB_CCPseudo killed %8:vr64bit, killed %9:vr64bit, killed %12:vr64bit, implicit-def dead $cc, implicit $fpc
+; CHECK-NEXT: $f0d = COPY %13:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+
+; FMA3-CH: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; FMA3-CH-NEXT: # Machine code for function fun0_fma4: IsSSA, TracksLiveness
+; FMA3-CH: [[A0:%1:fp64bit]] = COPY $f0d
+
+; FMA3-CH: [[A1:%[0-9]+:vr64bit]] = {{.*}} WFMADB_CCPseudo killed [[M11:%[0-9]+:vr64bit]], killed [[M12:%[0-9]+:vr64bit]], [[A0]]
+; FMA3-CH-NEXT: [[A2:%[0-9]+:vr64bit]] = {{.*}} WFMADB_CCPseudo killed [[M21:%[0-9]+:vr64bit]], killed [[M22:%[0-9]+:vr64bit]], killed [[A1]]
+; FMA3-CH-NEXT: [[A3:%[0-9]+:vr64bit]] = {{.*}} WFMADB_CCPseudo killed [[M31:%[0-9]+:vr64bit]], killed [[M32:%[0-9]+:vr64bit]], killed [[A2]]
+; FMA3-CH-NEXT: [[A4:%[0-9]+:vr64bit]] = {{.*}} WFMADB_CCPseudo killed [[M41:%[0-9]+:vr64bit]], killed [[M42:%[0-9]+:vr64bit]], killed [[A3]]
+
+; FMA3-CH: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; FMA3-CH-NEXT: # Machine code for function fun0_fma4: IsSSA, TracksLiveness
+; FMA3-CH: %14:vr64bit = {{.*}} WFMDB_CCPseudo killed [[M11]], killed [[M12]], implicit-def dead $cc
+; FMA3-CH-NEXT: %15:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], [[A0]], implicit-def dead $cc
+; FMA3-CH-NEXT: %16:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M31]], killed [[M32]], %14:vr64bit, implicit-def dead $cc
+; FMA3-CH-NEXT: %12:vr64bit = {{.*}} WFADB_CCPseudo %15:vr64bit, %16:vr64bit, implicit-def dead $cc
+; FMA3-CH-NEXT: %13:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M41]], killed [[M42]], killed %12:vr64bit, implicit-def dead $cc
+entry:
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+  %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+  %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+
+  %0 = load double, ptr %x
+  %1 = load double, ptr %arrayidx1
+  %2 = load double, ptr %arrayidx2
+  %3 = load double, ptr %arrayidx4
+  %4 = load double, ptr %arrayidx6
+  %5 = load double, ptr %arrayidx8
+  %6 = load double, ptr %arrayidx10
+  %7 = load double, ptr %arrayidx12
+
+  %mul1 = fmul reassoc nsz contract double %0, %1
+  %mul2 = fmul reassoc nsz contract double %2, %3
+  %mul3 = fmul reassoc nsz contract double %4, %5
+  %mul4 = fmul reassoc nsz contract double %6, %7
+
+  %A1 = fadd reassoc nsz contract double %A, %mul1
+  %A2 = fadd reassoc nsz contract double %A1, %mul2
+  %A3 = fadd reassoc nsz contract double %A2, %mul3
+  %A4 = fadd reassoc nsz contract double %A3, %mul4
+
+  ret double %A4
+}
+
+; Same as above, but the third FMA has a long latency operand.
+; For FMA3-CH, the first three FMAs are not worth it as the depth of the new
+; root increases, but the last three work out fine.
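+;
+; As a rough sketch, the FMA3_Ch pattern rewrites three chained FMAs
+;   t1 = a + m1a*m1b;  t2 = t1 + m2a*m2b;  t3 = t2 + m3a*m3b
+; into approximately
+;   p = m1a*m1b;  u = a + m2a*m2b;  v = p + m3a*m3b;  t3 = u + v
+; which only pays off when the new root does not become deeper than the
+; old one (see the exact expected output in the checks below).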
+define double @fun1_fma4_divop(ptr %x, double %A) {
+; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun1_fma4_divop: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d, $f0d
+; CHECK-NEXT: %1:fp64bit = COPY $f0d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %6:fp64bit = VL64 %0:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx6)
+; CHECK-NEXT: %7:vr64bit = VL64 %0:addr64bit, 48, $noreg :: (load (s64) from %ir.arrayidx10)
+; CHECK-NEXT: %8:vr64bit = VL64 %0:addr64bit, 56, $noreg :: (load (s64) from %ir.arrayidx12)
+; CHECK-NEXT: %9:fp64bit = nofpexcept DDB %6:fp64bit(tied-def 0), %0:addr64bit, 40, $noreg, implicit $fpc :: (load (s64) from %ir.arrayidx8)
+; CHECK-NEXT: %10:vr64bit = nsz contract reassoc nofpexcept WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit, implicit-def dead $cc, implicit $fpc
+; CHECK-NEXT: %11:vr64bit = nsz contract reassoc nofpexcept WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, killed %10:vr64bit, implicit-def dead $cc, implicit $fpc
+; CHECK-NEXT: %12:vr64bit = nsz contract reassoc nofpexcept WFMADB_CCPseudo %6:fp64bit, killed %9:fp64bit, killed %11:vr64bit, implicit-def dead $cc, implicit $fpc
+; CHECK-NEXT: %13:vr64bit = nsz contract reassoc nofpexcept WFMADB_CCPseudo killed %7:vr64bit, killed %8:vr64bit, killed %12:vr64bit, implicit-def dead $cc, implicit $fpc
+; CHECK-NEXT: $f0d = COPY %13:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+
+; FMA3-CH: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; FMA3-CH-NEXT: # Machine code for function fun1_fma4_divop: IsSSA, TracksLiveness
+; FMA3-CH: [[A0:%1:fp64bit]] = COPY $f0d
+; FMA3-CH: [[A1:%[0-9]+:vr64bit]] = {{.*}} WFMADB_CCPseudo killed [[M11:%[0-9]+:vr64bit]], killed [[M12:%[0-9]+:vr64bit]], [[A0]]
+; FMA3-CH-NEXT: [[A2:%[0-9]+:vr64bit]] = {{.*}} WFMADB_CCPseudo killed [[M21:%[0-9]+:vr64bit]], killed [[M22:%[0-9]+:vr64bit]], killed [[A1]]
+; FMA3-CH-NEXT: [[A3:%[0-9]+:vr64bit]] = {{.*}} WFMADB_CCPseudo [[M31:%[0-9]+:fp64bit]], killed [[M32:%[0-9]+:fp64bit]], killed [[A2]]
+; FMA3-CH-NEXT: [[A4:%[0-9]+:vr64bit]] = {{.*}} WFMADB_CCPseudo killed [[M41:%[0-9]+:vr64bit]], killed [[M42:%[0-9]+:vr64bit]], killed [[A3]]
+
+; FMA3-CH: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; FMA3-CH-NEXT: # Machine code for function fun1_fma4_divop: IsSSA, TracksLiveness
+; FMA3-CH: %10:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M11]], killed [[M12]], [[A0]]
+; FMA3-CH-NEXT: %17:vr64bit = {{.*}} WFMDB_CCPseudo killed [[M21]], killed [[M22]]
+; FMA3-CH-NEXT: %18:vr64bit = {{.*}} WFMADB_CCPseudo [[M31]], killed [[M32]], killed [[A1]]
+; FMA3-CH-NEXT: %19:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M41]], killed [[M42]], %17:vr64bit
+; FMA3-CH-NEXT: %13:vr64bit = {{.*}} WFADB_CCPseudo %18:vr64bit, %19:vr64bit
+entry:
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+  %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+  %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+
+  %0 = load double, ptr %x
+  %1 = load double, ptr %arrayidx1
+  %2 = load double, ptr %arrayidx2
+  %3 = load double, ptr %arrayidx4
+  %4 = load double, ptr %arrayidx6
+  %5 = load double, ptr %arrayidx8
+  %6 = load double, ptr %arrayidx10
+  %7 = load double, ptr %arrayidx12
+  %div = fdiv double %4, %5
+
+  %mul1 = fmul reassoc nsz contract double %0, %1
+  %mul2 = fmul reassoc nsz contract double %2, %3
+  %mul3 = fmul reassoc nsz contract double %4, %div
+  %mul4 = fmul reassoc nsz contract double %6, %7
+
+  %A1 = fadd reassoc nsz contract double %A, %mul1
+  %A2 = fadd reassoc nsz contract double %A1, %mul2
+  %A3 = fadd reassoc nsz contract double %A2, %mul3
+  %A4 = fadd reassoc nsz contract double %A3, %mul4
+
+  ret double %A4
+}
+
+
+; Test that reg/mem folding works after transformations:
+
+; FMA3-CH: fun0_fma4: # @fun0_fma4
+; FMA3-CH-NEXT: .cfi_startproc
+; FMA3-CH-NEXT: # %bb.0: # %entry
+; FMA3-CH-NEXT: ld %f1, 8(%r2)
+; FMA3-CH-NEXT: mdb %f1, 0(%r2)
+; FMA3-CH-NEXT: ld %f2, 24(%r2)
+; FMA3-CH-NEXT: madb %f0, %f2, 16(%r2)
+; FMA3-CH-NEXT: ld %f3, 40(%r2)
+; FMA3-CH-NEXT: ld %f2, 56(%r2)
+; FMA3-CH-NEXT: madb %f1, %f3, 32(%r2)
+; FMA3-CH-NEXT: adbr %f0, %f1
+; FMA3-CH-NEXT: madb %f0, %f2, 48(%r2)
+; FMA3-CH-NEXT: br %r14
+; FMA3-CH-NEXT: .Lfunc_end0:
+; FMA3-CH-NEXT: .size fun0_fma4, .Lfunc_end0-fun0_fma4
+; FMA3-CH-NEXT: .cfi_endproc
+; FMA3-CH-NEXT: # -- End function
+; FMA3-CH-NEXT: .globl fun1_fma4_divop # -- Begin function fun1_fma4_divop
+; FMA3-CH-NEXT: .p2align 4
+; FMA3-CH-NEXT: .type fun1_fma4_divop,@function
+; FMA3-CH-NEXT: fun1_fma4_divop: # @fun1_fma4_divop
+; FMA3-CH-NEXT: .cfi_startproc
+; FMA3-CH-NEXT: # %bb.0: # %entry
+; FMA3-CH-NEXT: ld %f1, 8(%r2)
+; FMA3-CH-NEXT: ld %f2, 56(%r2)
+; FMA3-CH-NEXT: madb %f0, %f1, 0(%r2)
+; FMA3-CH-NEXT: ld %f1, 24(%r2)
+; FMA3-CH-NEXT: mdb %f1, 16(%r2)
+; FMA3-CH-NEXT: madb %f1, %f2, 48(%r2)
+; FMA3-CH-NEXT: ld %f2, 32(%r2)
+; FMA3-CH-NEXT: ldr %f3, %f2
+; FMA3-CH-NEXT: ddb %f3, 40(%r2)
+; FMA3-CH-NEXT: madbr %f0, %f2, %f3
+; FMA3-CH-NEXT: adbr %f0, %f1
+; FMA3-CH-NEXT: br %r14
Index: llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-03.ll
@@ -0,0 +1,90 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
+; RUN:   -print-before=machine-combiner -print-after=machine-combiner -fma2add \
+; RUN:   2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+define double @fun0_fma2_add(ptr %x, double %A, double %B) {
+; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun0_fma2_add: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d, $f0d, $f2d
+; CHECK-NEXT: [[Y:%2:fp64bit]] = COPY $f2d
+; CHECK-NEXT: [[X:%1:fp64bit]] = COPY $f0d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo [[X]], [[Y]]
+; CHECK-NEXT: %8:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21:%3:vr64bit]], killed [[M22:%4:vr64bit]], killed %7:vr64bit
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M31:%5:vr64bit]], killed [[M32:%6:vr64bit]], killed %8:vr64bit
+; CHECK-NEXT: $f0d = COPY %9:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun0_fma2_add: IsSSA, TracksLiveness
+; CHECK: %10:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], [[X]]
+; CHECK-NEXT: %11:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M31]], killed [[M32]], [[Y]]
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFADB_CCPseudo %10:vr64bit, %11:vr64bit
+; CHECK-NEXT: $f0d = COPY %9:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+entry:
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+
+  %0 = load double, ptr %x
+  %1 = load double, ptr %arrayidx1
+  %2 = load double, ptr %arrayidx2
+  %3 = load double, ptr %arrayidx4
+
+  %mul1 = fmul reassoc nsz contract double %0, %1
+  %mul2 = fmul reassoc nsz contract double %2, %3
+
+  %A1 = fadd reassoc nsz contract double %A, %B
+  %A2 = fadd reassoc nsz contract double %A1, %mul1
+  %A3 = fadd reassoc nsz contract double %A2, %mul2
+
+  ret double %A3
+}
+
+; Same as above, but with a long-latency factor in the root FMA which makes
+; this undesirable.
+define double @fun1_fma2_add_divop(ptr %x, double %A, double %B) {
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun1_fma2_add_divop: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d, $f0d, $f2d
+; CHECK-NEXT: %2:fp64bit = COPY $f2d
+; CHECK-NEXT: %1:fp64bit = COPY $f0d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %5:fp64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %6:fp64bit = nofpexcept DDB %5:fp64bit(tied-def 0), %0:addr64bit, 24, $noreg, implicit $fpc
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo %1:fp64bit, %2:fp64bit
+; CHECK-NEXT: %8:vr64bit = {{.*}} WFMADB_CCPseudo killed %3:vr64bit, killed %4:vr64bit, killed %7:vr64bit
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB_CCPseudo %5:fp64bit, killed %6:fp64bit, killed %8:vr64bit
+; CHECK-NEXT: $f0d = COPY %9:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+entry:
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+
+  %0 = load double, ptr %x
+  %1 = load double, ptr %arrayidx1
+  %2 = load double, ptr %arrayidx2
+  %3 = load double, ptr %arrayidx4
+  %div = fdiv double %2, %3
+
+  %mul1 = fmul reassoc nsz contract double %0, %1
+  %mul2 = fmul reassoc nsz contract double %2, %div
+
+  %A1 = fadd reassoc nsz contract double %A, %B
+  %A2 = fadd reassoc nsz contract double %A1, %mul1
+  %A3 = fadd reassoc nsz contract double %A2, %mul2
+
+  ret double %A3
+}
Index: llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-04.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
+; RUN:   -print-before=machine-combiner -print-after=machine-combiner -fma2 \
-print-after=machine-combiner -fma2 \
+; RUN:   2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+; The incoming accumulator is stalling, so it is worth putting the
+; multiplications in parallel with it.
+define double @fun0_fma2_divop(ptr %x) {
+; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun0_fma2_divop: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: [[M21:%1:vr64bit]] = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: [[M22:%2:vr64bit]] = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: [[M11:%3:vr64bit]] = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: [[M12:%4:vr64bit]] = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB %3:vr64bit, %4:vr64bit, implicit $fpc
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed [[DIV]]
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo [[M11]], [[M12]], killed %6:vr64bit
+; CHECK-NEXT: $f0d = COPY %7:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun0_fma2_divop: IsSSA, TracksLiveness
+; CHECK: %8:vr64bit = {{.*}} WFMDB_CCPseudo killed [[M21]], killed [[M22]]
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB_CCPseudo [[M11]], [[M12]], %8:vr64bit
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFADB_CCPseudo killed [[DIV]], %9:vr64bit
+entry:
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+
+  %0 = load double, ptr %x
+  %1 = load double, ptr %arrayidx1
+  %2 = load double, ptr %arrayidx2
+  %3 = load double, ptr %arrayidx4
+  %div = fdiv double %2, %3
+
+  %mul1 = fmul reassoc nsz contract double %0, %1
+  %mul2 = fmul reassoc nsz contract double %2, %3
+
+  %A1 = fadd reassoc nsz contract double %div, %mul1
+  %A2 = fadd reassoc nsz contract double %A1, %mul2
+
+  ret double %A2
+}
+
+; The non-profitable case:
+define double @fun1_fma2(ptr %x, double %Arg) {
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun1_fma2: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d, $f0d
+; CHECK-NEXT: %1:fp64bit = COPY $f0d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
+; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, killed %6:vr64bit
+; CHECK-NEXT: $f0d = COPY %7:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+entry:
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+
+  %0 = load double, ptr %x
+  %1 = load double, ptr %arrayidx1
+  %2 = load double, ptr %arrayidx2
+  %3 = load double, ptr %arrayidx4
+
+  %mul1 = fmul reassoc nsz contract double %0, %1
+  %mul2 = fmul reassoc nsz contract double %2, %3
+
+  %A1 = fadd reassoc nsz contract double %Arg, %mul1
+  %A2 = fadd reassoc nsz contract double %A1, %mul2
+
+  ret double %A2
+}
Index: llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-05.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-05.ll
@@ -0,0 +1,103 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
+; RUN:   -print-before=machine-combiner -print-after=machine-combiner -fma3 \
+; RUN:   2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+; The incoming accumulator is stalling, so it is worth putting the
+; multiplications in parallel with it.
+define double @fun0_fma3_divop(ptr %x) {
+; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun0_fma3_divop: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: [[M31:%1:vr64bit]] = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: [[M32:%2:vr64bit]] = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: [[M21:%3:vr64bit]] = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: [[M22:%4:vr64bit]] = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: [[M11:%5:vr64bit]] = VL64 %0:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx6)
+; CHECK-NEXT: [[M12:%6:vr64bit]] = VL64 %0:addr64bit, 40, $noreg :: (load (s64) from %ir.arrayidx8)
+; CHECK-NEXT: [[DIV:%7:vr64bit]] = nofpexcept WFDDB %5:vr64bit, %6:vr64bit, implicit $fpc
+; CHECK-NEXT: %8:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M31]], killed [[M32]], killed [[DIV]]
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed %8:vr64bit
+; CHECK-NEXT: %10:vr64bit = {{.*}} WFMADB_CCPseudo [[M11]], [[M12]], killed %9:vr64bit
+; CHECK-NEXT: $f0d = COPY %10:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun0_fma3_divop: IsSSA, TracksLiveness
+; CHECK: %11:vr64bit = {{.*}} WFMDB_CCPseudo killed [[M31]], killed [[M32]]
+; CHECK-NEXT: %12:vr64bit = {{.*}} WFMDB_CCPseudo killed [[M21]], killed [[M22]]
+; CHECK-NEXT: %13:vr64bit = {{.*}} WFMADB_CCPseudo [[M11]], [[M12]], %12:vr64bit
+; CHECK-NEXT: %14:vr64bit = {{.*}} WFADB_CCPseudo %11:vr64bit, %13:vr64bit
+; CHECK-NEXT: %10:vr64bit = {{.*}} WFADB_CCPseudo killed [[DIV]], %14:vr64bit
+entry:
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+
+  %0 = load double, ptr %x
+  %1 = load double, ptr %arrayidx1
+  %2 = load double, ptr %arrayidx2
+  %3 = load double, ptr %arrayidx4
+  %4 = load double, ptr %arrayidx6
+  %5 = load double, ptr %arrayidx8
+  %div = fdiv double %4, %5
+
+  %mul1 = fmul reassoc nsz contract double %0, %1
+  %mul2 = fmul reassoc nsz contract double %2, %3
+  %mul3 = fmul reassoc nsz contract double %4, %5
+
+  %A1 = fadd reassoc nsz contract double %div, %mul1
+  %A2 = fadd reassoc nsz contract double %A1, %mul2
+  %A3 = fadd reassoc nsz contract double %A2, %mul3
+
+  ret double %A3
+}
+
+; The non-profitable case:
+define double @fun1_fma3(ptr %x, double %Arg) {
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun1_fma3: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d, $f0d
+; CHECK-NEXT: %1:fp64bit = COPY $f0d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx6)
+; CHECK-NEXT: %7:vr64bit = VL64 %0:addr64bit, 40, $noreg :: (load (s64) from %ir.arrayidx8)
+; CHECK-NEXT: %8:vr64bit = {{.*}} WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
+; CHECK-NEXT: %9:vr64bit = {{.*}} WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, killed %8:vr64bit
+; CHECK-NEXT: %10:vr64bit = {{.*}} WFMADB_CCPseudo killed %6:vr64bit, killed %7:vr64bit, killed %9:vr64bit
+; CHECK-NEXT: $f0d = COPY %10:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+entry:
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+
+  %0 = load double, ptr %x
+  %1 = load double, ptr %arrayidx1
+  %2 = load double, ptr %arrayidx2
+  %3 = load double, ptr %arrayidx4
+  %4 = load double, ptr %arrayidx6
+  %5 = load double, ptr %arrayidx8
+
+  %mul1 = fmul reassoc nsz contract double %0, %1
+  %mul2 = fmul reassoc nsz contract double %2, %3
+  %mul3 = fmul reassoc nsz contract double %4, %5
+
+  %A1 = fadd reassoc nsz contract double %Arg, %mul1
+  %A2 = fadd reassoc nsz contract double %A1, %mul2
+  %A3 = fadd reassoc nsz contract double %A2, %mul3
+
+  ret double %A3
+}
Index: llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-06.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-06.ll
@@ -0,0 +1,136 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
+; RUN:   -print-before=machine-combiner -print-after=machine-combiner -fma4 \
+; RUN:   2>&1 | FileCheck %s
+
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
+; RUN:   -print-before=machine-combiner -print-after=machine-combiner -fma4 \
+; RUN:   -fma4-eqdepth 2>&1 | FileCheck %s --check-prefix=EQDEPTH
+
+; REQUIRES: asserts
+
+; The incoming accumulator is stalling, so it is worth putting the
+; multiplications in parallel with it.
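+; The serial chain div -> fma -> fma -> fma -> fma should be rebuilt as two
+; parallel mul/fma pairs whose results are summed and only then added to the
+; division result, leaving a single add on the critical path after the
+; divide (see the 'After' dump checked below).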
+define double @fun0_fma4_divop(ptr %x) { +; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***: +; CHECK-NEXT: # Machine code for function fun0_fma4_divop: IsSSA, TracksLiveness +; CHECK: bb.0.entry: +; CHECK-NEXT: liveins: $r2d +; CHECK-NEXT: %0:addr64bit = COPY $r2d +; CHECK-NEXT: [[M41:%1:vr64bit]] = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x) +; CHECK-NEXT: [[M42:%2:vr64bit]] = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1) +; CHECK-NEXT: [[M31:%3:vr64bit]] = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2) +; CHECK-NEXT: [[M32:%4:vr64bit]] = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4) +; CHECK-NEXT: [[M21:%5:vr64bit]] = VL64 %0:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx6) +; CHECK-NEXT: [[M22:%6:vr64bit]] = VL64 %0:addr64bit, 40, $noreg :: (load (s64) from %ir.arrayidx8) +; CHECK-NEXT: [[M11:%7:vr64bit]] = VL64 %0:addr64bit, 48, $noreg :: (load (s64) from %ir.arrayidx10) +; CHECK-NEXT: [[M12:%8:vr64bit]] = VL64 %0:addr64bit, 56, $noreg :: (load (s64) from %ir.arrayidx12) +; CHECK-NEXT: [[DIV:%9:vr64bit]] = nofpexcept WFDDB %7:vr64bit, %8:vr64bit, implicit $fpc +; CHECK-NEXT: %10:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M41]], killed [[M42]], killed [[DIV]] +; CHECK-NEXT: %11:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M31]], killed [[M32]], killed %10:vr64bit +; CHECK-NEXT: %12:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed %11:vr64bit +; CHECK-NEXT: %13:vr64bit = {{.*}} WFMADB_CCPseudo [[M11]], [[M12]], killed %12:vr64bit +; CHECK-NEXT: $f0d = COPY %13:vr64bit +; CHECK-NEXT: Return implicit $f0d + +; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***: +; CHECK-NEXT: # Machine code for function fun0_fma4_divop: IsSSA, TracksLiveness +; CHECK: %14:vr64bit = {{.*}} WFMDB_CCPseudo killed [[M41]], killed [[M42]] +; CHECK-NEXT: %15:vr64bit = {{.*}} WFMDB_CCPseudo killed [[M31]], killed [[M32]] +; CHECK-NEXT: %16:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], %14:vr64bit +; CHECK-NEXT: %17:vr64bit = {{.*}} WFMADB_CCPseudo [[M11]], [[M12]], %15:vr64bit +; CHECK-NEXT: %18:vr64bit = {{.*}} WFADB_CCPseudo %16:vr64bit, %17:vr64bit +; CHECK-NEXT: %13:vr64bit = {{.*}} WFADB_CCPseudo killed [[DIV]], %18:vr64bit +entry: + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7 + + %0 = load double, ptr %x + %1 = load double, ptr %arrayidx1 + %2 = load double, ptr %arrayidx2 + %3 = load double, ptr %arrayidx4 + %4 = load double, ptr %arrayidx6 + %5 = load double, ptr %arrayidx8 + %6 = load double, ptr %arrayidx10 + %7 = load double, ptr %arrayidx12 + %div = fdiv double %6, %7 + + %mul1 = fmul reassoc nsz contract double %0, %1 + %mul2 = fmul reassoc nsz contract double %2, %3 + %mul3 = fmul reassoc nsz contract double %4, %5 + %mul4 = fmul reassoc nsz contract double %6, %7 + + %A1 = fadd reassoc nsz contract double %div, %mul1 + %A2 = fadd reassoc nsz contract double %A1, %mul2 + %A3 = fadd reassoc nsz contract double %A2, %mul3 + %A4 = fadd reassoc nsz contract double %A3, %mul4 + + ret double %A4 +} + +; The non-profitable case does not improve the depth of the new root, but it +; 
is equal now that 2 x 2 multiplies are used. Try allowing this in a
+; second run.
+define double @fun1_fma4(ptr %x, double %Arg) {
+; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun1_fma4: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d, $f0d
+; CHECK-NEXT: %1:fp64bit = COPY $f0d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx6)
+; CHECK-NEXT: %7:vr64bit = VL64 %0:addr64bit, 40, $noreg :: (load (s64) from %ir.arrayidx8)
+; CHECK-NEXT: %8:vr64bit = VL64 %0:addr64bit, 48, $noreg :: (load (s64) from %ir.arrayidx10)
+; CHECK-NEXT: %9:vr64bit = VL64 %0:addr64bit, 56, $noreg :: (load (s64) from %ir.arrayidx12)
+; CHECK-NEXT: %10:vr64bit = {{.*}} WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
+; CHECK-NEXT: %11:vr64bit = {{.*}} WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, killed %10:vr64bit
+; CHECK-NEXT: %12:vr64bit = {{.*}} WFMADB_CCPseudo killed %6:vr64bit, killed %7:vr64bit, killed %11:vr64bit
+; CHECK-NEXT: %13:vr64bit = {{.*}} WFMADB_CCPseudo killed %8:vr64bit, killed %9:vr64bit, killed %12:vr64bit
+; CHECK-NEXT: $f0d = COPY %13:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+
+; EQDEPTH: # Machine code for function fun1_fma4: IsSSA, TracksLiveness
+; EQDEPTH: WFMDB_CCPseudo
+; EQDEPTH-NEXT: WFMDB_CCPseudo
+; EQDEPTH-NEXT: WFMADB_CCPseudo
+; EQDEPTH-NEXT: WFMADB_CCPseudo
+; EQDEPTH-NEXT: WFADB_CCPseudo
+; EQDEPTH-NEXT: WFADB_CCPseudo
+entry:
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+  %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+  %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+
+  %0 = load double, ptr %x
+  %1 = load double, ptr %arrayidx1
+  %2 = load double, ptr %arrayidx2
+  %3 = load double, ptr %arrayidx4
+  %4 = load double, ptr %arrayidx6
+  %5 = load double, ptr %arrayidx8
+  %6 = load double, ptr %arrayidx10
+  %7 = load double, ptr %arrayidx12
+
+  %mul1 = fmul reassoc nsz contract double %0, %1
+  %mul2 = fmul reassoc nsz contract double %2, %3
+  %mul3 = fmul reassoc nsz contract double %4, %5
+  %mul4 = fmul reassoc nsz contract double %6, %7
+
+  %A1 = fadd reassoc nsz contract double %Arg, %mul1
+  %A2 = fadd reassoc nsz contract double %A1, %mul2
+  %A3 = fadd reassoc nsz contract double %A2, %mul3
+  %A4 = fadd reassoc nsz contract double %A3, %mul4
+
+  ret double %A4
+}
Index: llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-07.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-07.ll
@@ -0,0 +1,120 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \
+; RUN:   -print-before=machine-combiner -print-after=machine-combiner -fma4-ch \
+; RUN:   2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+define double @fun0_fma4(ptr %x, double
%Arg) { +; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***: +; CHECK-NEXT: # Machine code for function fun0_fma4: IsSSA, TracksLiveness +; CHECK: bb.0.entry: +; CHECK-NEXT: liveins: $r2d, $f0d +; CHECK-NEXT: %1:fp64bit = COPY $f0d +; CHECK-NEXT: %0:addr64bit = COPY $r2d +; CHECK-NEXT: [[M41:%2:vr64bit]] = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x) +; CHECK-NEXT: [[M42:%3:vr64bit]] = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1) +; CHECK-NEXT: [[M31:%4:vr64bit]] = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2) +; CHECK-NEXT: [[M32:%5:vr64bit]] = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4) +; CHECK-NEXT: [[M21:%6:vr64bit]] = VL64 %0:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx6) +; CHECK-NEXT: [[M22:%7:vr64bit]] = VL64 %0:addr64bit, 40, $noreg :: (load (s64) from %ir.arrayidx8) +; CHECK-NEXT: [[M11:%8:vr64bit]] = VL64 %0:addr64bit, 48, $noreg :: (load (s64) from %ir.arrayidx10) +; CHECK-NEXT: [[M12:%9:vr64bit]] = VL64 %0:addr64bit, 56, $noreg :: (load (s64) from %ir.arrayidx12) +; CHECK-NEXT: %10:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M41]], killed [[M42]], %1:fp64bit +; CHECK-NEXT: %11:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M31]], killed [[M32]], killed %10:vr64bit +; CHECK-NEXT: %12:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed %11:vr64bit +; CHECK-NEXT: %13:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M11]], killed [[M12]], killed %12:vr64bit +; CHECK-NEXT: $f0d = COPY %13:vr64bit +; CHECK-NEXT: Return implicit $f0d + +; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***: +; CHECK-NEXT: # Machine code for function fun0_fma4: IsSSA, TracksLiveness +; CHECK: %14:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M41]], killed [[M42]], %1:fp64bit +; CHECK-NEXT: %15:vr64bit = {{.*}} WFMDB_CCPseudo killed [[M31]], killed [[M32]] +; CHECK-NEXT: %16:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], %14:vr64bit +; CHECK-NEXT: %17:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M11]], killed [[M12]], %15:vr64bit +; CHECK-NEXT: %13:vr64bit = {{.*}} WFADB_CCPseudo %16:vr64bit, %17:vr64bit +entry: + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7 + + %0 = load double, ptr %x + %1 = load double, ptr %arrayidx1 + %2 = load double, ptr %arrayidx2 + %3 = load double, ptr %arrayidx4 + %4 = load double, ptr %arrayidx6 + %5 = load double, ptr %arrayidx8 + %6 = load double, ptr %arrayidx10 + %7 = load double, ptr %arrayidx12 + + %mul1 = fmul reassoc nsz contract double %0, %1 + %mul2 = fmul reassoc nsz contract double %2, %3 + %mul3 = fmul reassoc nsz contract double %4, %5 + %mul4 = fmul reassoc nsz contract double %6, %7 + + %A1 = fadd reassoc nsz contract double %Arg, %mul1 + %A2 = fadd reassoc nsz contract double %A1, %mul2 + %A3 = fadd reassoc nsz contract double %A2, %mul3 + %A4 = fadd reassoc nsz contract double %A3, %mul4 + + ret double %A4 +} + +; Same as above, but the last FMA has a long latency operand, which would +; make the transformation create a deeper new root. 
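+; Moving the fdiv-dependent product into one of the two parallel halves
+; would lengthen the new root's critical path instead of shortening it, so
+; the combiner is expected to leave the serial chain alone (the 'After'
+; dump checked below matches the input).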
+define double @fun1_fma4_divop(ptr %x, double %Arg) { +; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***: +; CHECK-NEXT: # Machine code for function fun1_fma4_divop: IsSSA, TracksLiveness +; CHECK: bb.0.entry: +; CHECK-NEXT: liveins: $r2d, $f0d +; CHECK-NEXT: %1:fp64bit = COPY $f0d +; CHECK-NEXT: %0:addr64bit = COPY $r2d +; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x) +; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1) +; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2) +; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4) +; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx6) +; CHECK-NEXT: %7:vr64bit = VL64 %0:addr64bit, 40, $noreg :: (load (s64) from %ir.arrayidx8) +; CHECK-NEXT: %8:fp64bit = VL64 %0:addr64bit, 48, $noreg :: (load (s64) from %ir.arrayidx10) +; CHECK-NEXT: %9:fp64bit = nofpexcept DDB %8:fp64bit(tied-def 0), %0:addr64bit, 56, $noreg, implicit $fpc +; CHECK-NEXT: %10:vr64bit = {{.*}} WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit +; CHECK-NEXT: %11:vr64bit = {{.*}} WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, killed %10:vr64bit +; CHECK-NEXT: %12:vr64bit = {{.*}} WFMADB_CCPseudo killed %6:vr64bit, killed %7:vr64bit, killed %11:vr64bit +; CHECK-NEXT: %13:vr64bit = {{.*}} WFMADB_CCPseudo %8:fp64bit, killed %9:fp64bit, killed %12:vr64bit +; CHECK-NEXT: $f0d = COPY %13:vr64bit +; CHECK-NEXT: Return implicit $f0d +entry: + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7 + + %0 = load double, ptr %x + %1 = load double, ptr %arrayidx1 + %2 = load double, ptr %arrayidx2 + %3 = load double, ptr %arrayidx4 + %4 = load double, ptr %arrayidx6 + %5 = load double, ptr %arrayidx8 + %6 = load double, ptr %arrayidx10 + %7 = load double, ptr %arrayidx12 + %div = fdiv double %6, %7 + + %mul1 = fmul reassoc nsz contract double %0, %1 + %mul2 = fmul reassoc nsz contract double %2, %3 + %mul3 = fmul reassoc nsz contract double %4, %5 + %mul4 = fmul reassoc nsz contract double %6, %div + + %A1 = fadd reassoc nsz contract double %Arg, %mul1 + %A2 = fadd reassoc nsz contract double %A1, %mul2 + %A3 = fadd reassoc nsz contract double %A2, %mul3 + %A4 = fadd reassoc nsz contract double %A3, %mul4 + + ret double %A4 +} Index: llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-08.ll @@ -0,0 +1,115 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \ +; RUN: -print-before=machine-combiner -print-after=machine-combiner -fma1add \ +; RUN: 2>&1 | FileCheck %s + +; REQUIRES: asserts + +; No improvement possible. 
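+; The -fma1add pattern rewrites fma(a, b, add(t1, t2)) into
+; add(fma(a, b, t1), t2), with the FMA taking whichever term of the add is
+; ready first and the late term ending up on the final add. Here both terms
+; are plain loads, so the original order is already as good as it gets.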
+define double @fun0_fma1add(ptr %x) { +; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***: +; CHECK-NEXT: # Machine code for function fun0_fma1add: IsSSA, TracksLiveness +; CHECK: bb.0.entry: +; CHECK-NEXT: liveins: $r2d +; CHECK-NEXT: %0:addr64bit = COPY $r2d +; CHECK-NEXT: %1:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x) +; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1) +; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2) +; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4) +; CHECK-NEXT: %5:vr64bit = {{.*}} WFADB_CCPseudo killed %3:vr64bit, killed %4:vr64bit +; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed %1:vr64bit, killed %2:vr64bit, killed %5:vr64bit +; CHECK-NEXT: $f0d = COPY %6:vr64bit +; CHECK-NEXT: Return implicit $f0d +entry: + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + + %0 = load double, ptr %x + %1 = load double, ptr %arrayidx1 + %2 = load double, ptr %arrayidx2 + %3 = load double, ptr %arrayidx4 + + %mul = fmul reassoc nsz contract double %0, %1 + + %A1 = fadd reassoc nsz contract double %2, %3 + %A2 = fadd reassoc nsz contract double %A1, %mul + + ret double %A2 +} + +; The RHS of the Add is stalling, so move up the FMA to the LHS. +define double @fun1_fma1add_divop(ptr %x) { +; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***: +; CHECK-NEXT: # Machine code for function fun1_fma1add_divop: IsSSA, TracksLiveness +; CHECK: bb.0.entry: +; CHECK-NEXT: liveins: $r2d +; CHECK-NEXT: %0:addr64bit = COPY $r2d +; CHECK-NEXT: [[M21:%1:vr64bit]] = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x) +; CHECK-NEXT: [[M22:%2:vr64bit]] = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1) +; CHECK-NEXT: [[T1:%3:fp64bit]] = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2) +; CHECK-NEXT: [[DIV:%4:fp64bit]] = nofpexcept DDB %3:fp64bit(tied-def 0), %0:addr64bit, 24, $noreg, implicit $fpc +; CHECK-NEXT: %5:vr64bit = {{.*}} WFADB_CCPseudo [[T1]], killed [[DIV]] +; CHECK-NEXT: %6:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed %5:vr64bit +; CHECK-NEXT: $f0d = COPY %6:vr64bit +; CHECK-NEXT: Return implicit $f0d + +; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***: +; CHECK-NEXT: # Machine code for function fun1_fma1add_divop: IsSSA, TracksLiveness +; CHECK: %7:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], [[T1]] +; CHECK-NEXT: %6:vr64bit = {{.*}} WFADB_CCPseudo %7:vr64bit, killed [[DIV]] +entry: + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + + %0 = load double, ptr %x + %1 = load double, ptr %arrayidx1 + %2 = load double, ptr %arrayidx2 + %3 = load double, ptr %arrayidx4 + %div = fdiv double %2, %3 + + %mul = fmul reassoc nsz contract double %0, %1 + + %A1 = fadd reassoc nsz contract double %2, %div + %A2 = fadd reassoc nsz contract double %A1, %mul + + ret double %A2 +} + +; The LHS of the Add is stalling, so move up the FMA to the RHS. 
+define double @fun2_fma1add_divop(ptr %x) { +; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***: +; CHECK-NEXT: # Machine code for function fun2_fma1add_divop: IsSSA, TracksLiveness +; CHECK: bb.0.entry: +; CHECK-NEXT: liveins: $r2d +; CHECK-NEXT: %0:addr64bit = COPY $r2d +; CHECK-NEXT: [[M21:%1:vr64bit]] = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x) +; CHECK-NEXT: [[M22:%2:vr64bit]] = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1) +; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2) +; CHECK-NEXT: [[T2:%4:vr64bit]] = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4) +; CHECK-NEXT: [[DIV:%5:vr64bit]] = nofpexcept WFDDB killed %3:vr64bit, %4:vr64bit, implicit $fpc +; CHECK-NEXT: %6:vr64bit = {{.*}} WFADB_CCPseudo killed [[DIV]], [[T2]] +; CHECK-NEXT: %7:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], killed %6:vr64bit + +; CHECK: # *** IR Dump After Machine InstCombiner (machine-combiner) ***: +; CHECK-NEXT: # Machine code for function fun2_fma1add_divop: IsSSA, TracksLiveness +; CHECK: %9:vr64bit = {{.*}} WFMADB_CCPseudo killed [[M21]], killed [[M22]], [[T2]] +; CHECK: %7:vr64bit = {{.*}} WFADB_CCPseudo %9:vr64bit, killed [[DIV]] +entry: + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + + %0 = load double, ptr %x + %1 = load double, ptr %arrayidx1 + %2 = load double, ptr %arrayidx2 + %3 = load double, ptr %arrayidx4 + %div = fdiv double %2, %3 + + %mul = fmul reassoc nsz contract double %0, %1 + + %A1 = fadd reassoc nsz contract double %div, %3 + %A2 = fadd reassoc nsz contract double %A1, %mul + + ret double %A2 +} Index: llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp-09.ll @@ -0,0 +1,186 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs -O3 \ +; RUN: -print-before=machine-combiner -print-after=machine-combiner \ +; RUN: 2>&1 | FileCheck %s + +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -O3 \ +; RUN: -print-before=machine-combiner -print-after=machine-combiner \ +; RUN: 2>&1 -fma2 | FileCheck %s --check-prefix=FMA2 + +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -O3 \ +; RUN: -print-before=machine-combiner -print-after=machine-combiner \ +; RUN: 2>&1 -fma3 | FileCheck %s --check-prefix=FMA3 + +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -O3 \ +; RUN: -print-before=machine-combiner -print-after=machine-combiner \ +; RUN: 2>&1 -fma3-ch | FileCheck %s --check-prefix=FMA3-CH + +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -O3 \ +; RUN: -print-before=machine-combiner -print-after=machine-combiner \ +; RUN: 2>&1 -fma4 -fma4-eqdepth | FileCheck %s --check-prefix=FMA4-EQD + +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -O3 \ +; RUN: -print-before=machine-combiner -print-after=machine-combiner \ +; RUN: 2>&1 -fma4-ch | FileCheck %s --check-prefix=FMA4-CH + +; REQUIRES: asserts + +; Test transformation of a sequence of 8 FMAs, with different patterns. 
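+; Each flag tiles the chain differently: -fma2 peels off two FMAs at a time
+; into a mul + fma pair that is summed back into the chain, -fma3 three at
+; a time, while the -ch variants keep the incoming accumulator inside one
+; of the parallel sub-chains rather than on the root add. The opcode
+; sequences checked below reflect those tilings.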
+define double @fun_fma8(ptr %x, double %A) {
+; CHECK: # *** IR Dump Before Machine InstCombiner (machine-combiner) ***:
+; CHECK-NEXT: # Machine code for function fun_fma8: IsSSA, TracksLiveness
+; CHECK: bb.0.entry:
+; CHECK-NEXT: liveins: $r2d, $f0d
+; CHECK-NEXT: %1:fp64bit = COPY $f0d
+; CHECK-NEXT: %0:addr64bit = COPY $r2d
+; CHECK-NEXT: %2:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x)
+; CHECK-NEXT: %3:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1)
+; CHECK-NEXT: %4:vr64bit = VL64 %0:addr64bit, 16, $noreg :: (load (s64) from %ir.arrayidx2)
+; CHECK-NEXT: %5:vr64bit = VL64 %0:addr64bit, 24, $noreg :: (load (s64) from %ir.arrayidx4)
+; CHECK-NEXT: %6:vr64bit = VL64 %0:addr64bit, 32, $noreg :: (load (s64) from %ir.arrayidx6)
+; CHECK-NEXT: %7:vr64bit = VL64 %0:addr64bit, 40, $noreg :: (load (s64) from %ir.arrayidx8)
+; CHECK-NEXT: %8:vr64bit = VL64 %0:addr64bit, 48, $noreg :: (load (s64) from %ir.arrayidx10)
+; CHECK-NEXT: %9:vr64bit = VL64 %0:addr64bit, 56, $noreg :: (load (s64) from %ir.arrayidx12)
+; CHECK-NEXT: %10:vr64bit = VL64 %0:addr64bit, 64, $noreg :: (load (s64) from %ir.arrayidx14)
+; CHECK-NEXT: %11:vr64bit = VL64 %0:addr64bit, 72, $noreg :: (load (s64) from %ir.arrayidx16)
+; CHECK-NEXT: %12:vr64bit = VL64 %0:addr64bit, 80, $noreg :: (load (s64) from %ir.arrayidx18)
+; CHECK-NEXT: %13:vr64bit = VL64 %0:addr64bit, 88, $noreg :: (load (s64) from %ir.arrayidx20)
+; CHECK-NEXT: %14:vr64bit = VL64 %0:addr64bit, 96, $noreg :: (load (s64) from %ir.arrayidx22)
+; CHECK-NEXT: %15:vr64bit = VL64 %0:addr64bit, 104, $noreg :: (load (s64) from %ir.arrayidx24)
+; CHECK-NEXT: %16:vr64bit = VL64 %0:addr64bit, 112, $noreg :: (load (s64) from %ir.arrayidx26)
+; CHECK-NEXT: %17:vr64bit = VL64 %0:addr64bit, 120, $noreg :: (load (s64) from %ir.arrayidx28)
+; CHECK-NEXT: %18:vr64bit = {{.*}} WFMADB_CCPseudo killed %2:vr64bit, killed %3:vr64bit, %1:fp64bit
+; CHECK-NEXT: %19:vr64bit = {{.*}} WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, killed %18:vr64bit
+; CHECK-NEXT: %20:vr64bit = {{.*}} WFMADB_CCPseudo killed %6:vr64bit, killed %7:vr64bit, killed %19:vr64bit
+; CHECK-NEXT: %21:vr64bit = {{.*}} WFMADB_CCPseudo killed %8:vr64bit, killed %9:vr64bit, killed %20:vr64bit
+; CHECK-NEXT: %22:vr64bit = {{.*}} WFMADB_CCPseudo killed %10:vr64bit, killed %11:vr64bit, killed %21:vr64bit
+; CHECK-NEXT: %23:vr64bit = {{.*}} WFMADB_CCPseudo killed %12:vr64bit, killed %13:vr64bit, killed %22:vr64bit
+; CHECK-NEXT: %24:vr64bit = {{.*}} WFMADB_CCPseudo killed %14:vr64bit, killed %15:vr64bit, killed %23:vr64bit
+; CHECK-NEXT: %25:vr64bit = {{.*}} WFMADB_CCPseudo killed %16:vr64bit, killed %17:vr64bit, killed %24:vr64bit
+; CHECK-NEXT: $f0d = COPY %25:vr64bit
+; CHECK-NEXT: Return implicit $f0d
+
+; FMA2: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; FMA2-NEXT: # Machine code for function fun_fma8: IsSSA, TracksLiveness
+; FMA2: WFMADB_CCPseudo
+; FMA2-NEXT: WFMADB_CCPseudo
+; FMA2-NEXT: WFMDB_CCPseudo
+; FMA2-NEXT: WFMADB_CCPseudo
+; FMA2-NEXT: WFADB_CCPseudo
+; FMA2-NEXT: WFMDB_CCPseudo
+; FMA2-NEXT: WFMADB_CCPseudo
+; FMA2-NEXT: WFADB_CCPseudo
+; FMA2-NEXT: WFMDB_CCPseudo
+; FMA2-NEXT: WFMADB_CCPseudo
+; FMA2-NEXT: WFADB_CCPseudo
+
+; FMA3: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; FMA3-NEXT: # Machine code for function fun_fma8: IsSSA, TracksLiveness
+; FMA3: WFMADB_CCPseudo
+; FMA3-NEXT: WFMADB_CCPseudo
+; FMA3-NEXT: WFMDB_CCPseudo
+; FMA3-NEXT: WFMDB_CCPseudo
+; FMA3-NEXT: WFMADB_CCPseudo
+; FMA3-NEXT: WFADB_CCPseudo
+; FMA3-NEXT: WFADB_CCPseudo
+; FMA3-NEXT: WFMDB_CCPseudo
+; FMA3-NEXT: WFMDB_CCPseudo
+; FMA3-NEXT: WFMADB_CCPseudo
+; FMA3-NEXT: WFADB_CCPseudo
+; FMA3-NEXT: WFADB_CCPseudo
+
+; FMA3-CH: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; FMA3-CH-NEXT: # Machine code for function fun_fma8: IsSSA, TracksLiveness
+; FMA3-CH: WFMDB_CCPseudo
+; FMA3-CH-NEXT: WFMADB_CCPseudo
+; FMA3-CH-NEXT: WFMADB_CCPseudo
+; FMA3-CH-NEXT: WFADB_CCPseudo
+; FMA3-CH-NEXT: WFMDB_CCPseudo
+; FMA3-CH-NEXT: WFMADB_CCPseudo
+; FMA3-CH-NEXT: WFMADB_CCPseudo
+; FMA3-CH-NEXT: WFADB_CCPseudo
+; FMA3-CH-NEXT: WFMADB_CCPseudo
+; FMA3-CH-NEXT: WFMADB_CCPseudo
+
+; FMA4-EQD: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; FMA4-EQD-NEXT: # Machine code for function fun_fma8: IsSSA, TracksLiveness
+; FMA4-EQD: WFMDB_CCPseudo
+; FMA4-EQD-NEXT: WFMDB_CCPseudo
+; FMA4-EQD-NEXT: WFMADB_CCPseudo
+; FMA4-EQD-NEXT: WFMADB_CCPseudo
+; FMA4-EQD-NEXT: WFADB_CCPseudo
+; FMA4-EQD-NEXT: WFADB_CCPseudo
+; FMA4-EQD-NEXT: WFMDB_CCPseudo
+; FMA4-EQD-NEXT: WFMDB_CCPseudo
+; FMA4-EQD-NEXT: WFMADB_CCPseudo
+; FMA4-EQD-NEXT: WFMADB_CCPseudo
+; FMA4-EQD-NEXT: WFADB_CCPseudo
+; FMA4-EQD-NEXT: WFADB_CCPseudo
+
+; FMA4-CH: # *** IR Dump After Machine InstCombiner (machine-combiner) ***:
+; FMA4-CH-NEXT: # Machine code for function fun_fma8: IsSSA, TracksLiveness
+; FMA4-CH: WFMADB_CCPseudo
+; FMA4-CH-NEXT: WFMDB_CCPseudo
+; FMA4-CH-NEXT: WFMADB_CCPseudo
+; FMA4-CH-NEXT: WFMADB_CCPseudo
+; FMA4-CH-NEXT: WFADB_CCPseudo
+; FMA4-CH-NEXT: WFMADB_CCPseudo
+; FMA4-CH-NEXT: WFMDB_CCPseudo
+; FMA4-CH-NEXT: WFMADB_CCPseudo
+; FMA4-CH-NEXT: WFMADB_CCPseudo
+; FMA4-CH-NEXT: WFADB_CCPseudo
+entry:
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2
+  %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3
+  %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4
+  %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5
+  %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6
+  %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7
+  %arrayidx14 = getelementptr inbounds double, ptr %x, i64 8
+  %arrayidx16 = getelementptr inbounds double, ptr %x, i64 9
+  %arrayidx18 = getelementptr inbounds double, ptr %x, i64 10
+  %arrayidx20 = getelementptr inbounds double, ptr %x, i64 11
+  %arrayidx22 = getelementptr inbounds double, ptr %x, i64 12
+  %arrayidx24 = getelementptr inbounds double, ptr %x, i64 13
+  %arrayidx26 = getelementptr inbounds double, ptr %x, i64 14
+  %arrayidx28 = getelementptr inbounds double, ptr %x, i64 15
+
+  %0 = load double, ptr %x
+  %1 = load double, ptr %arrayidx1
+  %2 = load double, ptr %arrayidx2
+  %3 = load double, ptr %arrayidx4
+  %4 = load double, ptr %arrayidx6
+  %5 = load double, ptr %arrayidx8
+  %6 = load double, ptr %arrayidx10
+  %7 = load double, ptr %arrayidx12
+  %8 = load double, ptr %arrayidx14
+  %9 = load double, ptr %arrayidx16
+  %10 = load double, ptr %arrayidx18
+  %11 = load double, ptr %arrayidx20
+  %12 = load double, ptr %arrayidx22
+  %13 = load double, ptr %arrayidx24
+  %14 = load double, ptr %arrayidx26
+  %15 = load double, ptr %arrayidx28
+
+  %mul1 = fmul reassoc nsz contract double %0, %1
+  %mul2 = fmul reassoc nsz contract double %2, %3
+  %mul3 = fmul reassoc nsz contract double %4, %5
+  %mul4 = fmul reassoc nsz contract double %6, %7
+  %mul5 = fmul reassoc nsz contract double %8, %9
+  %mul6 = fmul reassoc nsz contract double %10, %11
+  %mul7 = fmul reassoc nsz contract double %12, %13
+  %mul8 = fmul reassoc nsz contract
double %14, %15 + + %A1 = fadd reassoc nsz contract double %A, %mul1 + %A2 = fadd reassoc nsz contract double %A1, %mul2 + %A3 = fadd reassoc nsz contract double %A2, %mul3 + %A4 = fadd reassoc nsz contract double %A3, %mul4 + %A5 = fadd reassoc nsz contract double %A4, %mul5 + %A6 = fadd reassoc nsz contract double %A5, %mul6 + %A7 = fadd reassoc nsz contract double %A6, %mul7 + %A8 = fadd reassoc nsz contract double %A7, %mul8 + + ret double %A8 +} + Index: llvm/test/CodeGen/SystemZ/reassoc-additions.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/SystemZ/reassoc-additions.ll @@ -0,0 +1,468 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -reassoc-additions -O3 \ +; RUN: -verify-machineinstrs | FileCheck %s + +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -reassoc-additions -O3 \ +; RUN: -debug-only=systemz-reassoc-additions \ +; RUN: -print-after=systemz-reassoc-additions 2>&1 \ +; RUN: | FileCheck %s --check-prefix=DEBUG + +; REQUIRES: asserts + +; No effective improvement available: should not be changed. +define double @fun0(ptr %x, double %Arg) { +; DEBUG: SystemZ Reassociate Additions Pass: fun0 +; DEBUG: Found chain: +; DEBUG-NEXT: On cycle 1 ADD T:4 T:4 stall %4:vr64bit = {{.*}} WFADB_CCPseudo +; DEBUG-NEXT: On cycle 10 ADD chained T:1 %5:vr64bit = {{.*}} WFADB_CCPseudo +; DEBUG-NEXT: Ends on 16 +; DEBUG: Transforming: +; DEBUG-NEXT: On cycle 4 %4:vr64bit = {{.*}} WFADB_CCPseudo +; DEBUG-NEXT: On cycle 10 %5:vr64bit = {{.*}} WFADB_CCPseudo +; DEBUG-NEXT: Ends on 16 +; DEBUG-NEXT: (unchanged) + +; CHECK-LABEL: fun0: +; CHECK: ld %f1, 0(%r2) +; CHECK-NEXT: adb %f1, 8(%r2) +; CHECK-NEXT: wfadb %f0, %f1, %f0 +; CHECK-NEXT: br %r14 +entry: + %0 = load double, ptr %x + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1 + %add1 = fadd reassoc nsz contract double %1, %0 + %add2 = fadd reassoc nsz contract double %add1, %Arg + ret double %add2 +} + +; Add the two registers first, and then the load. +define double @fun1(ptr %x, double %Arg0, double %Arg1) { +; DEBUG: SystemZ Reassociate Additions Pass: fun1 +; DEBUG: Found chain: +; DEBUG-NEXT: On cycle 1 ADD T:4 T:1 stall {{.*}} WFADB_CCPseudo killed %3:vr64bit, %1:fp64bit +; DEBUG-NEXT: On cycle 10 ADD chained T:1 {{.*}} WFADB_CCPseudo killed %4:vr64bit, %2:fp64bit +; DEBUG-NEXT: Ends on 16 +; DEBUG: Transforming: +; DEBUG-NEXT: On cycle 1 %6:vr64bit = {{.*}} WFADB_CCPseudo %1:fp64bit, %2:fp64bit +; DEBUG-NEXT: On cycle 7 %7:vr64bit = {{.*}} WFADB_CCPseudo %6:vr64bit, %3:vr64bit +; DEBUG-NEXT: Ends on 13 + +; CHECK-LABEL: fun1: +; CHECK: adbr %f0, %f2 +; CHECK-NEXT: adb %f0, 0(%r2) +entry: + %0 = load double, ptr %x + %add1 = fadd reassoc nsz contract double %0, %Arg0 + %add2 = fadd reassoc nsz contract double %add1, %Arg1 + ret double %add2 +} + +; One of the terms and both factors are registers: use an FMA first and then add the load. 
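+; I.e. (L0 + Arg0) + Arg1 * Arg2 is rewritten as fma(Arg1, Arg2, Arg0) + L0:
+; the register-only FMA can issue immediately, and the loaded term is added
+; once it arrives (compare the cycle numbers in the DEBUG lines below).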
+define double @fun2(ptr %x, double %Arg0, double %Arg1, double %Arg2) { +; DEBUG: SystemZ Reassociate Additions Pass: fun2 +; DEBUG: Found chain: +; DEBUG-NEXT: On cycle 1 ADD T:4 T:1 stall %5:vr64bit = {{.*}} WFADB_CCPseudo killed %4:vr64bit, %1:fp64bit +; DEBUG-NEXT: On cycle 10 FMA P:1 chained %6:vr64bit = {{.*}} WFMADB_CCPseudo %2:fp64bit, %3:fp64bit, killed %5:vr64bit +; DEBUG-NEXT: Ends on 16 +; DEBUG: Transforming: +; DEBUG-NEXT: On cycle 1 %7:vr64bit = {{.*}} WFMADB_CCPseudo %2:fp64bit, %3:fp64bit, %1:fp64bit +; DEBUG-NEXT: On cycle 7 %8:vr64bit = {{.*}} WFADB_CCPseudo %7:vr64bit, %4:vr64bit +; DEBUG-NEXT: Ends on 13 + +; CHECK-LABEL: fun2: +; CHECK: madbr %f0, %f2, %f4 +; CHECK-NEXT: adb %f0, 0(%r2) +entry: + %0 = load double, ptr %x + %add1 = fadd reassoc nsz contract double %0, %Arg0 + %prod = fmul reassoc nsz contract double %Arg1, %Arg2 + %add2 = fadd reassoc nsz contract double %add1, %prod + ret double %add2 +} + +; An FMA and two Adds: the two register terms should be added first. +define double @fun3(ptr %x, double %Arg0, double %Arg1, double %Arg2) { +; DEBUG: SystemZ Reassociate Additions Pass: fun3 +; DEBUG: Found chain: +; DEBUG-NEXT: On cycle 1 ADD T:4 T:1 stall %6:vr64bit = {{.*}} WFADB_CCPseudo killed %4:vr64bit, %1:fp64bit +; DEBUG-NEXT: On cycle 10 FMA P:4 chained %7:vr64bit = {{.*}} WFMADB_CCPseudo killed %5:vr64bit, %2:fp64bit, killed %6:vr64bit +; DEBUG-NEXT: On cycle 16 ADD chained T:1 %8:vr64bit = {{.*}} WFADB_CCPseudo killed %7:vr64bit, %3:fp64bit +; DEBUG-NEXT: Ends on 22 +; DEBUG: Transforming: +; DEBUG-NEXT: On cycle 1 %9:vr64bit = {{.*}} WFADB_CCPseudo %1:fp64bit, %3:fp64bit +; DEBUG-NEXT: On cycle 7 %10:vr64bit = {{.*}} WFADB_CCPseudo %9:vr64bit, %4:vr64bit +; DEBUG-NEXT: On cycle 13 %11:vr64bit = {{.*}} WFMADB_CCPseudo %5:vr64bit, %2:fp64bit, %10:vr64bit +; DEBUG-NEXT: Ends on 19 + +; CHECK-LABEL: fun3: +; CHECK: adbr %f0, %f4 +; CHECK-NEXT: adb %f0, 0(%r2) +; CHECK-NEXT: madb %f0, %f2, 8(%r2) +entry: + %L0 = load double, ptr %x + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %L1 = load double, ptr %arrayidx1 + %add1 = fadd reassoc nsz contract double %L0, %Arg0 + %prod = fmul reassoc nsz contract double %L1, %Arg1 + %add2 = fadd reassoc nsz contract double %prod, %add1 + %add3 = fadd reassoc nsz contract double %add2, %Arg2 + ret double %add3 +} + +; Two FMAs where one of the products is available earlier: put it first. 
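+; The P:/T: numbers in the debug output give the cycle on which a product
+; or term becomes available; the chain is re-emitted with the earliest
+; product at the top, so the FMA whose product depends on the loads no
+; longer delays the start of the chain.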
+define double @fun4(ptr %x, double %Arg0, double %Arg1, double %Arg2) { +; DEBUG: SystemZ Reassociate Additions Pass: fun4 +; DEBUG: Found chain: +; DEBUG-NEXT: On cycle 1 FMA P:4 T:1 stall %6:vr64bit = {{.*}} WFMADB_CCPseudo killed %4:vr64bit, killed %5:vr64bit, %3:fp64bit +; DEBUG-NEXT: On cycle 10 FMA P:1 chained %7:vr64bit = {{.*}} WFMADB_CCPseudo %1:fp64bit, %2:fp64bit, killed %6:vr64bit +; DEBUG: Ends on 16 +; DEBUG: Transforming: +; DEBUG-NEXT: On cycle 1 %8:vr64bit = {{.*}} WFMADB_CCPseudo %1:fp64bit, %2:fp64bit, %3:fp64bit +; DEBUG-NEXT: On cycle 7 %9:vr64bit = {{.*}} WFMADB_CCPseudo %4:vr64bit, %5:vr64bit, %8:vr64bit +; DEBUG-NEXT: Ends on 13 + +; CHECK-LABEL: fun4: +; CHECK: wfmadb %f0, %f0, %f2, %f4 +; CHECK-NEXT: ld %f1, 8(%r2) +; CHECK-NEXT: madb %f0, %f1, 0(%r2) +entry: + %L0 = load double, ptr %x + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %L1 = load double, ptr %arrayidx1 + %prod1 = fmul reassoc nsz contract double %L0, %L1 + %add1 = fadd reassoc nsz contract double %prod1, %Arg2 + %prod2 = fmul reassoc nsz contract double %Arg0, %Arg1 + %add2 = fadd reassoc nsz contract double %prod2, %add1 + ret double %add2 +} + +; Make a new FMA on top with the best term and product. +define double @fun5(ptr %x, double %Arg0, double %Arg1, double %Arg2, double %Arg3) { +; DEBUG: SystemZ Reassociate Additions Pass: fun5 +; DEBUG: Found chain: +; DEBUG-NEXT: On cycle 1 FMA P:4 T:4 stall %7:vr64bit = {{.*}} WFMADB_CCPseudo %5:vr64bit, killed %6:vr64bit, %5:vr64bit +; DEBUG-NEXT: On cycle 10 FMA P:1 chained %8:vr64bit = {{.*}} WFMADB_CCPseudo %1:fp64bit, %2:fp64bit, killed %7:vr64bit +; DEBUG-NEXT: On cycle 16 ADD chained T:1 %9:vr64bit = {{.*}} WFADB_CCPseudo killed %8:vr64bit, %4:fp64bit +; DEBUG-NEXT: Ends on 22 +; DEBUG: Transforming: +; DEBUG-NEXT: On cycle 1 %10:vr64bit = {{.*}} WFMADB_CCPseudo %1:fp64bit, %2:fp64bit, %4:fp64bit +; DEBUG-NEXT: On cycle 7 %11:vr64bit = {{.*}} WFADB_CCPseudo %10:vr64bit, %5:vr64bit +; DEBUG-NEXT: On cycle 13 %12:vr64bit = {{.*}} WFMADB_CCPseudo %5:vr64bit, %6:vr64bit, %11:vr64bit +; DEBUG-NEXT: Ends on 19 + +; CHECK-LABEL: fun5: +; CHECK: wfmadb %f0, %f0, %f2, %f6 +; CHECK-NEXT: ld %f1, 0(%r2) +; CHECK-NEXT: adbr %f0, %f1 +; CHECK-NEXT: madb %f0, %f1, 8(%r2) +entry: + %L0 = load double, ptr %x + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %L1 = load double, ptr %arrayidx1 + %prod1 = fmul reassoc nsz contract double %L0, %L1 + %add1 = fadd reassoc nsz contract double %prod1, %L0 + %prod2 = fmul reassoc nsz contract double %Arg0, %Arg1 + %add2 = fadd reassoc nsz contract double %prod2, %add1 + %add3 = fadd reassoc nsz contract double %add2, %Arg3 + ret double %add3 +} + +; Don't make chains over basic block boundaries. +define double @fun6(ptr %x, double %Arg0, double %Arg1) { +; DEBUG: SystemZ Reassociate Additions Pass: fun6 +; DEBUG-NOT: Found chain: +entry: + %L0 = load double, ptr %x + %add1 = fadd reassoc nsz double %L0, %Arg1 + %cond = fcmp oeq double %Arg0, zeroinitializer + br i1 %cond, label %if.then, label %return + +if.then: + br label %return + +return: + %term = phi double [ %Arg0, %if.then ], [ %Arg1, %entry ] + %add2 = fadd reassoc nsz double %add1, %term + ret double %add2 +} + +; Stop at a non-FMA/Add opcode. 
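+; The chain search ends at the fsub, so only the two adds above it form a
+; chain; since no better order exists for those two, the code is left as is.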
+define double @fun7(ptr %x, double %Arg0, double %Arg1, double %Arg2, double %Arg3) {
+; DEBUG: SystemZ Reassociate Additions Pass: fun7
+; DEBUG: Found chain:
+; DEBUG-NEXT: On cycle 1 ADD T:4 T:4 stall %7:vr64bit = {{.*}} WFADB_CCPseudo killed %5:vr64bit, killed %6:vr64bit
+; DEBUG-NEXT: On cycle 10 ADD chained T:1 %8:vr64bit = {{.*}} WFADB_CCPseudo killed %7:vr64bit, %1:fp64bit
+; DEBUG-NEXT: Ends on 16
+; DEBUG: (unchanged)
+entry:
+  %L0 = load double, ptr %x
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %L1 = load double, ptr %arrayidx1
+  %add1 = fadd reassoc nsz contract double %L0, %L1
+  %add2 = fadd reassoc nsz contract double %add1, %Arg0
+  %sub = fsub reassoc nsz contract double %add2, %Arg3
+  ret double %sub
+}
+
+; Stop if the reassoc attribute is missing.
+define double @fun8(ptr %x, double %Arg0, double %Arg1, double %Arg2, double %Arg3) {
+; DEBUG: SystemZ Reassociate Additions Pass: fun8
+; DEBUG: Found chain:
+; DEBUG-NEXT: On cycle 1 ADD T:4 T:4 stall %7:vr64bit = {{.*}} WFADB_CCPseudo killed %5:vr64bit, killed %6:vr64bit
+; DEBUG-NEXT: On cycle 10 ADD chained T:1 %8:vr64bit = {{.*}} WFADB_CCPseudo killed %7:vr64bit, %1:fp64bit
+; DEBUG-NEXT: Ends on 16
+; DEBUG: (unchanged)
+entry:
+  %L0 = load double, ptr %x
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %L1 = load double, ptr %arrayidx1
+  %add1 = fadd reassoc nsz contract double %L0, %L1
+  %add2 = fadd reassoc nsz contract double %add1, %Arg0
+  %add3 = fadd double %add2, %Arg3
+  ret double %add3
+}
+
+; Keep the first part of the chain and only change the lower part where
+; needed.
+define double @fun9(ptr %x, double %Arg0, double %Arg1, double %Arg2, double %Arg3) {
+; DEBUG: SystemZ Reassociate Additions Pass: fun9
+; DEBUG: Found chain:
+; DEBUG-NEXT: On cycle 1 ADD T:1 T:1 %7:vr64bit = {{.*}} WFADB_CCPseudo %1:fp64bit, %2:fp64bit
+; DEBUG-NEXT: On cycle 7 ADD chained T:1 %8:vr64bit = {{.*}} WFADB_CCPseudo killed %7:vr64bit, %3:fp64bit
+; DEBUG-NEXT: On cycle 13 ADD chained T:30 stall %9:vr64bit = {{.*}} WFADB_CCPseudo killed %8:vr64bit, killed %6:fp64bit
+; DEBUG-NEXT: On cycle 36 ADD chained T:1 %10:vr64bit = {{.*}} WFADB_CCPseudo killed %9:vr64bit, %4:fp64bit
+; DEBUG-NEXT: Ends on 42
+; DEBUG: Transforming:
+; DEBUG-NEXT: On cycle 1 %7:vr64bit = {{.*}} WFADB_CCPseudo %1:fp64bit, %2:fp64bit
+; DEBUG-NEXT: On cycle 7 %8:vr64bit = {{.*}} WFADB_CCPseudo killed %7:vr64bit, %3:fp64bit
+; DEBUG-NEXT: On cycle 13 %11:vr64bit = {{.*}} WFADB_CCPseudo %8:vr64bit, %4:fp64bit, implicit-def dead $cc
+; DEBUG-NEXT: On cycle 30 %12:vr64bit = {{.*}} WFADB_CCPseudo %11:vr64bit, %6:fp64bit, implicit-def dead $cc
+; DEBUG-NEXT: Ends on 36
+
+; CHECK-LABEL: fun9:
+; CHECK: ld %f1, 0(%r2)
+; CHECK-NEXT: ddb %f1, 8(%r2)
+; CHECK-NEXT: adbr %f0, %f2
+; CHECK-NEXT: wfadb %f2, %f4, %f6
+; CHECK-NEXT: adbr %f0, %f2
+; CHECK-NEXT: adbr %f0, %f1
+entry:
+  %L0 = load double, ptr %x
+  %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1
+  %L1 = load double, ptr %arrayidx1
+  %div = fdiv double %L0, %L1
+  %add1 = fadd reassoc nsz contract double %Arg0, %Arg1
+  %add2 = fadd reassoc nsz contract double %add1, %Arg2
+  %add3 = fadd reassoc nsz contract double %add2, %div
+  %add4 = fadd reassoc nsz contract double %add3, %Arg3
+  ret double %add4
+}
+
+; Two chains intersect with an Add node, don't revisit the lower part (first chain changed).
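+; The LHS sums, the joining add, and the final add are discovered as one
+; chain and rewritten; the RHS sums form a second chain that is then
+; transformed on its own without reprocessing the already-rewritten tail.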
+define double @fun10(ptr %x, double %Arg0, double %Arg1, double %Arg2, double %Arg3, double %Arg4) { +; DEBUG: SystemZ Reassociate Additions Pass: fun10 +; DEBUG: Found chain: +; DEBUG-NEXT: On cycle 1 ADD T:4 T:1 stall %8:vr64bit = {{.*}} WFADB_CCPseudo killed %6:vr64bit, %1:fp64bit +; DEBUG-NEXT: On cycle 10 ADD chained T:1 %9:vr64bit = {{.*}} WFADB_CCPseudo killed %8:vr64bit, %3:fp64bit +; DEBUG-NEXT: On cycle 16 ADD chained T:16 %12:vr64bit = {{.*}} WFADB_CCPseudo killed %9:vr64bit, killed %11:vr64bit +; DEBUG-NEXT: On cycle 22 ADD chained T:4 %13:vr64bit = {{.*}} WFADB_CCPseudo killed %12:vr64bit, killed %5:vr64bit +; DEBUG-NEXT: Ends on 28 +; DEBUG: Transforming: +; DEBUG-NEXT: On cycle 1 %14:vr64bit = nsz contract reassoc WFADB_CCPseudo %1:fp64bit, %3:fp64bit +; DEBUG-NEXT: On cycle 7 %15:vr64bit = nsz contract reassoc WFADB_CCPseudo %14:vr64bit, %6:vr64bit +; DEBUG-NEXT: On cycle 13 %16:vr64bit = nsz contract reassoc WFADB_CCPseudo %15:vr64bit, %5:vr64bit +; DEBUG-NEXT: On cycle 19 %17:vr64bit = nsz contract reassoc WFADB_CCPseudo %16:vr64bit, %11:vr64bit +; DEBUG-NEXT: Ends on 25 + +; DEBUG: Found chain: +; DEBUG-NEXT: On cycle 1 ADD T:4 T:1 stall %10:vr64bit = {{.*}} WFADB_CCPseudo killed %7:vr64bit, %2:fp64bit +; DEBUG-NEXT: On cycle 10 ADD chained T:1 %11:vr64bit = {{.*}} WFADB_CCPseudo killed %10:vr64bit, %4:fp64bit +; DEBUG-NEXT: Ends on 16 +; DEBUG: Transforming: +; DEBUG-NEXT: On cycle 1 %18:vr64bit = nsz contract reassoc WFADB_CCPseudo %2:fp64bit, %4:fp64bit +; DEBUG-NEXT: On cycle 7 %19:vr64bit = nsz contract reassoc WFADB_CCPseudo %18:vr64bit, %7:vr64bit +; DEBUG-NEXT: Ends on 13 + +; DEBUG: # *** IR Dump After SystemZ Reassociate Additions Pass (systemz-reassoc-additions) ***: +; DEBUG-NEXT: # Machine code for function fun10: IsSSA, TracksLiveness +; DEBUG: %4:fp64bit = COPY $f6d +; DEBUG-NEXT: %3:fp64bit = COPY $f4d +; DEBUG-NEXT: %2:fp64bit = COPY $f2d +; DEBUG-NEXT: %1:fp64bit = COPY $f0d +; DEBUG-NEXT: %0:addr64bit = COPY $r2d +; DEBUG-NEXT: %5:vr64bit = VL64 %fixed-stack.0, 0, $noreg :: (load (s64) from %fixed-stack.0) +; DEBUG-NEXT: %6:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x) +; DEBUG-NEXT: %7:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1) +; DEBUG-NEXT: %14:vr64bit = nsz contract reassoc WFADB_CCPseudo %1:fp64bit, %3:fp64bit +; DEBUG-NEXT: %15:vr64bit = nsz contract reassoc WFADB_CCPseudo %14:vr64bit, %6:vr64bit +; DEBUG-NEXT: %18:vr64bit = nsz contract reassoc WFADB_CCPseudo %2:fp64bit, %4:fp64bit +; DEBUG-NEXT: %19:vr64bit = nsz contract reassoc WFADB_CCPseudo %18:vr64bit, %7:vr64bit +; DEBUG-NEXT: %16:vr64bit = nsz contract reassoc WFADB_CCPseudo %15:vr64bit, %5:vr64bit +; DEBUG-NEXT: %17:vr64bit = nsz contract reassoc WFADB_CCPseudo %16:vr64bit, %19:vr64bit +; DEBUG-NEXT: $f0d = COPY %17:vr64bit +; DEBUG-NEXT: Return implicit $f0d +entry: + %L0 = load double, ptr %x + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %L1 = load double, ptr %arrayidx1 + %LHS1 = fadd reassoc nsz contract double %L0, %Arg0 + %LHS2 = fadd reassoc nsz contract double %LHS1, %Arg2 + %RHS1 = fadd reassoc nsz contract double %L1, %Arg1 + %RHS2 = fadd reassoc nsz contract double %RHS1, %Arg3 + %add1 = fadd reassoc nsz contract double %LHS2, %RHS2 + %add2 = fadd reassoc nsz contract double %add1, %Arg4 + ret double %add2 +} + +; Two chains intersect with an Add node, don't revisit the lower part (first chain unchanged). 
+define double @fun11(ptr %x, double %Arg0, double %Arg1, double %Arg2, double %Arg3, double %Arg4) { +; DEBUG: SystemZ Reassociate Additions Pass: fun11 +; DEBUG: Found chain: +; DEBUG-NEXT: On cycle 1 ADD T:1 T:1 %8:vr64bit = {{.*}} WFADB_CCPseudo %1:fp64bit, %2:fp64bit +; DEBUG-NEXT: On cycle 7 ADD chained T:1 %9:vr64bit = {{.*}} WFADB_CCPseudo killed %8:vr64bit, %3:fp64bit +; DEBUG-NEXT: On cycle 13 ADD chained T:16 stall %12:vr64bit = {{.*}} WFADB_CCPseudo killed %9:vr64bit, killed %11:vr64bit +; DEBUG-NEXT: On cycle 22 ADD chained T:34 stall %13:vr64bit = {{.*}} WFADB_CCPseudo killed %12:vr64bit, killed %7:vr64bit +; DEBUG-NEXT: Ends on 40 +; DEBUG: Transforming: +; DEBUG-NEXT: On cycle 1 %8:vr64bit = {{.*}} WFADB_CCPseudo %1:fp64bit, %2:fp64bit +; DEBUG-NEXT: On cycle 7 %9:vr64bit = {{.*}} WFADB_CCPseudo killed %8:vr64bit, %3:fp64bit +; DEBUG-NEXT: On cycle 16 %12:vr64bit = {{.*}} WFADB_CCPseudo killed %9:vr64bit, killed %11:vr64bit +; DEBUG-NEXT: On cycle 34 %13:vr64bit = {{.*}} WFADB_CCPseudo killed %12:vr64bit, killed %7:vr64bit +; DEBUG-NEXT: Ends on 40 +; DEBUG-NEXT: (unchanged) + +; DEBUG: Found chain: +; DEBUG-NEXT: On cycle 1 ADD T:4 T:1 stall %10:vr64bit = {{.*}} WFADB_CCPseudo %6:vr64bit, %3:fp64bit +; DEBUG-NEXT: On cycle 10 ADD chained T:1 %11:vr64bit = {{.*}} WFADB_CCPseudo killed %10:vr64bit, %4:fp64bit +; DEBUG-NEXT: Ends on 16 +; DEBUG: Transforming: +; DEBUG-NEXT: On cycle 1 %14:vr64bit = nsz contract reassoc WFADB_CCPseudo %3:fp64bit, %4:fp64bit +; DEBUG-NEXT: On cycle 7 %15:vr64bit = nsz contract reassoc WFADB_CCPseudo %14:vr64bit, %6:vr64bit +; DEBUG-NEXT: Ends on 13 + +; DEBUG: # *** IR Dump After SystemZ Reassociate Additions Pass (systemz-reassoc-additions) ***: +; DEBUG-NEXT: # Machine code for function fun11: IsSSA, TracksLiveness +; DEBUG: %4:fp64bit = COPY $f6d +; DEBUG-NEXT: %3:fp64bit = COPY $f4d +; DEBUG-NEXT: %2:fp64bit = COPY $f2d +; DEBUG-NEXT: %1:fp64bit = COPY $f0d +; DEBUG-NEXT: %0:addr64bit = COPY $r2d +; DEBUG-NEXT: %5:vr64bit = VL64 %0:addr64bit, 0, $noreg :: (load (s64) from %ir.x) +; DEBUG-NEXT: %6:vr64bit = VL64 %0:addr64bit, 8, $noreg :: (load (s64) from %ir.arrayidx1) +; DEBUG-NEXT: %7:vr64bit = nofpexcept WFDDB killed %5:vr64bit, %6:vr64bit, implicit $fpc +; DEBUG-NEXT: %8:vr64bit = {{.*}} WFADB_CCPseudo %1:fp64bit, %2:fp64bit +; DEBUG-NEXT: %9:vr64bit = {{.*}} WFADB_CCPseudo killed %8:vr64bit, %3:fp64bit +; DEBUG-NEXT: %14:vr64bit = nsz contract reassoc WFADB_CCPseudo %3:fp64bit, %4:fp64bit +; DEBUG-NEXT: %15:vr64bit = nsz contract reassoc WFADB_CCPseudo %14:vr64bit, %6:vr64bit +; DEBUG-NEXT: %12:vr64bit = {{.*}} WFADB_CCPseudo killed %9:vr64bit, killed %15:vr64bit +; DEBUG-NEXT: %13:vr64bit = {{.*}} WFADB_CCPseudo killed %12:vr64bit, killed %7:vr64bit +; DEBUG-NEXT: $f0d = COPY %13:vr64bit +; DEBUG-NEXT: Return implicit $f0d +entry: + %L0 = load double, ptr %x + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %L1 = load double, ptr %arrayidx1 + %div = fdiv double %L0, %L1 + + %LHS1 = fadd reassoc nsz contract double %Arg0, %Arg1 + %LHS2 = fadd reassoc nsz contract double %LHS1, %Arg2 + + %RHS1 = fadd reassoc nsz contract double %L1, %Arg2 + %RHS2 = fadd reassoc nsz contract double %RHS1, %Arg3 + + %add1 = fadd reassoc nsz contract double %LHS2, %RHS2 + %add2 = fadd reassoc nsz contract double %add1, %div + ret double %add2 +} + +; Stall is unavoidable, but use best order with smaller stall first. 
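+; Both the pointer-chased load (T:8) and the divide (T:34) arrive late, so
+; some stall remains; adding the ready registers first, then the load, and
+; the divide last moves the end of the chain from cycle 46 up to cycle 40.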
+define double @fun12(ptr %x, ptr %addr, double %Arg0, double %Arg1) { +; DEBUG: SystemZ Reassociate Additions Pass: fun12 +; DEBUG: Found chain: +; DEBUG-NEXT: On cycle 1 ADD T:1 T:1 %7:vr64bit = {{.*}} WFADB_CCPseudo %2:fp64bit, %3:fp64bit +; DEBUG-NEXT: On cycle 7 ADD chained T:34 stall %8:vr64bit = {{.*}} WFADB_CCPseudo killed %7:vr64bit, killed %6:fp64bit +; DEBUG-NEXT: On cycle 40 ADD chained T:8 %9:vr64bit = {{.*}} WFADB_CCPseudo killed %8:vr64bit, %5:fp64bit +; DEBUG-NEXT: Ends on 46 +; DEBUG: Transforming: +; DEBUG-NEXT: On cycle 1 %7:vr64bit = {{.*}} WFADB_CCPseudo %2:fp64bit, %3:fp64bit +; DEBUG-NEXT: On cycle 8 %10:vr64bit = {{.*}} WFADB_CCPseudo %7:vr64bit, %5:fp64bit +; DEBUG-NEXT: On cycle 34 %11:vr64bit = {{.*}} WFADB_CCPseudo %10:vr64bit, %6:fp64bit +; DEBUG-NEXT: Ends on 40 + +; CHECK-LABEL: fun12: +; CHECK: lg %r1, 0(%r3) +; CHECK: ld %f1, 0(%r1) +; CHECK: ldr %f3, %f1 +; CHECK: ddb %f3, 8(%r1) +; CHECK: adbr %f0, %f2 +; CHECK: adbr %f0, %f1 +; CHECK: adbr %f0, %f3 +entry: + %A = load ptr, ptr %addr + %L0 = load double, ptr %A + %arrayidx1 = getelementptr inbounds double, ptr %A, i64 1 + %L1 = load double, ptr %arrayidx1 + %div = fdiv double %L0, %L1 + + %add1 = fadd reassoc nsz contract double %Arg0, %Arg1 + %add2 = fadd reassoc nsz contract double %add1, %div + %add3 = fadd reassoc nsz contract double %add2, %L0 + + ret double %add3 +} + +; Chain of 8 FMAs that needs reordering to avoid stalls. +define double @fun13(ptr %x, ptr %addr, double %Arg0, double %Arg1, double %Arg2) { +; DEBUG: SystemZ Reassociate Additions Pass: fun13 +; DEBUG: Found chain: +; DEBUG-NEXT: On cycle 1 FMA P:38 T:1 stall %12:vr64bit = {{.*}} nofpexcept WFMADB_CCPseudo killed %11:vr64bit, %2:fp64bit, %2:fp64bit +; DEBUG-NEXT: On cycle 44 FMA P:34 chained %13:vr64bit = {{.*}} nofpexcept WFMADB_CCPseudo killed %7:vr64bit, %2:fp64bit, killed %12:vr64bit +; DEBUG-NEXT: On cycle 50 FMA P:8 chained %14:vr64bit = {{.*}} nofpexcept WFMADB_CCPseudo %9:vr64bit, %2:fp64bit, killed %13:vr64bit +; DEBUG-NEXT: On cycle 56 FMA P:8 chained %15:vr64bit = {{.*}} nofpexcept WFMADB_CCPseudo %10:vr64bit, %2:fp64bit, killed %14:vr64bit +; DEBUG-NEXT: On cycle 62 FMA P:4 chained %16:vr64bit = {{.*}} nofpexcept WFMADB_CCPseudo %5:vr64bit, %2:fp64bit, killed %15:vr64bit +; DEBUG-NEXT: On cycle 68 FMA P:4 chained %17:vr64bit = {{.*}} nofpexcept WFMADB_CCPseudo %6:vr64bit, %2:fp64bit, killed %16:vr64bit +; DEBUG-NEXT: On cycle 74 FMA P:1 chained %18:vr64bit = {{.*}} nofpexcept WFMADB_CCPseudo %3:fp64bit, %2:fp64bit, killed %17:vr64bit +; DEBUG-NEXT: On cycle 80 FMA P:1 chained %19:vr64bit = {{.*}} nofpexcept WFMADB_CCPseudo %3:fp64bit, %4:fp64bit, killed %18:vr64bit +; DEBUG-NEXT: Ends on 86 + +; DEBUG: Transforming: +; DEBUG-NEXT: On cycle 1 %20:vr64bit = {{.*}} WFMADB_CCPseudo %3:fp64bit, %2:fp64bit, %2:fp64bit +; DEBUG-NEXT: On cycle 7 %21:vr64bit = {{.*}} WFMADB_CCPseudo %5:vr64bit, %2:fp64bit, %20:vr64bit +; DEBUG-NEXT: On cycle 13 %22:vr64bit = {{.*}} WFMADB_CCPseudo %9:vr64bit, %2:fp64bit, %21:vr64bit +; DEBUG-NEXT: On cycle 19 %23:vr64bit = {{.*}} WFMADB_CCPseudo %10:vr64bit, %2:fp64bit, %22:vr64bit +; DEBUG-NEXT: On cycle 25 %24:vr64bit = {{.*}} WFMADB_CCPseudo %6:vr64bit, %2:fp64bit, %23:vr64bit +; DEBUG-NEXT: On cycle 31 %25:vr64bit = {{.*}} WFMADB_CCPseudo %3:fp64bit, %4:fp64bit, %24:vr64bit +; DEBUG-NEXT: On cycle 37 %26:vr64bit = {{.*}} WFMADB_CCPseudo %7:vr64bit, %2:fp64bit, %25:vr64bit +; DEBUG-NEXT: On cycle 43 %27:vr64bit = {{.*}} WFMADB_CCPseudo %11:vr64bit, %2:fp64bit, %26:vr64bit +; DEBUG-NEXT: Ends 
on 49 +entry: + %L0 = load double, ptr %x + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %L1 = load double, ptr %arrayidx1 + %div1 = fdiv double %L0, %L1 + + %A = load ptr, ptr %addr + %AL0 = load double, ptr %A + %arrayidx2 = getelementptr inbounds double, ptr %A, i64 1 + %AL1 = load double, ptr %arrayidx2 + %div2 = fdiv double %AL0, %AL1 + + %mul1 = fmul reassoc nsz contract double %div2, %Arg0 + %mul2 = fmul reassoc nsz contract double %div1, %Arg0 + %mul3 = fmul reassoc nsz contract double %AL0, %Arg0 + %mul4 = fmul reassoc nsz contract double %AL1, %Arg0 + %mul5 = fmul reassoc nsz contract double %L0, %Arg0 + %mul6 = fmul reassoc nsz contract double %L1, %Arg0 + %mul7 = fmul reassoc nsz contract double %Arg1, %Arg0 + %mul8 = fmul reassoc nsz contract double %Arg1, %Arg2 + + %add1 = fadd reassoc nsz contract double %Arg0, %mul1 + %add2 = fadd reassoc nsz contract double %add1, %mul2 + %add3 = fadd reassoc nsz contract double %add2, %mul3 + %add4 = fadd reassoc nsz contract double %add3, %mul4 + %add5 = fadd reassoc nsz contract double %add4, %mul5 + %add6 = fadd reassoc nsz contract double %add5, %mul6 + %add7 = fadd reassoc nsz contract double %add6, %mul7 + %add8 = fadd reassoc nsz contract double %add7, %mul8 + + ret double %add8 +}