Index: llvm/include/llvm/CodeGen/TargetInstrInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1634,7 +1634,7 @@
   /// instruction that defines FoldAsLoadDefReg, and the function returns
   /// the machine instruction generated due to folding.
   virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI,
-                                          const MachineRegisterInfo *MRI,
+                                          MachineRegisterInfo *MRI,
                                           Register &FoldAsLoadDefReg,
                                           MachineInstr *&DefMI) const {
     return nullptr;
Index: llvm/lib/Target/SystemZ/CMakeLists.txt
===================================================================
--- llvm/lib/Target/SystemZ/CMakeLists.txt
+++ llvm/lib/Target/SystemZ/CMakeLists.txt
@@ -20,6 +20,7 @@
   SystemZConstantPoolValue.cpp
   SystemZCopyPhysRegs.cpp
   SystemZElimCompare.cpp
+  SystemZFinalizeReassociation.cpp
   SystemZFrameLowering.cpp
   SystemZHazardRecognizer.cpp
   SystemZISelDAGToDAG.cpp
Index: llvm/lib/Target/SystemZ/SystemZ.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZ.h
+++ llvm/lib/Target/SystemZ/SystemZ.h
@@ -195,12 +195,14 @@
 FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZCopyPhysRegsPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZFinalizeReassociationPass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZPostRewritePass(SystemZTargetMachine &TM);
 FunctionPass *createSystemZTDCPass();
 
 void initializeSystemZCopyPhysRegsPass(PassRegistry &);
 void initializeSystemZDAGToDAGISelPass(PassRegistry &);
 void initializeSystemZElimComparePass(PassRegistry &);
+void initializeSystemZFinalizeReassociationPass(PassRegistry &);
 void initializeSystemZLDCleanupPass(PassRegistry &);
 void initializeSystemZLongBranchPass(PassRegistry &);
 void initializeSystemZPostRewritePass(PassRegistry &);
Index: llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp
===================================================================
--- /dev/null
+++ llvm/lib/Target/SystemZ/SystemZFinalizeReassociation.cpp
@@ -0,0 +1,124 @@
+//===---- SystemZFinalizeReassociation.cpp - Finalize FP reassociation ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is the last step in the process of enabling reassociation with
+// the MachineCombiner. These are the steps involved:
+//
+// 1. Instruction selection: Disable reg/mem folding for any operations that
+//    are reassociable, since MachineCombiner will not succeed otherwise.
+//    Instead select a reg/reg pseudo that pretends to clobber CC.
+//
+// 2. MachineCombiner: Performs reassociation with the reg/reg instructions.
+//
+// 3. PeepholeOptimizer: Folds loads into reg/mem instructions after
+//    reassociation. The reg/mem opcode sets CC, which is why the special
+//    reg/reg pseudo is needed.
+//
+// 4. Convert any remaining pseudos into the target opcodes that do not
+//    clobber CC (this pass).
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+class SystemZFinalizeReassociation : public MachineFunctionPass {
+public:
+  static char ID;
+  SystemZFinalizeReassociation()
+      : MachineFunctionPass(ID), TII(nullptr), MRI(nullptr) {
+    initializeSystemZFinalizeReassociationPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+  bool visitMBB(MachineBasicBlock &MBB);
+
+  const SystemZInstrInfo *TII;
+  MachineRegisterInfo *MRI;
+};
+
+char SystemZFinalizeReassociation::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(SystemZFinalizeReassociation, "systemz-finalize-reassoc",
+                "SystemZ Finalize Reassociation", false, false)
+
+FunctionPass *llvm::
+createSystemZFinalizeReassociationPass(SystemZTargetMachine &TM) {
+  return new SystemZFinalizeReassociation();
+}
+
+void SystemZFinalizeReassociation::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesCFG();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool SystemZFinalizeReassociation::visitMBB(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
+    unsigned PseudoOpcode = MI.getOpcode();
+    unsigned TargetOpcode =
+        PseudoOpcode == SystemZ::WFADB_CCPseudo   ? SystemZ::WFADB
+        : PseudoOpcode == SystemZ::WFASB_CCPseudo ? SystemZ::WFASB
+        : PseudoOpcode == SystemZ::WFSDB_CCPseudo ? SystemZ::WFSDB
+        : PseudoOpcode == SystemZ::WFSSB_CCPseudo ? SystemZ::WFSSB
+        : PseudoOpcode == SystemZ::WFMDB_CCPseudo ? SystemZ::WFMDB
+        : PseudoOpcode == SystemZ::WFMSB_CCPseudo ? SystemZ::WFMSB
+                                                  : 0;
+    if (TargetOpcode) {
+      // PeepholeOptimizer does not fold loads across basic blocks, but doing
+      // so seems beneficial, so do it here:
+      bool Folded = false;
+      for (unsigned Op = 1; Op <= 2; ++Op) {
+        Register Reg = MI.getOperand(Op).getReg();
+        if (MachineInstr *DefMI = MRI->getVRegDef(Reg))
+          if (TII->optimizeLoadInstr(MI, MRI, Reg, DefMI)) {
+            MI.eraseFromParent();
+            DefMI->eraseFromParent();
+            MRI->markUsesInDebugValueAsUndef(Reg);
+            Folded = true;
+            break;
+          }
+      }
+
+      if (!Folded) {
+        MI.setDesc(TII->get(TargetOpcode));
+        MI.removeOperand(3); // CC
+      }
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+bool SystemZFinalizeReassociation::runOnMachineFunction(MachineFunction &F) {
+  TII = F.getSubtarget<SystemZSubtarget>().getInstrInfo();
+  MRI = &F.getRegInfo();
+
+  bool Modified = false;
+  for (auto &MBB : F)
+    Modified |= visitMBB(MBB);
+
+  return Modified;
+}
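A note on why the new patterns are gated on both the reassoc and nsz flags: IEEE floating-point addition is not associative, so the combiner may only change evaluation order when the IR explicitly allows it. A standalone sketch (ordinary host C++, not part of the patch) showing two orders disagreeing:

```cpp
// Reassociation can change FP results: with doubles, (a + b) + c and
// a + (b + c) need not be equal, which is why the MachineCombiner only
// fires when the instruction carries the reassoc (and nsz) flags.
#include <cstdio>

int main() {
  double a = 1e16, b = -1e16, c = 1.0;
  double l = (a + b) + c; // the huge terms cancel first: 1.0
  double r = a + (b + c); // c is lost in rounding against -1e16: 0.0
  std::printf("(a+b)+c = %g, a+(b+c) = %g\n", l, r);
  return 0;
}
```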
Index: llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -345,6 +345,11 @@
   // Try to expand a boolean SELECT_CCMASK using an IPM sequence.
   SDValue expandSelectBoolean(SDNode *Node);
 
+  bool hasReassocFlags(SDNode *N) const {
+    return N->getFlags().hasAllowReassociation() &&
+           N->getFlags().hasNoSignedZeros();
+  }
+
 public:
   static char ID;
Index: llvm/lib/Target/SystemZ/SystemZInstrFP.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -439,8 +439,10 @@
     def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64, FP64>;
     def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>;
   }
-  defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, load, 4>;
-  defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, load, 8>;
+  defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, z_any_fadd_noreassoc, FP32,
+                                load, 4>;
+  defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, z_any_fadd_noreassoc, FP64,
+                                load, 8>;
 }
 
 // Subtraction.
@@ -450,8 +452,10 @@
     def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64, FP64>;
     def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>;
 
-  defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, any_fsub, FP32, load, 4>;
-  defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, any_fsub, FP64, load, 8>;
+  defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, z_any_fsub_noreassoc, FP32,
+                                load, 4>;
+  defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, z_any_fsub_noreassoc, FP64,
+                                load, 8>;
 }
 
 // Multiplication.
@@ -461,8 +465,10 @@
     def MDBR : BinaryRRE<"mdbr", 0xB31C, any_fmul, FP64, FP64>;
     def MXBR : BinaryRRE<"mxbr", 0xB34C, any_fmul, FP128, FP128>;
   }
-  defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, load, 4>;
-  defm MDB  : BinaryRXEAndPseudo<"mdb", 0xED1C, any_fmul, FP64, load, 8>;
+  defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, z_any_fmul_noreassoc, FP32,
+                                 load, 4>;
+  defm MDB  : BinaryRXEAndPseudo<"mdb", 0xED1C, z_any_fmul_noreassoc, FP64,
+                                 load, 8>;
 }
 
 // f64 multiplication of two FP32 registers.
Index: llvm/lib/Target/SystemZ/SystemZInstrFormats.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5388,3 +5388,18 @@
                     [(set GR64:$end, (operator GR64:$start1, GR64:$start2,
                                                GR32:$char))]>;
 }
+
+multiclass BinaryVRRcAndCCPseudo<string mnemonic, bits<16> opcode,
+                                 SDPatternOperator operator,
+                                 SDPatternOperator reassoc_operator,
+                                 TypedReg tr1, TypedReg tr2, bits<4> type = 0,
+                                 bits<4> m5 = 0, bits<4> m6 = 0,
+                                 string fp_mnemonic = ""> {
+  def "" : BinaryVRRc<mnemonic, opcode, operator, tr1, tr2, type, m5, m6,
+                      fp_mnemonic>;
+  let Defs = [CC], AddedComplexity = 1 in  // Win over "".
+    def _CCPseudo : Pseudo<(outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
+                           [(set (tr1.vt tr1.op:$V1),
+                                 (reassoc_operator (tr2.vt tr2.op:$V2),
+                                                   (tr2.vt tr2.op:$V3)))]>;
+}
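For reference, a minimal sketch (not from the patch) of where the flags tested by hasReassocFlags() originate: a frontend or pass sets FastMathFlags on the IR instruction, SelectionDAG carries them onto the SDNode, and the PatFrag predicates added in SystemZOperators.td below then observe them as hasAllowReassociation()/hasNoSignedZeros():

```cpp
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Emits 'fadd reassoc nsz ...' -- the form that selects the reg/reg
// _CCPseudo instead of the reg/mem opcode under the patterns above.
Value *emitReassociableFAdd(IRBuilder<> &Builder, Value *L, Value *R) {
  FastMathFlags FMF;
  FMF.setAllowReassoc();  // becomes SDNodeFlags::hasAllowReassociation()
  FMF.setNoSignedZeros(); // becomes SDNodeFlags::hasNoSignedZeros()
  Builder.setFastMathFlags(FMF);
  return Builder.CreateFAdd(L, R, "sum");
}
```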
Index: llvm/lib/Target/SystemZ/SystemZInstrInfo.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -243,8 +243,15 @@
                     const DebugLoc &DL, Register DstReg,
                     ArrayRef<MachineOperand> Cond, Register TrueReg,
                     Register FalseReg) const override;
+  void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
+                      MachineInstr::MIFlag Flag) const;
+  MachineInstr *optimizeLoadInstr(MachineInstr &MI,
+                                  MachineRegisterInfo *MRI,
+                                  Register &FoldAsLoadDefReg,
+                                  MachineInstr *&DefMI) const override;
   bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
                      MachineRegisterInfo *MRI) const override;
+
   bool isPredicable(const MachineInstr &MI) const override;
   bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
                            unsigned ExtraPredCycles,
@@ -274,6 +281,15 @@
                       Register VReg) const override;
   MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
                                       LiveIntervals *LIS) const override;
+
+  bool useMachineCombiner() const override { return true; }
+  bool isAssociativeAndCommutative(const MachineInstr &Inst,
+                                   bool Invert) const override;
+  std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
+  void
+  finalizeInsInstrs(MachineInstr &Root, MachineCombinerPattern &P,
+                    SmallVectorImpl<MachineInstr *> &InsInstrs) const override;
+
   MachineInstr *
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
Index: llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -610,6 +610,89 @@
     .addImm(CCValid).addImm(CCMask);
 }
 
+static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
+  if (OldMI->registerDefIsDead(SystemZ::CC)) {
+    MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
+    if (CCDef != nullptr)
+      CCDef->setIsDead(true);
+  }
+}
+
+void SystemZInstrInfo::transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
+                                      MachineInstr::MIFlag Flag) const {
+  if (OldMI->getFlag(Flag))
+    NewMI->setFlag(Flag);
+}
+
+MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
+                                                  MachineRegisterInfo *MRI,
+                                                  Register &FoldAsLoadDefReg,
+                                                  MachineInstr *&DefMI) const {
+  const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+
+  // Check whether we can move DefMI here.
+  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+  assert(DefMI);
+  bool SawStore = false;
+  if (!DefMI->isSafeToMove(nullptr, SawStore))
+    return nullptr;
+
+  // For reassociatable FP operations, any loads have been purposefully left
+  // unfolded so that MachineCombiner can do its work on reg/reg
+  // opcodes. After that, as many loads as possible are now folded.
+  unsigned LoadOpcode = 0;
+  unsigned RegMemOpcode = 0;
+  const TargetRegisterClass *FPRC = nullptr;
+  if (MI.getOpcode() == SystemZ::WFADB_CCPseudo ||
+      MI.getOpcode() == SystemZ::WFSDB_CCPseudo ||
+      MI.getOpcode() == SystemZ::WFMDB_CCPseudo) {
+    RegMemOpcode = MI.getOpcode() == SystemZ::WFADB_CCPseudo   ? SystemZ::ADB
+                   : MI.getOpcode() == SystemZ::WFSDB_CCPseudo ? SystemZ::SDB
+                                                               : SystemZ::MDB;
+    LoadOpcode = SystemZ::VL64;
+    FPRC = &SystemZ::FP64BitRegClass;
+  } else if (MI.getOpcode() == SystemZ::WFASB_CCPseudo ||
+             MI.getOpcode() == SystemZ::WFSSB_CCPseudo ||
+             MI.getOpcode() == SystemZ::WFMSB_CCPseudo) {
+    RegMemOpcode = MI.getOpcode() == SystemZ::WFASB_CCPseudo   ? SystemZ::AEB
+                   : MI.getOpcode() == SystemZ::WFSSB_CCPseudo ? SystemZ::SEB
+                                                               : SystemZ::MEEB;
+    LoadOpcode = SystemZ::VL32;
+    FPRC = &SystemZ::FP32BitRegClass;
+  } else
+    return nullptr;
+
+  if (DefMI->getOpcode() == LoadOpcode &&
+      MRI->hasOneNonDBGUse(FoldAsLoadDefReg)) {
+    MachineBasicBlock *MBB = MI.getParent();
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand LHS = MI.getOperand(1);
+    MachineOperand RHS = MI.getOperand(2);
+    bool MemInRHS = RHS.getReg() == FoldAsLoadDefReg;
+    if (!MemInRHS &&
+        (RegMemOpcode == SystemZ::SDB || RegMemOpcode == SystemZ::SEB))
+      return nullptr;
+    MachineOperand &RegMO = MemInRHS ? LHS : RHS;
+    MachineInstrBuilder MIB =
+        BuildMI(*MBB, MI, MI.getDebugLoc(), get(RegMemOpcode), DstReg)
+            .add(RegMO)
+            .add(DefMI->getOperand(1))
+            .add(DefMI->getOperand(2))
+            .add(DefMI->getOperand(3))
+            .addMemOperand(*DefMI->memoperands_begin());
+    transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
+    MIB->addRegisterDead(SystemZ::CC, TRI);
+    if (MIB->getOperand(2).isReg())
+      MIB->getOperand(2).setIsKill(false);
+    MIB->getOperand(4).setIsKill(false);
+    MRI->setRegClass(RegMO.getReg(), FPRC);
+    MRI->setRegClass(DstReg, FPRC);
+    return MIB;
+  }
+
+  return nullptr;
+}
+
 bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                      Register Reg,
                                      MachineRegisterInfo *MRI) const {
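At source level, the fold that optimizeLoadInstr() reinstates corresponds to an FP operation with one memory operand. A hypothetical example (function name invented; assumes fast-math flags are in effect so isel initially picks the reg/reg pseudo fed by a VL64):

```cpp
// Shape the re-folding targets: the single-use load of *P collapses into
// the reg/mem opcode, yielding one 'adb %fN, 0(%rM)' instead of ld + adbr.
double addThrough(double Acc, const double *P) {
  return Acc + *P;
}
```

Note that for SDB/SEB the load may only fold into the second operand, since subtraction is not commutative; the MemInRHS check above guards this.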
@@ -937,20 +1020,6 @@
   }
 }
 
-static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
-  if (OldMI->registerDefIsDead(SystemZ::CC)) {
-    MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
-    if (CCDef != nullptr)
-      CCDef->setIsDead(true);
-  }
-}
-
-static void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
-                           MachineInstr::MIFlag Flag) {
-  if (OldMI->getFlag(Flag))
-    NewMI->setFlag(Flag);
-}
-
 MachineInstr *
 SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
                                         LiveIntervals *LIS) const {
@@ -1003,6 +1072,84 @@
   return nullptr;
 }
 
+bool SystemZInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
+                                                   bool Invert) const {
+  unsigned Opc = Inst.getOpcode();
+  if (Invert) {
+    auto InverseOpcode = getInverseOpcode(Opc);
+    if (!InverseOpcode)
+      return false;
+    Opc = *InverseOpcode;
+  }
+
+  switch (Opc) {
+  default:
+    break;
+  case SystemZ::VFADB:
+  case SystemZ::VFASB:
+  case SystemZ::WFAXB:
+  case SystemZ::WFADB_CCPseudo:
+  case SystemZ::WFASB_CCPseudo:
+  case SystemZ::VFMDB:
+  case SystemZ::VFMSB:
+  case SystemZ::WFMXB:
+  case SystemZ::WFMDB_CCPseudo:
+  case SystemZ::WFMSB_CCPseudo:
+    return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+           Inst.getFlag(MachineInstr::MIFlag::FmNsz);
+  }
+
+  return false;
+}
+
+std::optional<unsigned>
+SystemZInstrInfo::getInverseOpcode(unsigned Opcode) const {
+  switch (Opcode) {
+  default:
+    return std::nullopt;
+  case SystemZ::VFADB:
+    return SystemZ::VFSDB;
+  case SystemZ::VFASB:
+    return SystemZ::VFSSB;
+  case SystemZ::WFAXB:
+    return SystemZ::WFSXB;
+  case SystemZ::WFADB_CCPseudo:
+    return SystemZ::WFSDB_CCPseudo;
+  case SystemZ::WFASB_CCPseudo:
+    return SystemZ::WFSSB_CCPseudo;
+  case SystemZ::VFSDB:
+    return SystemZ::VFADB;
+  case SystemZ::VFSSB:
+    return SystemZ::VFASB;
+  case SystemZ::WFSXB:
+    return SystemZ::WFAXB;
+  case SystemZ::WFSDB_CCPseudo:
+    return SystemZ::WFADB_CCPseudo;
+  case SystemZ::WFSSB_CCPseudo:
+    return SystemZ::WFASB_CCPseudo;
+  }
+}
+
+void SystemZInstrInfo::finalizeInsInstrs(
+    MachineInstr &Root, MachineCombinerPattern &P,
+    SmallVectorImpl<MachineInstr *> &InsInstrs) const {
+  const TargetRegisterInfo *TRI =
+      Root.getParent()->getParent()->getSubtarget().getRegisterInfo();
+  for (auto *Inst : InsInstrs) {
+    switch (Inst->getOpcode()) {
+    case SystemZ::WFADB_CCPseudo:
+    case SystemZ::WFASB_CCPseudo:
+    case SystemZ::WFMDB_CCPseudo:
+    case SystemZ::WFMSB_CCPseudo:
+    case SystemZ::WFSDB_CCPseudo:
+    case SystemZ::WFSSB_CCPseudo:
+      Inst->addRegisterDead(SystemZ::CC, TRI);
+      break;
+    default:
+      break;
+    }
+  }
+}
+
 MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, int FrameIndex,
Index: llvm/lib/Target/SystemZ/SystemZInstrVector.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -139,7 +139,7 @@
   // LEY and LDY offer full 20-bit displacement fields.  It's often better
   // to use those instructions rather than force a 20-bit displacement
   // into a GPR temporary.
-  let mayLoad = 1 in {
+  let mayLoad = 1, canFoldAsLoad = 1 in {
     def VL32 : UnaryAliasVRX<load, v32sb, bdxaddr12pair>;
     def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
   }
@@ -1045,15 +1045,15 @@
 let Predicates = [FeatureVector] in {
   // Add.
   let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
-    def VFA   : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
-    def VFADB : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
-    def WFADB : BinaryVRRc<"wfadb", 0xE7E3, any_fadd, v64db, v64db, 3, 8, 0,
-                           "adbr">;
+    def  VFA   : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
+    def  VFADB : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
+    defm WFADB : BinaryVRRcAndCCPseudo<"wfadb", 0xE7E3, any_fadd,
+                                       z_fadd_reassoc, v64db, v64db, 3, 8, 0,
+                                       "adbr">;
     let Predicates = [FeatureVectorEnhancements1] in {
-      def VFASB : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
-      def WFASB : BinaryVRRc<"wfasb", 0xE7E3, any_fadd, v32sb, v32sb, 2, 8, 0,
-                             "aebr">;
-      def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>;
+      def  VFASB : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
+      defm WFASB : BinaryVRRcAndCCPseudo<"wfasb", 0xE7E3, any_fadd,
+                                         z_fadd_reassoc, v32sb, v32sb, 2, 8, 0,
+                                         "aebr">;
+      def  WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>;
     }
   }
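The effect of the MachineCombiner hooks just added, in ordinary C++ terms (standalone illustration, not from the patch): a left-leaning chain of adds has a critical path of seven dependent operations, while the reassociated form the combiner produces, visible in the CHECK lines of fun0_fadd below, computes independent partial sums and combines them at the end.

```cpp
// Serial form: every add waits for the previous one (dependence depth 7).
double sumSerial(const double *x) {
  return ((((((x[0] + x[1]) + x[2]) + x[3]) + x[4]) + x[5]) + x[6]) + x[7];
}

// Reassociated form (legal only under reassoc+nsz): the four partial sums
// are independent and can issue in parallel (dependence depth 3).
double sumReassoc(const double *x) {
  double a = x[0] + x[1], b = x[2] + x[3];
  double c = x[4] + x[5], d = x[6] + x[7];
  return (a + b) + (c + d);
}
```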
@@ -1258,15 +1258,15 @@
 
   // Multiply.
   let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
-    def VFM   : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>;
-    def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, any_fmul, v128db, v128db, 3, 0>;
-    def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, any_fmul, v64db, v64db, 3, 8, 0,
-                           "mdbr">;
+    def  VFM   : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>;
+    def  VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, any_fmul, v128db, v128db, 3, 0>;
+    defm WFMDB : BinaryVRRcAndCCPseudo<"wfmdb", 0xE7E7, any_fmul,
+                                       z_fmul_reassoc, v64db, v64db, 3, 8, 0,
+                                       "mdbr">;
     let Predicates = [FeatureVectorEnhancements1] in {
-      def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, any_fmul, v128sb, v128sb, 2, 0>;
-      def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, any_fmul, v32sb, v32sb, 2, 8, 0,
-                             "meebr">;
-      def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, any_fmul, v128xb, v128xb, 4, 8>;
+      def  VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, any_fmul, v128sb, v128sb, 2, 0>;
+      defm WFMSB : BinaryVRRcAndCCPseudo<"wfmsb", 0xE7E7, any_fmul,
+                                         z_fmul_reassoc, v32sb, v32sb, 2, 8, 0,
+                                         "meebr">;
+      def  WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, any_fmul, v128xb, v128xb, 4, 8>;
     }
   }
 
@@ -1373,15 +1373,15 @@
 
   // Subtract.
   let Uses = [FPC], mayRaiseFPException = 1 in {
-    def VFS   : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
-    def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>;
-    def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, any_fsub, v64db, v64db, 3, 8, 0,
-                           "sdbr">;
+    def  VFS   : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
+    def  VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>;
+    defm WFSDB : BinaryVRRcAndCCPseudo<"wfsdb", 0xE7E2, any_fsub,
+                                       z_fsub_reassoc, v64db, v64db, 3, 8, 0,
+                                       "sdbr">;
     let Predicates = [FeatureVectorEnhancements1] in {
-      def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>;
-      def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, any_fsub, v32sb, v32sb, 2, 8, 0,
-                             "sebr">;
-      def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>;
+      def  VFSSB : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>;
+      defm WFSSB : BinaryVRRcAndCCPseudo<"wfssb", 0xE7E2, any_fsub,
+                                         z_fsub_reassoc, v32sb, v32sb, 2, 8, 0,
+                                         "sebr">;
+      def  WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>;
     }
   }
Index: llvm/lib/Target/SystemZ/SystemZOperators.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZOperators.td
+++ llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -700,6 +700,28 @@
 // Floating-point negative absolute.
 def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
 
+// Floating-point operations which are not marked as reassociable.
+def z_any_fadd_noreassoc : PatFrag<(ops node:$src1, node:$src2),
+                                   (any_fadd node:$src1, node:$src2),
+                                   [{ return !hasReassocFlags(N); }]>;
+def z_any_fsub_noreassoc : PatFrag<(ops node:$src1, node:$src2),
+                                   (any_fsub node:$src1, node:$src2),
+                                   [{ return !hasReassocFlags(N); }]>;
+def z_any_fmul_noreassoc : PatFrag<(ops node:$src1, node:$src2),
+                                   (any_fmul node:$src1, node:$src2),
+                                   [{ return !hasReassocFlags(N); }]>;
+
+// Floating-point operations which are reassociable.
+def z_fadd_reassoc : PatFrag<(ops node:$src1, node:$src2),
+                             (fadd node:$src1, node:$src2),
+                             [{ return hasReassocFlags(N); }]>;
+def z_fsub_reassoc : PatFrag<(ops node:$src1, node:$src2),
+                             (fsub node:$src1, node:$src2),
+                             [{ return hasReassocFlags(N); }]>;
+def z_fmul_reassoc : PatFrag<(ops node:$src1, node:$src2),
+                             (fmul node:$src1, node:$src2),
+                             [{ return hasReassocFlags(N); }]>;
+
 // Strict floating-point fragments.
 def z_any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
                           [(z_strict_fcmp node:$lhs, node:$rhs),
Index: llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -1346,12 +1346,12 @@
 
 // Add / subtract
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
 
 // Multiply / multiply-and-add/subtract
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMDB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFMDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFMDB(_CCPseudo)?$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)DB$")>;
 def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(A|S)DB$")>;
Index: llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -1390,15 +1390,15 @@
 
 // Add / subtract
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
 def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)SB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>;
 def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
 
 // Multiply / multiply-and-add/subtract
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM$")>;
 def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VFMDB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(D|S)B$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(D|S)B(_CCPseudo)?$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMSB$")>;
 def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(N)?M(A|S)$")>;
Index: llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -1433,14 +1433,14 @@
 
 // Add / subtract
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>;
 def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
 
 // Multiply / multiply-and-add/subtract
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFM(DB)?$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B(_CCPseudo)?$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>;
 def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>;
Index: llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
+++ llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
@@ -1439,14 +1439,14 @@
 
 // Add / subtract
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>;
 def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
 
 // Multiply / multiply-and-add/subtract
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFM(DB)?$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B(_CCPseudo)?$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>;
 def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>;
Index: llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -30,6 +30,11 @@
 
 using namespace llvm;
 
+static cl::opt<bool>
+EnableMachineCombinerPass("systemz-machine-combiner",
+                          cl::desc("Enable the machine combiner pass"),
+                          cl::init(true), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
   // Register the target.
   RegisterTargetMachine<SystemZTargetMachine> X(getTheSystemZTarget());
@@ -240,11 +245,16 @@
 
 bool SystemZPassConfig::addILPOpts() {
   addPass(&EarlyIfConverterID);
+
+  if (EnableMachineCombinerPass)
+    addPass(&MachineCombinerID);
+
   return true;
 }
 
 void SystemZPassConfig::addPreRegAlloc() {
   addPass(createSystemZCopyPhysRegsPass(getSystemZTargetMachine()));
+  addPass(createSystemZFinalizeReassociationPass(getSystemZTargetMachine()));
 }
 
 void SystemZPassConfig::addPostRewrite() {
Index: llvm/lib/Target/X86/X86InstrInfo.h
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.h
+++ llvm/lib/Target/X86/X86InstrInfo.h
@@ -543,7 +543,7 @@
   /// instruction that defines FoldAsLoadDefReg, and the function returns
   /// the machine instruction generated due to folding.
   MachineInstr *optimizeLoadInstr(MachineInstr &MI,
-                                  const MachineRegisterInfo *MRI,
+                                  MachineRegisterInfo *MRI,
                                   Register &FoldAsLoadDefReg,
                                   MachineInstr *&DefMI) const override;
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4739,7 +4739,7 @@
 /// register, the virtual register is used once in the same BB, and the
 /// instructions in-between do not load or store, and have no side effects.
 MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
-                                              const MachineRegisterInfo *MRI,
+                                              MachineRegisterInfo *MRI,
                                               Register &FoldAsLoadDefReg,
                                               MachineInstr *&DefMI) const {
   // Check whether we can move DefMI here.
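The combiner is enabled by default but can be toggled with the new hidden option, which is handy when bisecting codegen differences. An illustrative RUN line in the style of the tests below (the NOCOMBINE prefix is hypothetical):

```
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 \
; RUN:   -systemz-machine-combiner=false | FileCheck %s --check-prefix=NOCOMBINE
```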
Index: llvm/test/CodeGen/SystemZ/fp-mul-02.ll =================================================================== --- llvm/test/CodeGen/SystemZ/fp-mul-02.ll +++ llvm/test/CodeGen/SystemZ/fp-mul-02.ll @@ -1,6 +1,6 @@ ; Test multiplication of two f32s, producing an f64 result. ; -; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s declare float @foo() @@ -201,3 +201,13 @@ ret float %trunc9 } + +; Check that reassociation flags do not get in the way of mdebr. +define double @f8(float %Src) { +; CHECK-LABEL: f8: +; CHECK: mdebr %f0, %f0 +; CHECK: br %r14 + %D = fpext float %Src to double + %res = fmul reassoc nsz arcp contract afn double %D, %D + ret double %res +} Index: llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll @@ -0,0 +1,688 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs \ +; RUN: | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -stop-before=processimpdefs \ +; RUN: | FileCheck %s --check-prefix=PASSOUTPUT + +define double @fun0_fadd(ptr %x) { +; CHECK-LABEL: fun0_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld %f0, 0(%r2) +; CHECK-NEXT: adb %f0, 8(%r2) +; CHECK-NEXT: ld %f1, 24(%r2) +; CHECK-NEXT: adb %f1, 16(%r2) +; CHECK-NEXT: adbr %f0, %f1 +; CHECK-NEXT: ld %f1, 40(%r2) +; CHECK-NEXT: adb %f1, 32(%r2) +; CHECK-NEXT: adb %f1, 48(%r2) +; CHECK-NEXT: adbr %f0, %f1 +; CHECK-NEXT: adb %f0, 56(%r2) +; CHECK-NEXT: br %r14 + +; PASSOUTPUT: name: fun0_fadd +; PASSOUTPUT-NOT: WFADB +; PASSOUTPUT: WFADB killed %3, killed %18, implicit $fpc +; PASSOUTPUT-NOT: WFADB {{.*}}$cc +; PASSOUTPUT-NOT: WFADB_CCPseudo +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn double %add3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4 + %4 = load double, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn double %add5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5 + %5 = load double, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn double %add7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6 + %6 = load double, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn double %add9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7 + %7 = load double, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn double %add11, %7 + ret double %add13 +} + +define float @fun1_fadd(ptr %x) { +; CHECK-LABEL: fun1_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lde %f0, 0(%r2) +; CHECK-NEXT: aeb %f0, 4(%r2) +; CHECK-NEXT: lde %f1, 12(%r2) +; CHECK-NEXT: aeb %f1, 8(%r2) +; CHECK-NEXT: aebr %f0, %f1 +; CHECK-NEXT: lde %f1, 20(%r2) +; CHECK-NEXT: aeb %f1, 16(%r2) +; CHECK-NEXT: aeb %f1, 24(%r2) +; CHECK-NEXT: aebr %f0, %f1 +; CHECK-NEXT: aeb %f0, 28(%r2) +; CHECK-NEXT: br %r14 + +; PASSOUTPUT: name: fun1_fadd +; PASSOUTPUT-NOT: WFASB +; PASSOUTPUT: WFASB killed %3, 
killed %18, implicit $fpc +; PASSOUTPUT-NOT: WFASB {{.*}}$cc +; PASSOUTPUT-NOT: WFASB_CCPseudo +entry: + %0 = load float, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn float %add, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn float %add3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4 + %4 = load float, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn float %add5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5 + %5 = load float, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn float %add7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6 + %6 = load float, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn float %add9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7 + %7 = load float, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn float %add11, %7 + ret float %add13 +} + +define fp128 @fun2_fadd(ptr %x) { +; CHECK-LABEL: fun2_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 16(%r3), 3 +; CHECK-NEXT: wfaxb %v0, %v1, %v0 +; CHECK-NEXT: vl %v1, 32(%r3), 3 +; CHECK-NEXT: vl %v2, 48(%r3), 3 +; CHECK-NEXT: wfaxb %v1, %v1, %v2 +; CHECK-NEXT: wfaxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r3), 3 +; CHECK-NEXT: vl %v2, 80(%r3), 3 +; CHECK-NEXT: wfaxb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r3), 3 +; CHECK-NEXT: wfaxb %v1, %v1, %v2 +; CHECK-NEXT: wfaxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r3), 3 +; CHECK-NEXT: wfaxb %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load fp128, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1 + %1 = load fp128, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn fp128 %1, %0 + %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2 + %2 = load fp128, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn fp128 %add, %2 + %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3 + %3 = load fp128, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn fp128 %add3, %3 + %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4 + %4 = load fp128, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn fp128 %add5, %4 + %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5 + %5 = load fp128, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn fp128 %add7, %5 + %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6 + %6 = load fp128, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn fp128 %add9, %6 + %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7 + %7 = load fp128, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn fp128 %add11, %7 + ret fp128 %add13 +} + +define <2 x double> @fun3_fadd(ptr %x) { +; CHECK-LABEL: fun3_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: vfadb %v0, %v1, %v0 +; CHECK-NEXT: vl %v1, 32(%r2), 3 +; CHECK-NEXT: vl %v2, 48(%r2), 3 +; CHECK-NEXT: vfadb %v1, %v1, %v2 +; CHECK-NEXT: vfadb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r2), 3 +; 
CHECK-NEXT: vl %v2, 80(%r2), 3 +; CHECK-NEXT: vfadb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r2), 3 +; CHECK-NEXT: vfadb %v1, %v1, %v2 +; CHECK-NEXT: vfadb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r2), 3 +; CHECK-NEXT: vfadb %v24, %v0, %v1 +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x double>, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1 + %1 = load <2 x double>, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn <2 x double> %1, %0 + %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2 + %2 = load <2 x double>, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn <2 x double> %add, %2 + %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3 + %3 = load <2 x double>, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn <2 x double> %add3, %3 + %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4 + %4 = load <2 x double>, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn <2 x double> %add5, %4 + %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5 + %5 = load <2 x double>, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn <2 x double> %add7, %5 + %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6 + %6 = load <2 x double>, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn <2 x double> %add9, %6 + %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7 + %7 = load <2 x double>, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn <2 x double> %add11, %7 + ret <2 x double> %add13 +} + +define <4 x float> @fun4_fadd(ptr %x) { +; CHECK-LABEL: fun4_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: vfasb %v0, %v1, %v0 +; CHECK-NEXT: vl %v1, 32(%r2), 3 +; CHECK-NEXT: vl %v2, 48(%r2), 3 +; CHECK-NEXT: vfasb %v1, %v1, %v2 +; CHECK-NEXT: vfasb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r2), 3 +; CHECK-NEXT: vl %v2, 80(%r2), 3 +; CHECK-NEXT: vfasb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r2), 3 +; CHECK-NEXT: vfasb %v1, %v1, %v2 +; CHECK-NEXT: vfasb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r2), 3 +; CHECK-NEXT: vfasb %v24, %v0, %v1 +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x float>, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1 + %1 = load <4 x float>, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn <4 x float> %1, %0 + %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2 + %2 = load <4 x float>, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn <4 x float> %add, %2 + %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3 + %3 = load <4 x float>, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn <4 x float> %add3, %3 + %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4 + %4 = load <4 x float>, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn <4 x float> %add5, %4 + %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5 + %5 = load <4 x float>, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn <4 x float> %add7, %5 + %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6 + %6 = load <4 x float>, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn <4 x float> %add9, %6 + %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7 + %7 = load <4 x float>, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp 
contract afn <4 x float> %add11, %7 + ret <4 x float> %add13 +} + +define double @fun5_fsub(ptr %x) { +; CHECK-LABEL: fun5_fsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld %f0, 0(%r2) +; CHECK-NEXT: sdb %f0, 8(%r2) +; CHECK-NEXT: ld %f1, 24(%r2) +; CHECK-NEXT: adb %f1, 16(%r2) +; CHECK-NEXT: sdbr %f0, %f1 +; CHECK-NEXT: ld %f1, 40(%r2) +; CHECK-NEXT: adb %f1, 32(%r2) +; CHECK-NEXT: adb %f1, 48(%r2) +; CHECK-NEXT: sdbr %f0, %f1 +; CHECK-NEXT: sdb %f0, 56(%r2) +; CHECK-NEXT: br %r14 + +; PASSOUTPUT: name: fun5_fsub +; PASSOUTPUT-NOT: WFSDB +; PASSOUTPUT: WFSDB killed %3, killed %18, implicit $fpc +; PASSOUTPUT-NOT: WFSDB {{.*}}$cc +; PASSOUTPUT-NOT: WFSDB_CCPseudo +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %sub = fsub reassoc nsz arcp contract afn double %0, %1 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %sub3 = fsub reassoc nsz arcp contract afn double %sub, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %sub5 = fsub reassoc nsz arcp contract afn double %sub3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4 + %4 = load double, ptr %arrayidx6, align 8 + %sub7 = fsub reassoc nsz arcp contract afn double %sub5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5 + %5 = load double, ptr %arrayidx8, align 8 + %sub9 = fsub reassoc nsz arcp contract afn double %sub7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6 + %6 = load double, ptr %arrayidx10, align 8 + %sub11 = fsub reassoc nsz arcp contract afn double %sub9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7 + %7 = load double, ptr %arrayidx12, align 8 + %sub13 = fsub reassoc nsz arcp contract afn double %sub11, %7 + ret double %sub13 +} + +define float @fun6_fsub(ptr %x) { +; CHECK-LABEL: fun6_fsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lde %f0, 0(%r2) +; CHECK-NEXT: seb %f0, 4(%r2) +; CHECK-NEXT: lde %f1, 12(%r2) +; CHECK-NEXT: aeb %f1, 8(%r2) +; CHECK-NEXT: sebr %f0, %f1 +; CHECK-NEXT: lde %f1, 20(%r2) +; CHECK-NEXT: aeb %f1, 16(%r2) +; CHECK-NEXT: aeb %f1, 24(%r2) +; CHECK-NEXT: sebr %f0, %f1 +; CHECK-NEXT: seb %f0, 28(%r2) +; CHECK-NEXT: br %r14 + +; PASSOUTPUT: name: fun6_fsub +; PASSOUTPUT-NOT: WFSSB +; PASSOUTPUT: WFSSB killed %3, killed %18, implicit $fpc +; PASSOUTPUT-NOT: WFSSB {{.*}}$cc +; PASSOUTPUT-NOT: WFSSB_CCPseudo +entry: + %0 = load float, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 8 + %sub = fsub reassoc nsz arcp contract afn float %0, %1 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 8 + %sub3 = fsub reassoc nsz arcp contract afn float %sub, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 8 + %sub5 = fsub reassoc nsz arcp contract afn float %sub3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4 + %4 = load float, ptr %arrayidx6, align 8 + %sub7 = fsub reassoc nsz arcp contract afn float %sub5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5 + %5 = load float, ptr %arrayidx8, align 8 + %sub9 = fsub reassoc nsz arcp contract afn float %sub7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6 + %6 = load float, ptr %arrayidx10, align 8 + %sub11 = fsub reassoc nsz arcp contract afn float %sub9, %6 + %arrayidx12 = 
getelementptr inbounds float, ptr %x, i64 7 + %7 = load float, ptr %arrayidx12, align 8 + %sub13 = fsub reassoc nsz arcp contract afn float %sub11, %7 + ret float %sub13 +} + +define fp128 @fun7_fsub(ptr %x) { +; CHECK-LABEL: fun7_fsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 16(%r3), 3 +; CHECK-NEXT: wfsxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 32(%r3), 3 +; CHECK-NEXT: vl %v2, 48(%r3), 3 +; CHECK-NEXT: wfaxb %v1, %v1, %v2 +; CHECK-NEXT: wfsxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r3), 3 +; CHECK-NEXT: vl %v2, 80(%r3), 3 +; CHECK-NEXT: wfaxb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r3), 3 +; CHECK-NEXT: wfaxb %v1, %v1, %v2 +; CHECK-NEXT: wfsxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r3), 3 +; CHECK-NEXT: wfsxb %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load fp128, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1 + %1 = load fp128, ptr %arrayidx1, align 8 + %sub = fsub reassoc nsz arcp contract afn fp128 %0, %1 + %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2 + %2 = load fp128, ptr %arrayidx2, align 8 + %sub3 = fsub reassoc nsz arcp contract afn fp128 %sub, %2 + %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3 + %3 = load fp128, ptr %arrayidx4, align 8 + %sub5 = fsub reassoc nsz arcp contract afn fp128 %sub3, %3 + %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4 + %4 = load fp128, ptr %arrayidx6, align 8 + %sub7 = fsub reassoc nsz arcp contract afn fp128 %sub5, %4 + %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5 + %5 = load fp128, ptr %arrayidx8, align 8 + %sub9 = fsub reassoc nsz arcp contract afn fp128 %sub7, %5 + %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6 + %6 = load fp128, ptr %arrayidx10, align 8 + %sub11 = fsub reassoc nsz arcp contract afn fp128 %sub9, %6 + %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7 + %7 = load fp128, ptr %arrayidx12, align 8 + %sub13 = fsub reassoc nsz arcp contract afn fp128 %sub11, %7 + ret fp128 %sub13 +} + +define <2 x double> @fun8_fsub(ptr %x) { +; CHECK-LABEL: fun8_fsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: vfsdb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 32(%r2), 3 +; CHECK-NEXT: vl %v2, 48(%r2), 3 +; CHECK-NEXT: vfadb %v1, %v1, %v2 +; CHECK-NEXT: vfsdb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r2), 3 +; CHECK-NEXT: vl %v2, 80(%r2), 3 +; CHECK-NEXT: vfadb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r2), 3 +; CHECK-NEXT: vfadb %v1, %v1, %v2 +; CHECK-NEXT: vfsdb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r2), 3 +; CHECK-NEXT: vfsdb %v24, %v0, %v1 +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x double>, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1 + %1 = load <2 x double>, ptr %arrayidx1, align 8 + %sub = fsub reassoc nsz arcp contract afn <2 x double> %0, %1 + %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2 + %2 = load <2 x double>, ptr %arrayidx2, align 8 + %sub3 = fsub reassoc nsz arcp contract afn <2 x double> %sub, %2 + %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3 + %3 = load <2 x double>, ptr %arrayidx4, align 8 + %sub5 = fsub reassoc nsz arcp contract afn <2 x double> %sub3, %3 + %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4 + %4 = load <2 x double>, ptr %arrayidx6, align 8 + %sub7 = fsub reassoc nsz arcp contract afn <2 x double> %sub5, %4 + %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5 + %5 = load <2 x 
double>, ptr %arrayidx8, align 8 + %sub9 = fsub reassoc nsz arcp contract afn <2 x double> %sub7, %5 + %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6 + %6 = load <2 x double>, ptr %arrayidx10, align 8 + %sub11 = fsub reassoc nsz arcp contract afn <2 x double> %sub9, %6 + %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7 + %7 = load <2 x double>, ptr %arrayidx12, align 8 + %sub13 = fsub reassoc nsz arcp contract afn <2 x double> %sub11, %7 + ret <2 x double> %sub13 +} + +define <4 x float> @fun9_fsub(ptr %x) { +; CHECK-LABEL: fun9_fsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: vfssb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 32(%r2), 3 +; CHECK-NEXT: vl %v2, 48(%r2), 3 +; CHECK-NEXT: vfasb %v1, %v1, %v2 +; CHECK-NEXT: vfssb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r2), 3 +; CHECK-NEXT: vl %v2, 80(%r2), 3 +; CHECK-NEXT: vfasb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r2), 3 +; CHECK-NEXT: vfasb %v1, %v1, %v2 +; CHECK-NEXT: vfssb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r2), 3 +; CHECK-NEXT: vfssb %v24, %v0, %v1 +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x float>, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1 + %1 = load <4 x float>, ptr %arrayidx1, align 8 + %sub = fsub reassoc nsz arcp contract afn <4 x float> %0, %1 + %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2 + %2 = load <4 x float>, ptr %arrayidx2, align 8 + %sub3 = fsub reassoc nsz arcp contract afn <4 x float> %sub, %2 + %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3 + %3 = load <4 x float>, ptr %arrayidx4, align 8 + %sub5 = fsub reassoc nsz arcp contract afn <4 x float> %sub3, %3 + %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4 + %4 = load <4 x float>, ptr %arrayidx6, align 8 + %sub7 = fsub reassoc nsz arcp contract afn <4 x float> %sub5, %4 + %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5 + %5 = load <4 x float>, ptr %arrayidx8, align 8 + %sub9 = fsub reassoc nsz arcp contract afn <4 x float> %sub7, %5 + %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6 + %6 = load <4 x float>, ptr %arrayidx10, align 8 + %sub11 = fsub reassoc nsz arcp contract afn <4 x float> %sub9, %6 + %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7 + %7 = load <4 x float>, ptr %arrayidx12, align 8 + %sub13 = fsub reassoc nsz arcp contract afn <4 x float> %sub11, %7 + ret <4 x float> %sub13 +} + +define double @fun10_fmul(ptr %x) { +; CHECK-LABEL: fun10_fmul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld %f0, 8(%r2) +; CHECK-NEXT: mdb %f0, 0(%r2) +; CHECK-NEXT: ld %f1, 24(%r2) +; CHECK-NEXT: mdb %f1, 16(%r2) +; CHECK-NEXT: mdbr %f0, %f1 +; CHECK-NEXT: ld %f1, 40(%r2) +; CHECK-NEXT: mdb %f1, 32(%r2) +; CHECK-NEXT: mdb %f1, 48(%r2) +; CHECK-NEXT: mdbr %f0, %f1 +; CHECK-NEXT: mdb %f0, 56(%r2) +; CHECK-NEXT: br %r14 + +; PASSOUTPUT: name: fun10_fmul +; PASSOUTPUT-NOT: WFMDB +; PASSOUTPUT: WFMDB killed %3, killed %18, implicit $fpc +; PASSOUTPUT-NOT: WFMDB {{.*}}$cc +; PASSOUTPUT-NOT: WFMDB_CCPseudo +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %0, %1 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, 
ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4 + %4 = load double, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5 + %5 = load double, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6 + %6 = load double, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7 + %7 = load double, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7 + ret double %mul13 +} + +define float @fun11_fmul(ptr %x) { +; CHECK-LABEL: fun11_fmul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lde %f0, 4(%r2) +; CHECK-NEXT: meeb %f0, 0(%r2) +; CHECK-NEXT: lde %f1, 12(%r2) +; CHECK-NEXT: meeb %f1, 8(%r2) +; CHECK-NEXT: meebr %f0, %f1 +; CHECK-NEXT: lde %f1, 20(%r2) +; CHECK-NEXT: meeb %f1, 16(%r2) +; CHECK-NEXT: meeb %f1, 24(%r2) +; CHECK-NEXT: meebr %f0, %f1 +; CHECK-NEXT: meeb %f0, 28(%r2) +; CHECK-NEXT: br %r14 + +; PASSOUTPUT: name: fun11_fmul +; PASSOUTPUT-NOT: WFMSB +; PASSOUTPUT: WFMSB killed %3, killed %18, implicit $fpc +; PASSOUTPUT-NOT: WFMSB {{.*}}$cc +; PASSOUTPUT-NOT: WFMSB_CCPseudo +entry: + %0 = load float, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn float %0, %1 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4 + %4 = load float, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5 + %5 = load float, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6 + %6 = load float, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7 + %7 = load float, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7 + ret float %mul13 +} + +define fp128 @fun12_fmul(ptr %x) { +; CHECK-LABEL: fun12_fmul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 16(%r3), 3 +; CHECK-NEXT: wfmxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 32(%r3), 3 +; CHECK-NEXT: vl %v2, 48(%r3), 3 +; CHECK-NEXT: wfmxb %v1, %v1, %v2 +; CHECK-NEXT: wfmxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r3), 3 +; CHECK-NEXT: vl %v2, 80(%r3), 3 +; CHECK-NEXT: wfmxb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r3), 3 +; CHECK-NEXT: wfmxb %v1, %v1, %v2 +; CHECK-NEXT: wfmxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r3), 3 +; CHECK-NEXT: wfmxb %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load fp128, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1 + %1 = load fp128, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn fp128 %0, %1 + %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2 + %2 = load 
fp128, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn fp128 %mul, %2 + %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3 + %3 = load fp128, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn fp128 %mul3, %3 + %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4 + %4 = load fp128, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn fp128 %mul5, %4 + %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5 + %5 = load fp128, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn fp128 %mul7, %5 + %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6 + %6 = load fp128, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn fp128 %mul9, %6 + %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7 + %7 = load fp128, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn fp128 %mul11, %7 + ret fp128 %mul13 +} + +define <2 x double> @fun13_fmul(ptr %x) { +; CHECK-LABEL: fun13_fmul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: vfmdb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 32(%r2), 3 +; CHECK-NEXT: vl %v2, 48(%r2), 3 +; CHECK-NEXT: vfmdb %v1, %v1, %v2 +; CHECK-NEXT: vfmdb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r2), 3 +; CHECK-NEXT: vl %v2, 80(%r2), 3 +; CHECK-NEXT: vfmdb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r2), 3 +; CHECK-NEXT: vfmdb %v1, %v1, %v2 +; CHECK-NEXT: vfmdb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r2), 3 +; CHECK-NEXT: vfmdb %v24, %v0, %v1 +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x double>, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1 + %1 = load <2 x double>, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn <2 x double> %0, %1 + %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2 + %2 = load <2 x double>, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn <2 x double> %mul, %2 + %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3 + %3 = load <2 x double>, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn <2 x double> %mul3, %3 + %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4 + %4 = load <2 x double>, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn <2 x double> %mul5, %4 + %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5 + %5 = load <2 x double>, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn <2 x double> %mul7, %5 + %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6 + %6 = load <2 x double>, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn <2 x double> %mul9, %6 + %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7 + %7 = load <2 x double>, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn <2 x double> %mul11, %7 + ret <2 x double> %mul13 +} + +define <4 x float> @fun14_fmul(ptr %x) { +; CHECK-LABEL: fun14_fmul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: vfmsb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 32(%r2), 3 +; CHECK-NEXT: vl %v2, 48(%r2), 3 +; CHECK-NEXT: vfmsb %v1, %v1, %v2 +; CHECK-NEXT: vfmsb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r2), 3 +; CHECK-NEXT: vl %v2, 80(%r2), 3 +; CHECK-NEXT: vfmsb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r2), 3 +; CHECK-NEXT: vfmsb %v1, %v1, %v2 +; CHECK-NEXT: vfmsb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r2), 3 +; CHECK-NEXT: vfmsb %v24, 
%v0, %v1 +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x float>, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1 + %1 = load <4 x float>, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn <4 x float> %0, %1 + %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2 + %2 = load <4 x float>, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn <4 x float> %mul, %2 + %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3 + %3 = load <4 x float>, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn <4 x float> %mul3, %3 + %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4 + %4 = load <4 x float>, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn <4 x float> %mul5, %4 + %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5 + %5 = load <4 x float>, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn <4 x float> %mul7, %5 + %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6 + %6 = load <4 x float>, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn <4 x float> %mul9, %6 + %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7 + %7 = load <4 x float>, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn <4 x float> %mul11, %7 + ret <4 x float> %mul13 +}
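Finally, a note on the provenance of the flags used throughout these tests: the 'reassoc nsz arcp contract afn' set corresponds to clang's unsafe-math mode, so ordinary C code exercises the whole pipeline end to end. A standalone sketch (hypothetical invocation, not part of the patch):

```cpp
// C source yielding fadd chains like fun0_fadd above when built with, e.g.,
//   clang --target=s390x-linux-gnu -march=z15 -O2 \
//         -funsafe-math-optimizations -S sum8.c
// (any mode that sets at least the 'reassoc nsz' flags enables the combiner).
double sum8(const double *x) {
  return x[0] + x[1] + x[2] + x[3] + x[4] + x[5] + x[6] + x[7];
}
```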