Index: llvm/include/llvm/CodeGen/TargetInstrInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1630,7 +1630,7 @@
   /// instruction that defines FoldAsLoadDefReg, and the function returns
   /// the machine instruction generated due to folding.
   virtual MachineInstr *optimizeLoadInstr(MachineInstr &MI,
-                                          const MachineRegisterInfo *MRI,
+                                          MachineRegisterInfo *MRI,
                                           Register &FoldAsLoadDefReg,
                                           MachineInstr *&DefMI) const {
     return nullptr;
@@ -1647,6 +1647,11 @@
     return false;
   }
 
+  /// Allow any target-specific handling of instructions after the peephole
+  /// optimizer has run.
+  virtual bool processFunctionAfterPeepholeOpt(MachineFunction &MF) const {
+    return false;
+  }
+
   /// Return the number of u-operations the given machine
   /// instruction will be decoded to on the target cpu. The itinerary's
   /// IssueWidth is the number of microops that can be dispatched each
Index: llvm/lib/CodeGen/PeepholeOptimizer.cpp
===================================================================
--- llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -1815,6 +1815,8 @@
     }
   }
 
+  Changed |= TII->processFunctionAfterPeepholeOpt(MF);
+
   return Changed;
 }
Index: llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -345,6 +345,11 @@
   // Try to expand a boolean SELECT_CCMASK using an IPM sequence.
   SDValue expandSelectBoolean(SDNode *Node);
 
+  // Return true if N carries the fast-math flags required for reassociation.
+  bool hasReassocFlags(SDNode *N) const {
+    return N->getFlags().hasAllowReassociation() &&
+           N->getFlags().hasNoSignedZeros();
+  }
+
 public:
   static char ID;
Index: llvm/lib/Target/SystemZ/SystemZInstrFormats.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -5388,3 +5388,13 @@
                  [(set GR64:$end, (operator GR64:$start1, GR64:$start2,
                                             GR32:$char))]>;
 }
+
+multiclass BinaryVRRcAndCCPseudo<string mnemonic, bits<16> opcode,
+                                 SDPatternOperator operator, TypedReg tr1,
+                                 TypedReg tr2, bits<4> type = 0, bits<4> m5 = 0,
+                                 bits<4> m6 = 0, string fp_mnemonic = ""> {
+  def "" : BinaryVRRc<mnemonic, opcode, operator, tr1, tr2, type, m5, m6,
+                      fp_mnemonic>;
+  let Defs = [CC] in
+    def _CCPseudo : Pseudo<(outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
+                           []>;
+}
Index: llvm/lib/Target/SystemZ/SystemZInstrInfo.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -243,8 +243,14 @@
                     const DebugLoc &DL, Register DstReg,
                     ArrayRef<MachineOperand> Cond, Register TrueReg,
                     Register FalseReg) const override;
+  MachineInstr *optimizeLoadInstr(MachineInstr &MI,
+                                  MachineRegisterInfo *MRI,
+                                  Register &FoldAsLoadDefReg,
+                                  MachineInstr *&DefMI) const override;
   bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
                      MachineRegisterInfo *MRI) const override;
+  bool processFunctionAfterPeepholeOpt(MachineFunction &MF) const override;
+
   bool isPredicable(const MachineInstr &MI) const override;
   bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
                            unsigned ExtraPredCycles,
@@ -274,6 +280,12 @@
                         Register VReg) const override;
   MachineInstr *convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
                                       LiveIntervals *LIS) const override;
+
+  bool useMachineCombiner() const override { return true; }
+  bool isAssociativeAndCommutative(const MachineInstr &Inst,
+                                   bool Invert) const override;
+  std::optional<unsigned> getInverseOpcode(unsigned Opcode) const override;
+
   MachineInstr *
   foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
                         ArrayRef<unsigned> Ops,
Index: llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -610,6 +610,88 @@
     .addImm(CCValid).addImm(CCMask);
 }
 
+static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
+  if (OldMI->registerDefIsDead(SystemZ::CC)) {
+    MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
+    if (CCDef != nullptr)
+      CCDef->setIsDead(true);
+  }
+}
+
+static void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
+                           MachineInstr::MIFlag Flag) {
+  if (OldMI->getFlag(Flag))
+    NewMI->setFlag(Flag);
+}
+
+MachineInstr *SystemZInstrInfo::optimizeLoadInstr(MachineInstr &MI,
+                                                  MachineRegisterInfo *MRI,
+                                                  Register &FoldAsLoadDefReg,
+                                                  MachineInstr *&DefMI) const {
+  // Check whether we can move DefMI here.
+  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+  assert(DefMI);
+  bool SawStore = false;
+  if (!DefMI->isSafeToMove(nullptr, SawStore))
+    return nullptr;
+
+  // For reassociable FP operations, any loads have been purposefully left
+  // unfolded so that MachineCombiner can do its work on reg/reg
+  // opcodes. After that, as many loads as possible are now folded.
+  unsigned LoadOpcode = 0;
+  unsigned RegMemOpcode = 0;
+  const TargetRegisterClass *FPRC = nullptr;
+  if (MI.getOpcode() == SystemZ::WFADB_CCPseudo ||
+      MI.getOpcode() == SystemZ::WFSDB_CCPseudo ||
+      MI.getOpcode() == SystemZ::WFMDB_CCPseudo) {
+    RegMemOpcode = MI.getOpcode() == SystemZ::WFADB_CCPseudo   ? SystemZ::ADB
+                   : MI.getOpcode() == SystemZ::WFSDB_CCPseudo ? SystemZ::SDB
+                                                               : SystemZ::MDB;
+    LoadOpcode = SystemZ::VL64;
+    FPRC = &SystemZ::FP64BitRegClass;
+  } else if (MI.getOpcode() == SystemZ::WFASB_CCPseudo ||
+             MI.getOpcode() == SystemZ::WFSSB_CCPseudo ||
+             MI.getOpcode() == SystemZ::WFMSB_CCPseudo) {
+    RegMemOpcode = MI.getOpcode() == SystemZ::WFASB_CCPseudo   ? SystemZ::AEB
+                   : MI.getOpcode() == SystemZ::WFSSB_CCPseudo ? SystemZ::SEB
+                                                               : SystemZ::MEEB;
+    LoadOpcode = SystemZ::VL32;
+    FPRC = &SystemZ::FP32BitRegClass;
+  } else
+    return nullptr;
+
+  if (DefMI->getOpcode() == LoadOpcode &&
+      MRI->hasOneNonDBGUse(FoldAsLoadDefReg)) {
+    MachineBasicBlock *MBB = MI.getParent();
+    Register DstReg = MI.getOperand(0).getReg();
+    MachineOperand LHS = MI.getOperand(1);
+    MachineOperand RHS = MI.getOperand(2);
+    bool MemInRHS = RHS.getReg() == FoldAsLoadDefReg;
+    // Subtraction is not commutative, so the load can only be folded as the
+    // second (memory) operand.
+    if (!MemInRHS &&
+        (RegMemOpcode == SystemZ::SDB || RegMemOpcode == SystemZ::SEB))
+      return nullptr;
+    MachineOperand &RegMO = MemInRHS ? LHS : RHS;
+    // Only use the 2-address reg/mem form if there is no other use of RegMO
+    // in MBB.
+    for (auto &UseMI : MRI->use_nodbg_instructions(RegMO.getReg()))
+      if (UseMI.getParent() == MBB && &UseMI != &MI)
+        return nullptr;
+
+    MachineInstrBuilder MIB =
+        BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(RegMemOpcode),
+                DstReg)
+            .add(RegMO)
+            .add(DefMI->getOperand(1))
+            .add(DefMI->getOperand(2))
+            .add(DefMI->getOperand(3))
+            .addMemOperand(*DefMI->memoperands_begin());
+    transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
+    MRI->setRegClass(RegMO.getReg(), FPRC);
+    MRI->setRegClass(DstReg, FPRC);
+    return MIB;
+  }
+
+  return nullptr;
+}
+
 bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                      Register Reg,
                                      MachineRegisterInfo *MRI) const {
@@ -674,6 +756,29 @@
   return true;
 }
 
+bool SystemZInstrInfo::processFunctionAfterPeepholeOpt(
+    MachineFunction &MF) const {
+  bool Changed = false;
+  for (auto &MBB : MF)
+    for (auto &MI : MBB) {
+      unsigned PseudoOpcode = MI.getOpcode();
+      unsigned TargetOpcode =
+          PseudoOpcode == SystemZ::WFADB_CCPseudo   ? SystemZ::WFADB
+          : PseudoOpcode == SystemZ::WFASB_CCPseudo ? SystemZ::WFASB
+          : PseudoOpcode == SystemZ::WFSDB_CCPseudo ? SystemZ::WFSDB
+          : PseudoOpcode == SystemZ::WFSSB_CCPseudo ? SystemZ::WFSSB
+          : PseudoOpcode == SystemZ::WFMDB_CCPseudo ? SystemZ::WFMDB
+          : PseudoOpcode == SystemZ::WFMSB_CCPseudo ? SystemZ::WFMSB
+                                                    : 0;
+      if (TargetOpcode) {
+        // Rewrite the CC-clobbering pseudo to the real opcode and drop the
+        // now-unneeded implicit CC def.
+        MI.setDesc(get(TargetOpcode));
+        MI.removeOperand(3); // CC
+        Changed = true;
+      }
+    }
+  return Changed;
+}
+
 bool SystemZInstrInfo::isPredicable(const MachineInstr &MI) const {
   unsigned Opcode = MI.getOpcode();
   if (Opcode == SystemZ::Return ||
@@ -937,20 +1042,6 @@
   }
 }
 
-static void transferDeadCC(MachineInstr *OldMI, MachineInstr *NewMI) {
-  if (OldMI->registerDefIsDead(SystemZ::CC)) {
-    MachineOperand *CCDef = NewMI->findRegisterDefOperand(SystemZ::CC);
-    if (CCDef != nullptr)
-      CCDef->setIsDead(true);
-  }
-}
-
-static void transferMIFlag(MachineInstr *OldMI, MachineInstr *NewMI,
-                           MachineInstr::MIFlag Flag) {
-  if (OldMI->getFlag(Flag))
-    NewMI->setFlag(Flag);
-}
-
 MachineInstr *
 SystemZInstrInfo::convertToThreeAddress(MachineInstr &MI, LiveVariables *LV,
                                         LiveIntervals *LIS) const {
@@ -1003,6 +1094,64 @@
   return nullptr;
 }
 
+bool SystemZInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst,
+                                                   bool Invert) const {
+  unsigned Opc = Inst.getOpcode();
+  if (Invert) {
+    auto InverseOpcode = getInverseOpcode(Opc);
+    if (!InverseOpcode)
+      return false;
+    Opc = *InverseOpcode;
+  }
+
+  switch (Opc) {
+  default:
+    break;
+  case SystemZ::VFADB:
+  case SystemZ::VFASB:
+  case SystemZ::WFAXB:
+  case SystemZ::WFADB_CCPseudo:
+  case SystemZ::WFASB_CCPseudo:
+  case SystemZ::VFMDB:
+  case SystemZ::VFMSB:
+  case SystemZ::WFMXB:
+  case SystemZ::WFMDB_CCPseudo:
+  case SystemZ::WFMSB_CCPseudo:
+    return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+           Inst.getFlag(MachineInstr::MIFlag::FmNsz);
+  }
+
+  return false;
+}
+
+std::optional<unsigned>
+SystemZInstrInfo::getInverseOpcode(unsigned Opcode) const {
+  switch (Opcode) {
+  default:
+    return std::nullopt;
+  case SystemZ::VFADB:
+    return SystemZ::VFSDB;
+  case SystemZ::VFASB:
+    return SystemZ::VFSSB;
+  case SystemZ::WFAXB:
+    return SystemZ::WFSXB;
+  case SystemZ::WFADB_CCPseudo:
+    return SystemZ::WFSDB_CCPseudo;
+  case SystemZ::WFASB_CCPseudo:
+    return SystemZ::WFSSB_CCPseudo;
+  case SystemZ::VFSDB:
+    return SystemZ::VFADB;
+  case SystemZ::VFSSB:
+    return SystemZ::VFASB;
+  case SystemZ::WFSXB:
+    return SystemZ::WFAXB;
+  case SystemZ::WFSDB_CCPseudo:
+    return SystemZ::WFADB_CCPseudo;
+  case SystemZ::WFSSB_CCPseudo:
+    return SystemZ::WFASB_CCPseudo;
+  }
+}
+
 MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
     MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
     MachineBasicBlock::iterator InsertPt, int FrameIndex,
Index: llvm/lib/Target/SystemZ/SystemZInstrVector.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -139,7 +139,7 @@
   // LEY and LDY offer full 20-bit displacement fields. It's often better
   // to use those instructions rather than force a 20-bit displacement
   // into a GPR temporary.
-  let mayLoad = 1 in {
+  let mayLoad = 1, canFoldAsLoad = 1 in {
     def VL32 : UnaryAliasVRX<load, v32sb, bdxaddr12pair>;
     def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>;
   }
@@ -1045,17 +1045,26 @@
 let Predicates = [FeatureVector] in {
   // Add.
   let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
-    def VFA   : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
-    def VFADB : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
-    def WFADB : BinaryVRRc<"wfadb", 0xE7E3, any_fadd, v64db, v64db, 3, 8, 0,
-                           "adbr">;
+    def VFA    : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
+    def VFADB  : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
+    defm WFADB : BinaryVRRcAndCCPseudo<"wfadb", 0xE7E3, any_fadd, v64db, v64db,
+                                       3, 8, 0, "adbr">;
     let Predicates = [FeatureVectorEnhancements1] in {
-      def VFASB : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
-      def WFASB : BinaryVRRc<"wfasb", 0xE7E3, any_fadd, v32sb, v32sb, 2, 8, 0,
-                             "aebr">;
-      def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>;
+      def VFASB  : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
+      defm WFASB : BinaryVRRcAndCCPseudo<"wfasb", 0xE7E3, any_fadd, v32sb,
+                                         v32sb, 2, 8, 0, "aebr">;
+      def WFAXB  : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>;
     }
   }
+  // Wait with reg/mem folding if reassociation is allowed, since
+  // MachineCombiner would otherwise not succeed. Use a reg/reg pseudo that
+  // clobbers CC to help later load folding done by the peephole
+  // optimizer. The AddedComplexity makes sure it wins over the reg/mem
+  // pattern.
+  let AddedComplexity = 15 in {
+    def : FAddReassoc<WFADB_CCPseudo, v64db>;
+    def : FAddReassoc<WFASB_CCPseudo, v32sb>;
+  }
 
   // Convert from fixed.
   let Uses = [FPC], mayRaiseFPException = 1 in {
@@ -1258,17 +1267,22 @@
 
   // Multiply.
   let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
-    def VFM   : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>;
-    def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, any_fmul, v128db, v128db, 3, 0>;
-    def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, any_fmul, v64db, v64db, 3, 8, 0,
-                           "mdbr">;
+    def VFM    : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>;
+    def VFMDB  : BinaryVRRc<"vfmdb", 0xE7E7, any_fmul, v128db, v128db, 3, 0>;
+    defm WFMDB : BinaryVRRcAndCCPseudo<"wfmdb", 0xE7E7, any_fmul, v64db, v64db,
+                                       3, 8, 0, "mdbr">;
     let Predicates = [FeatureVectorEnhancements1] in {
-      def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, any_fmul, v128sb, v128sb, 2, 0>;
-      def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, any_fmul, v32sb, v32sb, 2, 8, 0,
-                             "meebr">;
-      def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, any_fmul, v128xb, v128xb, 4, 8>;
+      def VFMSB  : BinaryVRRc<"vfmsb", 0xE7E7, any_fmul, v128sb, v128sb, 2, 0>;
+      defm WFMSB : BinaryVRRcAndCCPseudo<"wfmsb", 0xE7E7, any_fmul, v32sb,
+                                         v32sb, 2, 8, 0, "meebr">;
+      def WFMXB  : BinaryVRRc<"wfmxb", 0xE7E7, any_fmul, v128xb, v128xb, 4, 8>;
     }
   }
+  // Same as with addition, see above.
+  let AddedComplexity = 15 in {
+    def : FMulReassoc<WFMDB_CCPseudo, v64db>;
+    def : FMulReassoc<WFMSB_CCPseudo, v32sb>;
+  }
 
   // Multiply and add.
   let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
@@ -1373,17 +1387,22 @@
 
   // Subtract.
   let Uses = [FPC], mayRaiseFPException = 1 in {
-    def VFS   : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
-    def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>;
-    def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, any_fsub, v64db, v64db, 3, 8, 0,
-                           "sdbr">;
+    def VFS    : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
+    def VFSDB  : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>;
+    defm WFSDB : BinaryVRRcAndCCPseudo<"wfsdb", 0xE7E2, any_fsub, v64db, v64db,
+                                       3, 8, 0, "sdbr">;
     let Predicates = [FeatureVectorEnhancements1] in {
-      def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>;
-      def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, any_fsub, v32sb, v32sb, 2, 8, 0,
-                             "sebr">;
-      def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>;
+      def VFSSB  : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>;
+      defm WFSSB : BinaryVRRcAndCCPseudo<"wfssb", 0xE7E2, any_fsub, v32sb,
+                                         v32sb, 2, 8, 0, "sebr">;
+      def WFSXB  : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>;
     }
   }
+  // Same as with addition, see above.
+  let AddedComplexity = 15 in {
+    def : FSubReassoc<WFSDB_CCPseudo, v64db>;
+    def : FSubReassoc<WFSSB_CCPseudo, v32sb>;
+  }
 
   // Test data class immediate.
   let Defs = [CC] in {
Index: llvm/lib/Target/SystemZ/SystemZOperators.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZOperators.td
+++ llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -700,6 +700,17 @@
 // Floating-point negative absolute.
 def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>;
 
+// Floating-point operations which are reassociable, i.e. carry both the
+// reassoc and nsz fast-math flags.
+def z_fadd_reassoc : PatFrag<(ops node:$src1, node:$src2),
+                             (fadd node:$src1, node:$src2),
+                             [{ return hasReassocFlags(N); }]>;
+def z_fsub_reassoc : PatFrag<(ops node:$src1, node:$src2),
+                             (fsub node:$src1, node:$src2),
+                             [{ return hasReassocFlags(N); }]>;
+def z_fmul_reassoc : PatFrag<(ops node:$src1, node:$src2),
+                             (fmul node:$src1, node:$src2),
+                             [{ return hasReassocFlags(N); }]>;
+
 // Strict floating-point fragments.
 def z_any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
                           [(z_strict_fcmp node:$lhs, node:$rhs),
Index: llvm/lib/Target/SystemZ/SystemZPatterns.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZPatterns.td
+++ llvm/lib/Target/SystemZ/SystemZPatterns.td
@@ -144,6 +144,16 @@
   defm : BinaryLoadStore;
 }
 
+class FAddReassoc<Instruction insn, TypedReg tr>
+  : Pat<(tr.vt (z_fadd_reassoc tr.op:$R1, tr.op:$R2)),
+        (insn tr.op:$R1, tr.op:$R2)>;
+class FSubReassoc<Instruction insn, TypedReg tr>
+  : Pat<(tr.vt (z_fsub_reassoc tr.op:$R1, tr.op:$R2)),
+        (insn tr.op:$R1, tr.op:$R2)>;
+class FMulReassoc<Instruction insn, TypedReg tr>
+  : Pat<(tr.vt (z_fmul_reassoc tr.op:$R1, tr.op:$R2)),
+        (insn tr.op:$R1, tr.op:$R2)>;
+
 // Record that INSN is a LOAD AND TEST that can be used to compare
 // registers in CLS against zero. The instruction has separate R1 and R2
 // operands, but they must be the same when the instruction is used like this.
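Note (editorial illustration, not part of the patch): the z_f*_reassoc fragments above only match when the IR-level fadd/fsub/fmul carries both the reassoc and nsz fast-math flags, which is the same condition isAssociativeAndCommutative() re-checks on the MachineInstr. Assuming clang as the front end, a minimal way to get such input is -ffast-math:

  /* sum8.c -- hypothetical example. Built with 'clang -O3 -ffast-math',
     every fadd in the (fully unrolled) chain gets 'reassoc nsz', so it is
     selected to WFADB_CCPseudo and MachineCombiner can rebalance the serial
     chain into a tree, roughly ((a+b)+(c+d)) + ((e+f)+(g+h)).  */
  double sum8(const double *x) {
    double s = x[0];
    for (int i = 1; i < 8; ++i) // unrolled at -O3
      s += x[i];
    return s;
  }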
Index: llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ llvm/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -1346,12 +1346,12 @@
 
 // Add / subtract
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
 
 // Multiply / multiply-and-add/subtract
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMDB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFMDB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFMDB(_CCPseudo)?$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM(A|S)DB$")>;
 def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(A|S)DB$")>;
Index: llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
+++ llvm/lib/Target/SystemZ/SystemZScheduleZ14.td
@@ -1390,15 +1390,15 @@
 
 // Add / subtract
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)$")>;
 def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(A|S)SB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>;
 def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
 
 // Multiply / multiply-and-add/subtract
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFM$")>;
 def : InstRW<[WLat7, VecBF, NormalGr], (instregex "VFMDB$")>;
-def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(D|S)B$")>;
+def : InstRW<[WLat7, VecBF, NormalGr], (instregex "WFM(D|S)B(_CCPseudo)?$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VFMSB$")>;
 def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
 def : InstRW<[WLat8, VecBF2, NormalGr], (instregex "VF(N)?M(A|S)$")>;
Index: llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
+++ llvm/lib/Target/SystemZ/SystemZScheduleZ15.td
@@ -1433,14 +1433,14 @@
 
 // Add / subtract
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>;
 def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
 
 // Multiply / multiply-and-add/subtract
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFM(DB)?$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B(_CCPseudo)?$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>;
 def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>;
Index: llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
===================================================================
--- llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
+++ llvm/lib/Target/SystemZ/SystemZScheduleZ16.td
@@ -1439,14 +1439,14 @@
 
 // Add / subtract
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)DB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)DB(_CCPseudo)?$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(A|S)SB$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WF(A|S)SB(_CCPseudo)?$")>;
 def : InstRW<[WLat10, VecDF2, NormalGr], (instregex "WF(A|S)XB$")>;
 
 // Multiply / multiply-and-add/subtract
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFM(DB)?$")>;
-def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B$")>;
+def : InstRW<[WLat6, VecBF, NormalGr], (instregex "WFM(D|S)B(_CCPseudo)?$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VFMSB$")>;
 def : InstRW<[WLat20, VecDF2, NormalGr], (instregex "WFMXB$")>;
 def : InstRW<[WLat6, VecBF, NormalGr], (instregex "VF(N)?M(A|S)$")>;
Index: llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -30,6 +30,11 @@
 
 using namespace llvm;
 
+static cl::opt<bool>
+EnableMachineCombinerPass("systemz-machine-combiner",
+                          cl::desc("Enable the machine combiner pass"),
+                          cl::init(true), cl::Hidden);
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeSystemZTarget() {
   // Register the target.
   RegisterTargetMachine<SystemZTargetMachine> X(getTheSystemZTarget());
@@ -240,6 +245,10 @@
 
 bool SystemZPassConfig::addILPOpts() {
   addPass(&EarlyIfConverterID);
+
+  if (EnableMachineCombinerPass)
+    addPass(&MachineCombinerID);
+
   return true;
 }
Index: llvm/lib/Target/X86/X86InstrInfo.h
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.h
+++ llvm/lib/Target/X86/X86InstrInfo.h
@@ -541,7 +541,7 @@
   /// instruction that defines FoldAsLoadDefReg, and the function returns
   /// the machine instruction generated due to folding.
   MachineInstr *optimizeLoadInstr(MachineInstr &MI,
-                                  const MachineRegisterInfo *MRI,
+                                  MachineRegisterInfo *MRI,
                                   Register &FoldAsLoadDefReg,
                                   MachineInstr *&DefMI) const override;
 
Index: llvm/lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrInfo.cpp
+++ llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4679,7 +4679,7 @@
 /// register, the virtual register is used once in the same BB, and the
 /// instructions in-between do not load or store, and have no side effects.
 MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
-                                              const MachineRegisterInfo *MRI,
+                                              MachineRegisterInfo *MRI,
                                               Register &FoldAsLoadDefReg,
                                               MachineInstr *&DefMI) const {
   // Check whether we can move DefMI here.
Index: llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll @@ -0,0 +1,688 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -verify-machineinstrs \ +; RUN: | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 -stop-after=peephole-opt \ +; RUN: | FileCheck %s --check-prefix=PASSOUTPUT + +define double @fun0_fadd(ptr %x) { +; CHECK-LABEL: fun0_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld %f0, 0(%r2) +; CHECK-NEXT: adb %f0, 8(%r2) +; CHECK-NEXT: ld %f1, 24(%r2) +; CHECK-NEXT: adb %f1, 16(%r2) +; CHECK-NEXT: adbr %f0, %f1 +; CHECK-NEXT: ld %f1, 40(%r2) +; CHECK-NEXT: adb %f1, 32(%r2) +; CHECK-NEXT: adb %f1, 48(%r2) +; CHECK-NEXT: adbr %f0, %f1 +; CHECK-NEXT: adb %f0, 56(%r2) +; CHECK-NEXT: br %r14 + +; PASSOUTPUT: name: fun0_fadd +; PASSOUTPUT-NOT: WFADB +; PASSOUTPUT: WFADB killed %3, killed %18, implicit $fpc +; PASSOUTPUT-NOT: WFADB {{.*}}$cc +; PASSOUTPUT-NOT: WFADB_CCPseudo +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn double %add3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4 + %4 = load double, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn double %add5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5 + %5 = load double, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn double %add7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6 + %6 = load double, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn double %add9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7 + %7 = load double, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn double %add11, %7 + ret double %add13 +} + +define float @fun1_fadd(ptr %x) { +; CHECK-LABEL: fun1_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lde %f0, 0(%r2) +; CHECK-NEXT: aeb %f0, 4(%r2) +; CHECK-NEXT: lde %f1, 12(%r2) +; CHECK-NEXT: aeb %f1, 8(%r2) +; CHECK-NEXT: aebr %f0, %f1 +; CHECK-NEXT: lde %f1, 20(%r2) +; CHECK-NEXT: aeb %f1, 16(%r2) +; CHECK-NEXT: aeb %f1, 24(%r2) +; CHECK-NEXT: aebr %f0, %f1 +; CHECK-NEXT: aeb %f0, 28(%r2) +; CHECK-NEXT: br %r14 + +; PASSOUTPUT: name: fun1_fadd +; PASSOUTPUT-NOT: WFASB +; PASSOUTPUT: WFASB killed %3, killed %18, implicit $fpc +; PASSOUTPUT-NOT: WFASB {{.*}}$cc +; PASSOUTPUT-NOT: WFASB_CCPseudo +entry: + %0 = load float, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn float %add, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn float %add3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4 + %4 = load float, ptr %arrayidx6, align 8 + %add7 = fadd 
reassoc nsz arcp contract afn float %add5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5 + %5 = load float, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn float %add7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6 + %6 = load float, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn float %add9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7 + %7 = load float, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn float %add11, %7 + ret float %add13 +} + +define fp128 @fun2_fadd(ptr %x) { +; CHECK-LABEL: fun2_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 16(%r3), 3 +; CHECK-NEXT: wfaxb %v0, %v1, %v0 +; CHECK-NEXT: vl %v1, 32(%r3), 3 +; CHECK-NEXT: vl %v2, 48(%r3), 3 +; CHECK-NEXT: wfaxb %v1, %v1, %v2 +; CHECK-NEXT: wfaxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r3), 3 +; CHECK-NEXT: vl %v2, 80(%r3), 3 +; CHECK-NEXT: wfaxb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r3), 3 +; CHECK-NEXT: wfaxb %v1, %v1, %v2 +; CHECK-NEXT: wfaxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r3), 3 +; CHECK-NEXT: wfaxb %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load fp128, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1 + %1 = load fp128, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn fp128 %1, %0 + %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2 + %2 = load fp128, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn fp128 %add, %2 + %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3 + %3 = load fp128, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn fp128 %add3, %3 + %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4 + %4 = load fp128, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn fp128 %add5, %4 + %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5 + %5 = load fp128, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn fp128 %add7, %5 + %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6 + %6 = load fp128, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn fp128 %add9, %6 + %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7 + %7 = load fp128, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn fp128 %add11, %7 + ret fp128 %add13 +} + +define <2 x double> @fun3_fadd(ptr %x) { +; CHECK-LABEL: fun3_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: vfadb %v0, %v1, %v0 +; CHECK-NEXT: vl %v1, 32(%r2), 3 +; CHECK-NEXT: vl %v2, 48(%r2), 3 +; CHECK-NEXT: vfadb %v1, %v1, %v2 +; CHECK-NEXT: vfadb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r2), 3 +; CHECK-NEXT: vl %v2, 80(%r2), 3 +; CHECK-NEXT: vfadb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r2), 3 +; CHECK-NEXT: vfadb %v1, %v1, %v2 +; CHECK-NEXT: vfadb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r2), 3 +; CHECK-NEXT: vfadb %v24, %v0, %v1 +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x double>, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1 + %1 = load <2 x double>, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn <2 x double> %1, %0 + %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2 + %2 = load <2 x double>, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn <2 x double> %add, %2 + %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 
3 + %3 = load <2 x double>, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn <2 x double> %add3, %3 + %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4 + %4 = load <2 x double>, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn <2 x double> %add5, %4 + %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5 + %5 = load <2 x double>, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn <2 x double> %add7, %5 + %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6 + %6 = load <2 x double>, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn <2 x double> %add9, %6 + %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7 + %7 = load <2 x double>, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn <2 x double> %add11, %7 + ret <2 x double> %add13 +} + +define <4 x float> @fun4_fadd(ptr %x) { +; CHECK-LABEL: fun4_fadd: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: vfasb %v0, %v1, %v0 +; CHECK-NEXT: vl %v1, 32(%r2), 3 +; CHECK-NEXT: vl %v2, 48(%r2), 3 +; CHECK-NEXT: vfasb %v1, %v1, %v2 +; CHECK-NEXT: vfasb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r2), 3 +; CHECK-NEXT: vl %v2, 80(%r2), 3 +; CHECK-NEXT: vfasb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r2), 3 +; CHECK-NEXT: vfasb %v1, %v1, %v2 +; CHECK-NEXT: vfasb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r2), 3 +; CHECK-NEXT: vfasb %v24, %v0, %v1 +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x float>, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1 + %1 = load <4 x float>, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn <4 x float> %1, %0 + %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2 + %2 = load <4 x float>, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn <4 x float> %add, %2 + %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3 + %3 = load <4 x float>, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn <4 x float> %add3, %3 + %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4 + %4 = load <4 x float>, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn <4 x float> %add5, %4 + %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5 + %5 = load <4 x float>, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn <4 x float> %add7, %5 + %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6 + %6 = load <4 x float>, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn <4 x float> %add9, %6 + %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7 + %7 = load <4 x float>, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn <4 x float> %add11, %7 + ret <4 x float> %add13 +} + +define double @fun5_fsub(ptr %x) { +; CHECK-LABEL: fun5_fsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld %f0, 0(%r2) +; CHECK-NEXT: sdb %f0, 8(%r2) +; CHECK-NEXT: ld %f1, 24(%r2) +; CHECK-NEXT: adb %f1, 16(%r2) +; CHECK-NEXT: sdbr %f0, %f1 +; CHECK-NEXT: ld %f1, 40(%r2) +; CHECK-NEXT: adb %f1, 32(%r2) +; CHECK-NEXT: adb %f1, 48(%r2) +; CHECK-NEXT: sdbr %f0, %f1 +; CHECK-NEXT: sdb %f0, 56(%r2) +; CHECK-NEXT: br %r14 + +; PASSOUTPUT: name: fun5_fsub +; PASSOUTPUT-NOT: WFSDB +; PASSOUTPUT: WFSDB killed %3, killed %18, implicit $fpc +; PASSOUTPUT-NOT: WFSDB {{.*}}$cc +; PASSOUTPUT-NOT: WFSDB_CCPseudo +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = 
getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %sub = fsub reassoc nsz arcp contract afn double %0, %1 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %sub3 = fsub reassoc nsz arcp contract afn double %sub, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %sub5 = fsub reassoc nsz arcp contract afn double %sub3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4 + %4 = load double, ptr %arrayidx6, align 8 + %sub7 = fsub reassoc nsz arcp contract afn double %sub5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5 + %5 = load double, ptr %arrayidx8, align 8 + %sub9 = fsub reassoc nsz arcp contract afn double %sub7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6 + %6 = load double, ptr %arrayidx10, align 8 + %sub11 = fsub reassoc nsz arcp contract afn double %sub9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7 + %7 = load double, ptr %arrayidx12, align 8 + %sub13 = fsub reassoc nsz arcp contract afn double %sub11, %7 + ret double %sub13 +} + +define float @fun6_fsub(ptr %x) { +; CHECK-LABEL: fun6_fsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lde %f0, 0(%r2) +; CHECK-NEXT: seb %f0, 4(%r2) +; CHECK-NEXT: lde %f1, 12(%r2) +; CHECK-NEXT: aeb %f1, 8(%r2) +; CHECK-NEXT: sebr %f0, %f1 +; CHECK-NEXT: lde %f1, 20(%r2) +; CHECK-NEXT: aeb %f1, 16(%r2) +; CHECK-NEXT: aeb %f1, 24(%r2) +; CHECK-NEXT: sebr %f0, %f1 +; CHECK-NEXT: seb %f0, 28(%r2) +; CHECK-NEXT: br %r14 + +; PASSOUTPUT: name: fun6_fsub +; PASSOUTPUT-NOT: WFSSB +; PASSOUTPUT: WFSSB killed %3, killed %18, implicit $fpc +; PASSOUTPUT-NOT: WFSSB {{.*}}$cc +; PASSOUTPUT-NOT: WFSSB_CCPseudo +entry: + %0 = load float, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 8 + %sub = fsub reassoc nsz arcp contract afn float %0, %1 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 8 + %sub3 = fsub reassoc nsz arcp contract afn float %sub, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 8 + %sub5 = fsub reassoc nsz arcp contract afn float %sub3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4 + %4 = load float, ptr %arrayidx6, align 8 + %sub7 = fsub reassoc nsz arcp contract afn float %sub5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5 + %5 = load float, ptr %arrayidx8, align 8 + %sub9 = fsub reassoc nsz arcp contract afn float %sub7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6 + %6 = load float, ptr %arrayidx10, align 8 + %sub11 = fsub reassoc nsz arcp contract afn float %sub9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7 + %7 = load float, ptr %arrayidx12, align 8 + %sub13 = fsub reassoc nsz arcp contract afn float %sub11, %7 + ret float %sub13 +} + +define fp128 @fun7_fsub(ptr %x) { +; CHECK-LABEL: fun7_fsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 16(%r3), 3 +; CHECK-NEXT: wfsxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 32(%r3), 3 +; CHECK-NEXT: vl %v2, 48(%r3), 3 +; CHECK-NEXT: wfaxb %v1, %v1, %v2 +; CHECK-NEXT: wfsxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r3), 3 +; CHECK-NEXT: vl %v2, 80(%r3), 3 +; CHECK-NEXT: wfaxb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r3), 3 +; CHECK-NEXT: wfaxb %v1, %v1, %v2 +; CHECK-NEXT: wfsxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 
112(%r3), 3 +; CHECK-NEXT: wfsxb %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load fp128, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1 + %1 = load fp128, ptr %arrayidx1, align 8 + %sub = fsub reassoc nsz arcp contract afn fp128 %0, %1 + %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2 + %2 = load fp128, ptr %arrayidx2, align 8 + %sub3 = fsub reassoc nsz arcp contract afn fp128 %sub, %2 + %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3 + %3 = load fp128, ptr %arrayidx4, align 8 + %sub5 = fsub reassoc nsz arcp contract afn fp128 %sub3, %3 + %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4 + %4 = load fp128, ptr %arrayidx6, align 8 + %sub7 = fsub reassoc nsz arcp contract afn fp128 %sub5, %4 + %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5 + %5 = load fp128, ptr %arrayidx8, align 8 + %sub9 = fsub reassoc nsz arcp contract afn fp128 %sub7, %5 + %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6 + %6 = load fp128, ptr %arrayidx10, align 8 + %sub11 = fsub reassoc nsz arcp contract afn fp128 %sub9, %6 + %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7 + %7 = load fp128, ptr %arrayidx12, align 8 + %sub13 = fsub reassoc nsz arcp contract afn fp128 %sub11, %7 + ret fp128 %sub13 +} + +define <2 x double> @fun8_fsub(ptr %x) { +; CHECK-LABEL: fun8_fsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: vfsdb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 32(%r2), 3 +; CHECK-NEXT: vl %v2, 48(%r2), 3 +; CHECK-NEXT: vfadb %v1, %v1, %v2 +; CHECK-NEXT: vfsdb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r2), 3 +; CHECK-NEXT: vl %v2, 80(%r2), 3 +; CHECK-NEXT: vfadb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r2), 3 +; CHECK-NEXT: vfadb %v1, %v1, %v2 +; CHECK-NEXT: vfsdb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r2), 3 +; CHECK-NEXT: vfsdb %v24, %v0, %v1 +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x double>, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1 + %1 = load <2 x double>, ptr %arrayidx1, align 8 + %sub = fsub reassoc nsz arcp contract afn <2 x double> %0, %1 + %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2 + %2 = load <2 x double>, ptr %arrayidx2, align 8 + %sub3 = fsub reassoc nsz arcp contract afn <2 x double> %sub, %2 + %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3 + %3 = load <2 x double>, ptr %arrayidx4, align 8 + %sub5 = fsub reassoc nsz arcp contract afn <2 x double> %sub3, %3 + %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4 + %4 = load <2 x double>, ptr %arrayidx6, align 8 + %sub7 = fsub reassoc nsz arcp contract afn <2 x double> %sub5, %4 + %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5 + %5 = load <2 x double>, ptr %arrayidx8, align 8 + %sub9 = fsub reassoc nsz arcp contract afn <2 x double> %sub7, %5 + %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6 + %6 = load <2 x double>, ptr %arrayidx10, align 8 + %sub11 = fsub reassoc nsz arcp contract afn <2 x double> %sub9, %6 + %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7 + %7 = load <2 x double>, ptr %arrayidx12, align 8 + %sub13 = fsub reassoc nsz arcp contract afn <2 x double> %sub11, %7 + ret <2 x double> %sub13 +} + +define <4 x float> @fun9_fsub(ptr %x) { +; CHECK-LABEL: fun9_fsub: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: vfssb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 32(%r2), 3 
+; CHECK-NEXT: vl %v2, 48(%r2), 3 +; CHECK-NEXT: vfasb %v1, %v1, %v2 +; CHECK-NEXT: vfssb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r2), 3 +; CHECK-NEXT: vl %v2, 80(%r2), 3 +; CHECK-NEXT: vfasb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r2), 3 +; CHECK-NEXT: vfasb %v1, %v1, %v2 +; CHECK-NEXT: vfssb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r2), 3 +; CHECK-NEXT: vfssb %v24, %v0, %v1 +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x float>, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1 + %1 = load <4 x float>, ptr %arrayidx1, align 8 + %sub = fsub reassoc nsz arcp contract afn <4 x float> %0, %1 + %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2 + %2 = load <4 x float>, ptr %arrayidx2, align 8 + %sub3 = fsub reassoc nsz arcp contract afn <4 x float> %sub, %2 + %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3 + %3 = load <4 x float>, ptr %arrayidx4, align 8 + %sub5 = fsub reassoc nsz arcp contract afn <4 x float> %sub3, %3 + %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4 + %4 = load <4 x float>, ptr %arrayidx6, align 8 + %sub7 = fsub reassoc nsz arcp contract afn <4 x float> %sub5, %4 + %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5 + %5 = load <4 x float>, ptr %arrayidx8, align 8 + %sub9 = fsub reassoc nsz arcp contract afn <4 x float> %sub7, %5 + %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6 + %6 = load <4 x float>, ptr %arrayidx10, align 8 + %sub11 = fsub reassoc nsz arcp contract afn <4 x float> %sub9, %6 + %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7 + %7 = load <4 x float>, ptr %arrayidx12, align 8 + %sub13 = fsub reassoc nsz arcp contract afn <4 x float> %sub11, %7 + ret <4 x float> %sub13 +} + +define double @fun10_fmul(ptr %x) { +; CHECK-LABEL: fun10_fmul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld %f0, 8(%r2) +; CHECK-NEXT: mdb %f0, 0(%r2) +; CHECK-NEXT: ld %f1, 24(%r2) +; CHECK-NEXT: mdb %f1, 16(%r2) +; CHECK-NEXT: mdbr %f0, %f1 +; CHECK-NEXT: ld %f1, 40(%r2) +; CHECK-NEXT: mdb %f1, 32(%r2) +; CHECK-NEXT: mdb %f1, 48(%r2) +; CHECK-NEXT: mdbr %f0, %f1 +; CHECK-NEXT: mdb %f0, 56(%r2) +; CHECK-NEXT: br %r14 + +; PASSOUTPUT: name: fun10_fmul +; PASSOUTPUT-NOT: WFMDB +; PASSOUTPUT: WFMDB killed %3, killed %18, implicit $fpc +; PASSOUTPUT-NOT: WFMDB {{.*}}$cc +; PASSOUTPUT-NOT: WFMDB_CCPseudo +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %0, %1 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 4 + %4 = load double, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 5 + %5 = load double, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 6 + %6 = load double, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 7 + %7 = load double, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn 
double %mul11, %7 + ret double %mul13 +} + +define float @fun11_fmul(ptr %x) { +; CHECK-LABEL: fun11_fmul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lde %f0, 4(%r2) +; CHECK-NEXT: meeb %f0, 0(%r2) +; CHECK-NEXT: lde %f1, 12(%r2) +; CHECK-NEXT: meeb %f1, 8(%r2) +; CHECK-NEXT: meebr %f0, %f1 +; CHECK-NEXT: lde %f1, 20(%r2) +; CHECK-NEXT: meeb %f1, 16(%r2) +; CHECK-NEXT: meeb %f1, 24(%r2) +; CHECK-NEXT: meebr %f0, %f1 +; CHECK-NEXT: meeb %f0, 28(%r2) +; CHECK-NEXT: br %r14 + +; PASSOUTPUT: name: fun11_fmul +; PASSOUTPUT-NOT: WFMSB +; PASSOUTPUT: WFMSB killed %3, killed %18, implicit $fpc +; PASSOUTPUT-NOT: WFMSB {{.*}}$cc +; PASSOUTPUT-NOT: WFMSB_CCPseudo +entry: + %0 = load float, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 1 + %1 = load float, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn float %0, %1 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 2 + %2 = load float, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 3 + %3 = load float, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 4 + %4 = load float, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 5 + %5 = load float, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 6 + %6 = load float, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 7 + %7 = load float, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7 + ret float %mul13 +} + +define fp128 @fun12_fmul(ptr %x) { +; CHECK-LABEL: fun12_fmul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v1, 16(%r3), 3 +; CHECK-NEXT: wfmxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 32(%r3), 3 +; CHECK-NEXT: vl %v2, 48(%r3), 3 +; CHECK-NEXT: wfmxb %v1, %v1, %v2 +; CHECK-NEXT: wfmxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r3), 3 +; CHECK-NEXT: vl %v2, 80(%r3), 3 +; CHECK-NEXT: wfmxb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r3), 3 +; CHECK-NEXT: wfmxb %v1, %v1, %v2 +; CHECK-NEXT: wfmxb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r3), 3 +; CHECK-NEXT: wfmxb %v0, %v0, %v1 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load fp128, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds fp128, ptr %x, i64 1 + %1 = load fp128, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn fp128 %0, %1 + %arrayidx2 = getelementptr inbounds fp128, ptr %x, i64 2 + %2 = load fp128, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn fp128 %mul, %2 + %arrayidx4 = getelementptr inbounds fp128, ptr %x, i64 3 + %3 = load fp128, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn fp128 %mul3, %3 + %arrayidx6 = getelementptr inbounds fp128, ptr %x, i64 4 + %4 = load fp128, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn fp128 %mul5, %4 + %arrayidx8 = getelementptr inbounds fp128, ptr %x, i64 5 + %5 = load fp128, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn fp128 %mul7, %5 + %arrayidx10 = getelementptr inbounds fp128, ptr %x, i64 6 + %6 = load fp128, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn fp128 %mul9, 
%6 + %arrayidx12 = getelementptr inbounds fp128, ptr %x, i64 7 + %7 = load fp128, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn fp128 %mul11, %7 + ret fp128 %mul13 +} + +define <2 x double> @fun13_fmul(ptr %x) { +; CHECK-LABEL: fun13_fmul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: vfmdb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 32(%r2), 3 +; CHECK-NEXT: vl %v2, 48(%r2), 3 +; CHECK-NEXT: vfmdb %v1, %v1, %v2 +; CHECK-NEXT: vfmdb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r2), 3 +; CHECK-NEXT: vl %v2, 80(%r2), 3 +; CHECK-NEXT: vfmdb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r2), 3 +; CHECK-NEXT: vfmdb %v1, %v1, %v2 +; CHECK-NEXT: vfmdb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r2), 3 +; CHECK-NEXT: vfmdb %v24, %v0, %v1 +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x double>, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds <2 x double>, ptr %x, i64 1 + %1 = load <2 x double>, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn <2 x double> %0, %1 + %arrayidx2 = getelementptr inbounds <2 x double>, ptr %x, i64 2 + %2 = load <2 x double>, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn <2 x double> %mul, %2 + %arrayidx4 = getelementptr inbounds <2 x double>, ptr %x, i64 3 + %3 = load <2 x double>, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn <2 x double> %mul3, %3 + %arrayidx6 = getelementptr inbounds <2 x double>, ptr %x, i64 4 + %4 = load <2 x double>, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn <2 x double> %mul5, %4 + %arrayidx8 = getelementptr inbounds <2 x double>, ptr %x, i64 5 + %5 = load <2 x double>, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn <2 x double> %mul7, %5 + %arrayidx10 = getelementptr inbounds <2 x double>, ptr %x, i64 6 + %6 = load <2 x double>, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn <2 x double> %mul9, %6 + %arrayidx12 = getelementptr inbounds <2 x double>, ptr %x, i64 7 + %7 = load <2 x double>, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn <2 x double> %mul11, %7 + ret <2 x double> %mul13 +} + +define <4 x float> @fun14_fmul(ptr %x) { +; CHECK-LABEL: fun14_fmul: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: vfmsb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 32(%r2), 3 +; CHECK-NEXT: vl %v2, 48(%r2), 3 +; CHECK-NEXT: vfmsb %v1, %v1, %v2 +; CHECK-NEXT: vfmsb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 64(%r2), 3 +; CHECK-NEXT: vl %v2, 80(%r2), 3 +; CHECK-NEXT: vfmsb %v1, %v1, %v2 +; CHECK-NEXT: vl %v2, 96(%r2), 3 +; CHECK-NEXT: vfmsb %v1, %v1, %v2 +; CHECK-NEXT: vfmsb %v0, %v0, %v1 +; CHECK-NEXT: vl %v1, 112(%r2), 3 +; CHECK-NEXT: vfmsb %v24, %v0, %v1 +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x float>, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds <4 x float>, ptr %x, i64 1 + %1 = load <4 x float>, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn <4 x float> %0, %1 + %arrayidx2 = getelementptr inbounds <4 x float>, ptr %x, i64 2 + %2 = load <4 x float>, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn <4 x float> %mul, %2 + %arrayidx4 = getelementptr inbounds <4 x float>, ptr %x, i64 3 + %3 = load <4 x float>, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn <4 x float> %mul3, %3 + %arrayidx6 = getelementptr inbounds <4 x float>, ptr %x, i64 4 + %4 = load <4 x float>, ptr %arrayidx6, align 8 + %mul7 = 
fmul reassoc nsz arcp contract afn <4 x float> %mul5, %4 + %arrayidx8 = getelementptr inbounds <4 x float>, ptr %x, i64 5 + %5 = load <4 x float>, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn <4 x float> %mul7, %5 + %arrayidx10 = getelementptr inbounds <4 x float>, ptr %x, i64 6 + %6 = load <4 x float>, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn <4 x float> %mul9, %6 + %arrayidx12 = getelementptr inbounds <4 x float>, ptr %x, i64 7 + %7 = load <4 x float>, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn <4 x float> %mul11, %7 + ret <4 x float> %mul13 +}
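
(Editorial note on the test: the CHECK bodies follow the update_llc_test_checks.py style and could presumably be regenerated with llvm/utils/update_llc_test_checks.py if scheduling changes, though the hand-written PASSOUTPUT lines would need to be preserved. The test can be run standalone with: llvm-lit -v llvm/test/CodeGen/SystemZ/machine-combiner-reassoc-fp.ll)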