Index: include/llvm/CodeGen/MachineCombinerPattern.h =================================================================== --- include/llvm/CodeGen/MachineCombinerPattern.h +++ include/llvm/CodeGen/MachineCombinerPattern.h @@ -38,7 +38,40 @@ MULSUBX_OP1, MULSUBX_OP2, MULADDXI_OP1, - MULSUBXI_OP1 + MULSUBXI_OP1, + // Floating Point + FMULADDS_OP1, + FMULADDS_OP2, + FMULSUBS_OP1, + FMULSUBS_OP2, + FMULADDD_OP1, + FMULADDD_OP2, + FMULSUBD_OP1, + FMULSUBD_OP2, + FMLAv1i32_indexed_OP1, + FMLAv1i32_indexed_OP2, + FMLAv1i64_indexed_OP1, + FMLAv1i64_indexed_OP2, + FMLAv2f32_OP2, + FMLAv2f32_OP1, + FMLAv2f64_OP1, + FMLAv2f64_OP2, + FMLAv2i32_indexed_OP1, + FMLAv2i32_indexed_OP2, + FMLAv2i64_indexed_OP1, + FMLAv2i64_indexed_OP2, + FMLAv4f32_OP1, + FMLAv4f32_OP2, + FMLAv4i32_indexed_OP1, + FMLAv4i32_indexed_OP2, + FMLSv1i32_indexed_OP2, + FMLSv1i64_indexed_OP2, + FMLSv2i32_indexed_OP2, + FMLSv2i64_indexed_OP2, + FMLSv2f32_OP2, + FMLSv2f64_OP2, + FMLSv4i32_indexed_OP2, + FMLSv4f32_OP2 }; } // end namespace llvm Index: include/llvm/CodeGen/SelectionDAGTargetInfo.h =================================================================== --- include/llvm/CodeGen/SelectionDAGTargetInfo.h +++ include/llvm/CodeGen/SelectionDAGTargetInfo.h @@ -17,6 +17,7 @@ #define LLVM_CODEGEN_SELECTIONDAGTARGETINFO_H #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/Support/CodeGen.h" namespace llvm { @@ -138,6 +139,11 @@ MachinePointerInfo SrcPtrInfo) const { return std::make_pair(SDValue(), SDValue()); } + // Return true when the decision to generate FMA's (or FMS, FMLA etc) rather + // than FMUL and ADD is delegated to the machine combiner. + virtual bool GenerateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const { + return false; + } }; } // end llvm namespace Index: include/llvm/Target/TargetInstrInfo.h =================================================================== --- include/llvm/Target/TargetInstrInfo.h +++ include/llvm/Target/TargetInstrInfo.h @@ -818,6 +818,11 @@ MachineInstr &Root, SmallVectorImpl &Patterns) const; + /// Return true when a code sequence can improve throughput. It + /// should be called only for instructions in loops. + /// \param Pattern - combiner pattern + virtual bool isThroughputPattern(MachineCombinerPattern Pattern) const; + /// Return true if the input \P Inst is part of a chain of dependent ops /// that are suitable for reassociation, otherwise return false. /// If the instruction's operands must be commuted to have a previous Index: lib/CodeGen/MachineCombiner.cpp =================================================================== --- lib/CodeGen/MachineCombiner.cpp +++ lib/CodeGen/MachineCombiner.cpp @@ -41,6 +41,7 @@ const TargetRegisterInfo *TRI; MCSchedModel SchedModel; MachineRegisterInfo *MRI; + MachineLoopInfo *MLI; // Current MachineLoopInfo MachineTraceMetrics *Traces; MachineTraceMetrics::Ensemble *MinInstr; @@ -87,6 +88,7 @@ INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner", "Machine InstCombiner", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner", false, false) @@ -94,6 +96,7 @@ void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addPreserved(); + AU.addRequired(); AU.addPreserved(); AU.addRequired(); AU.addPreserved(); @@ -355,6 +358,8 @@ DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n"); auto BlockIter = MBB->begin(); + // Check if the block is in a loop. 
+ const MachineLoop *ML = MLI->getLoopFor(MBB); while (BlockIter != MBB->end()) { auto &MI = *BlockIter++; @@ -407,11 +412,15 @@ if (!NewInstCount) continue; + bool substituteAlways = false; + if (ML && TII->isThroughputPattern(P)) { + substituteAlways = true; + } // Substitute when we optimize for codesize and the new sequence has // fewer instructions OR // the new sequence neither lengthens the critical path nor increases // resource pressure. - if (doSubstitute(NewInstCount, OldInstCount) || + if (substituteAlways || doSubstitute(NewInstCount, OldInstCount) || (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, InstrIdxForVirtReg, P) && preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { @@ -448,6 +457,7 @@ SchedModel = STI.getSchedModel(); TSchedModel.init(SchedModel, &STI, TII); MRI = &MF.getRegInfo(); + MLI = &getAnalysis(); Traces = &getAnalysis(); MinInstr = nullptr; OptSize = MF.getFunction()->optForSize(); Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" @@ -85,6 +86,7 @@ class DAGCombiner { SelectionDAG &DAG; + const SelectionDAGTargetInfo &STI; const TargetLowering &TLI; CombineLevel Level; CodeGenOpt::Level OptLevel; @@ -469,8 +471,9 @@ public: DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL) - : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), - OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) { + : DAG(D), STI(D.getSelectionDAGInfo()), TLI(D.getTargetLoweringInfo()), + Level(BeforeLegalizeTypes), OptLevel(OL), LegalOperations(false), + LegalTypes(false), AA(A) { ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize(); } @@ -7671,6 +7674,9 @@ if (!HasFMAD && !HasFMA) return SDValue(); + if (AllowFusion && STI.GenerateFMAsInMachineCombiner(OptLevel)) + return SDValue(); + // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); @@ -7854,6 +7860,9 @@ if (!HasFMAD && !HasFMA) return SDValue(); + if (AllowFusion && STI.GenerateFMAsInMachineCombiner(OptLevel)) + return SDValue(); + // Always prefer FMAD to FMA for precision. unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA; bool Aggressive = TLI.enableAggressiveFMAFusion(VT); Index: lib/CodeGen/TargetInstrInfo.cpp =================================================================== --- lib/CodeGen/TargetInstrInfo.cpp +++ lib/CodeGen/TargetInstrInfo.cpp @@ -655,7 +655,11 @@ return false; } - +/// Return true when a code sequence can improve loop throughput. +bool +TargetInstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { + return false; +} /// Attempt the reassociation transformation to reduce critical path length. /// See the above comments before getMachineCombinerPatterns(). 
void TargetInstrInfo::reassociateOps( Index: lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.h +++ lib/Target/AArch64/AArch64InstrInfo.h @@ -172,6 +172,11 @@ unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const override; bool optimizeCondBranch(MachineInstr *MI) const override; + + /// Return true when a code sequence can improve throughput. It + /// should be called only for instructions in loops. + /// \param Pattern - combiner pattern + bool isThroughputPattern(MachineCombinerPattern Pattern) const override; /// Return true when there is potentially a faster code sequence /// for an instruction chain ending in . All potential patterns are /// listed in the array. Index: lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.cpp +++ lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2598,37 +2598,75 @@ return false; } // +// FP Opcodes that can be combined with a FMUL +static bool isCombineInstrCandidateFP(const MachineInstr &Inst) { + switch (Inst.getOpcode()) { + case AArch64::FADDSrr: + case AArch64::FADDDrr: + case AArch64::FADDv2f32: + case AArch64::FADDv2f64: + case AArch64::FADDv4f32: + case AArch64::FSUBSrr: + case AArch64::FSUBDrr: + case AArch64::FSUBv2f32: + case AArch64::FSUBv2f64: + case AArch64::FSUBv4f32: + return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath; + default: + break; + } + return false; +} +// // Opcodes that can be combined with a MUL static bool isCombineInstrCandidate(unsigned Opc) { return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); } -static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, - unsigned MulOpc, unsigned ZeroReg) { +// +// Utility routine that checks if \param MO is defined by an +// \param CombineOpc instruction in the basic block \param MBB +static bool canCombine(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned CombineOpc, unsigned ZeroReg = 0, + bool CheckZeroReg = false) { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineInstr *MI = nullptr; - // We need a virtual register definition. + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) MI = MRI.getUniqueVRegDef(MO.getReg()); // And it needs to be in the trace (otherwise, it won't have a depth). - if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc) - return false; - - assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && - MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && - MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); - - // The third input reg must be zero. - if (MI->getOperand(3).getReg() != ZeroReg) + if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != CombineOpc) return false; - // Must only used by the user we combine with. if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) return false; + if (CheckZeroReg) { + assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && + MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); + // The third input reg must be zero. + if (MI->getOperand(3).getReg() != ZeroReg) + return false; + } + return true; } +// +// Is \param MO defined by an integer multiply and can be combined? 
+static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc, unsigned ZeroReg) { + return canCombine(MBB, MO, MulOpc, ZeroReg, true); +} + +// +// Is \param MO defined by a floating-point multiply and can be combined? +static bool canCombineWithFMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc) { + return canCombine(MBB, MO, MulOpc); +} + // TODO: There are many more machine instruction opcodes to match: // 1. Other data types (integer, vectors) // 2. Other math / logic operations (xor, or) @@ -2762,7 +2800,230 @@ } return Found; } +/// Floating-Point Support + +/// Find instructions that can be turned into madd. +static bool getFMAPatterns(MachineInstr &Root, + SmallVectorImpl &Patterns) { + + if (!isCombineInstrCandidateFP(Root)) + return 0; + + MachineBasicBlock &MBB = *Root.getParent(); + bool Found = false; + switch (Root.getOpcode()) { + default: + assert(false && "Unsupported FP instruction in combiner\n"); + break; + case AArch64::FADDSrr: + assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && + "FADDWrr does not have register operands"); + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDS_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDS_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i32_indexed_OP2); + Found = true; + } + break; + case AArch64::FADDDrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDD_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULADDD_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv1i64_indexed_OP2); + Found = true; + } + break; + case AArch64::FADDv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f32_OP2); + Found = true; + } + break; + case AArch64::FADDv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP1); + Found = true; + } + if 
(canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2i64_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLAv2f64_OP2); + Found = true; + } + break; + case AArch64::FADDv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP1); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(1), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLAv4f32_OP2); + Found = true; + } + break; + + case AArch64::FSUBSrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBS_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); + Found = true; + } + break; + case AArch64::FSUBDrr: + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP1); + Found = true; + } + if (canCombineWithFMUL(MBB, Root.getOperand(2), AArch64::FMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FMULSUBD_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv1i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); + Found = true; + } + break; + case AArch64::FSUBv2f32: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f32_OP2); + Found = true; + } + break; + case AArch64::FSUBv2f64: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2i64_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2i64_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv2f64)) { + Patterns.push_back(MachineCombinerPattern::FMLSv2f64_OP2); + Found = true; + } + break; + case AArch64::FSUBv4f32: + if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4i32_indexed)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4i32_indexed_OP2); + Found = true; + } else if (canCombineWithFMUL(MBB, Root.getOperand(2), + AArch64::FMULv4f32)) { + Patterns.push_back(MachineCombinerPattern::FMLSv4f32_OP2); + Found = true; + } + break; + } + return Found; +} + +/// Return true when a code sequence can improve throughput. It +/// should be called only for instructions in loops. 
+/// \param Pattern - combiner pattern
+bool
+AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
+  switch (Pattern) {
+  default:
+    break;
+  case MachineCombinerPattern::FMULADDS_OP1:
+  case MachineCombinerPattern::FMULADDS_OP2:
+  case MachineCombinerPattern::FMULSUBS_OP1:
+  case MachineCombinerPattern::FMULSUBS_OP2:
+  case MachineCombinerPattern::FMULADDD_OP1:
+  case MachineCombinerPattern::FMULADDD_OP2:
+  case MachineCombinerPattern::FMULSUBD_OP1:
+  case MachineCombinerPattern::FMULSUBD_OP2:
+  case MachineCombinerPattern::FMLAv1i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv1i32_indexed_OP2:
+  case MachineCombinerPattern::FMLAv1i64_indexed_OP1:
+  case MachineCombinerPattern::FMLAv1i64_indexed_OP2:
+  case MachineCombinerPattern::FMLAv2f32_OP2:
+  case MachineCombinerPattern::FMLAv2f32_OP1:
+  case MachineCombinerPattern::FMLAv2f64_OP1:
+  case MachineCombinerPattern::FMLAv2f64_OP2:
+  case MachineCombinerPattern::FMLAv2i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv2i32_indexed_OP2:
+  case MachineCombinerPattern::FMLAv2i64_indexed_OP1:
+  case MachineCombinerPattern::FMLAv2i64_indexed_OP2:
+  case MachineCombinerPattern::FMLAv4f32_OP1:
+  case MachineCombinerPattern::FMLAv4f32_OP2:
+  case MachineCombinerPattern::FMLAv4i32_indexed_OP1:
+  case MachineCombinerPattern::FMLAv4i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv1i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv1i64_indexed_OP2:
+  case MachineCombinerPattern::FMLSv2i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv2i64_indexed_OP2:
+  case MachineCombinerPattern::FMLSv2f32_OP2:
+  case MachineCombinerPattern::FMLSv2f64_OP2:
+  case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
+  case MachineCombinerPattern::FMLSv4f32_OP2:
+    return true;
+  } // end switch (Pattern)
+  return false;
+}
 /// Return true when there is potentially a faster code sequence for an
 /// instruction chain ending in \p Root. All potential patterns are listed in
 /// the \p Pattern vector. Pattern should be sorted in priority order since the
@@ -2771,28 +3032,35 @@
 bool AArch64InstrInfo::getMachineCombinerPatterns(
     MachineInstr &Root,
     SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+  // Integer patterns
   if (getMaddPatterns(Root, Patterns))
     return true;
+  // Floating point patterns
+  if (getFMAPatterns(Root, Patterns))
+    return true;
 
   return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
 }
 
-/// genMadd - Generate madd instruction and combine mul and add.
-/// Example:
-///  MUL I=A,B,0
-///  ADD R,I,C
-///  ==> MADD R,A,B,C
-/// \param Root is the ADD instruction
+enum class FMAInstKind { Default, Indexed, Accumulator };
+/// genFusedMultiply - Generate fused multiply instructions.
+/// This function supports both integer and floating point instructions.
+/// A typical example:
+///  F|MUL I=A,B,0
+///  F|ADD R,I,C
+///  ==> F|MADD R,A,B,C
+/// \param Root is the F|ADD instruction
 /// \param [out] InsInstrs is a vector of machine instructions and will
 /// contain the generated madd instruction
 /// \param IdxMulOpd is index of operand in Root that is the result of
-/// the MUL. In the example above IdxMulOpd is 1.
-/// \param MaddOpc the opcode fo the madd instruction
-static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI,
-                             const TargetInstrInfo *TII, MachineInstr &Root,
-                             SmallVectorImpl<MachineInstr *> &InsInstrs,
-                             unsigned IdxMulOpd, unsigned MaddOpc,
-                             const TargetRegisterClass *RC) {
+/// the F|MUL. In the example above IdxMulOpd is 1.
+/// \param MaddOpc the opcode fo the f|madd instruction +static MachineInstr * +genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl &InsInstrs, unsigned IdxMulOpd, + unsigned MaddOpc, const TargetRegisterClass *RC, + FMAInstKind kind = FMAInstKind::Default) { assert(IdxMulOpd == 1 || IdxMulOpd == 2); unsigned IdxOtherOpd = IdxMulOpd == 1 ? 2 : 1; @@ -2814,12 +3082,26 @@ if (TargetRegisterInfo::isVirtualRegister(SrcReg2)) MRI.constrainRegClass(SrcReg2, RC); - MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), - ResultReg) - .addReg(SrcReg0, getKillRegState(Src0IsKill)) - .addReg(SrcReg1, getKillRegState(Src1IsKill)) - .addReg(SrcReg2, getKillRegState(Src2IsKill)); - // Insert the MADD + MachineInstrBuilder MIB; + if (kind == FMAInstKind::Default) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addReg(SrcReg2, getKillRegState(Src2IsKill)); + else if (kind == FMAInstKind::Indexed) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg2, getKillRegState(Src2IsKill)) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)) + .addImm(MUL->getOperand(3).getImm()); + else if (kind == FMAInstKind::Accumulator) + MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc), ResultReg) + .addReg(SrcReg2, getKillRegState(Src2IsKill)) + .addReg(SrcReg0, getKillRegState(Src0IsKill)) + .addReg(SrcReg1, getKillRegState(Src1IsKill)); + else + assert(false && "Invalid FMA instruction kind \n"); + // Insert the MADD (MADD, FMA, FMS, FMLA, FMSL) InsInstrs.push_back(MIB); return MUL; } @@ -2907,7 +3189,7 @@ Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; case MachineCombinerPattern::MULADDW_OP2: case MachineCombinerPattern::MULADDX_OP2: @@ -2922,7 +3204,7 @@ Opc = AArch64::MADDXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULADDWI_OP1: case MachineCombinerPattern::MULADDXI_OP1: { @@ -3014,7 +3296,7 @@ Opc = AArch64::MSUBXrrr; RC = &AArch64::GPR64RegClass; } - MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; case MachineCombinerPattern::MULSUBWI_OP1: case MachineCombinerPattern::MULSUBXI_OP1: { @@ -3059,6 +3341,234 @@ } break; } + // Floating Point Support + case MachineCombinerPattern::FMULADDS_OP1: + case MachineCombinerPattern::FMULADDD_OP1: + // MUL I=A,B,0 + // ADD R,I,C + // ==> MADD R,A,B,C + // --- Create(MADD); + if (Pattern == MachineCombinerPattern::FMULADDS_OP1) { + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::FMULADDS_OP2: + case MachineCombinerPattern::FMULADDD_OP2: + // FMUL I=A,B,0 + // FADD R,C,I + // ==> FMADD R,A,B,C + // --- Create(FMADD); + if (Pattern == MachineCombinerPattern::FMULADDS_OP2) { + Opc = AArch64::FMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + 
MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::FMLAv1i32_indexed_OP1: + Opc = AArch64::FMLAv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv1i32_indexed_OP2: + Opc = AArch64::FMLAv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLAv1i64_indexed_OP1: + Opc = AArch64::FMLAv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + break; + case MachineCombinerPattern::FMLAv1i64_indexed_OP2: + Opc = AArch64::FMLAv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLAv2i32_indexed_OP1: + case MachineCombinerPattern::FMLAv2f32_OP1: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP1) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + case MachineCombinerPattern::FMLAv2i32_indexed_OP2: + case MachineCombinerPattern::FMLAv2f32_OP2: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i32_indexed_OP2) { + Opc = AArch64::FMLAv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv2i64_indexed_OP1: + case MachineCombinerPattern::FMLAv2f64_OP1: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP1) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + case MachineCombinerPattern::FMLAv2i64_indexed_OP2: + case MachineCombinerPattern::FMLAv2f64_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv2i64_indexed_OP2) { + Opc = AArch64::FMLAv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv4i32_indexed_OP1: + case MachineCombinerPattern::FMLAv4f32_OP1: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLAv4i32_indexed_OP1) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLAv4i32_indexed_OP2: + case MachineCombinerPattern::FMLAv4f32_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == 
MachineCombinerPattern::FMLAv4i32_indexed_OP2) { + Opc = AArch64::FMLAv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLAv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMULSUBS_OP1: + case MachineCombinerPattern::FMULSUBD_OP1: { + // FMUL I=A,B,0 + // FSUB R,I,C + // ==> FNMSUB R,A,B,C // = -C + A*B + // --- Create(FNMSUB); + if (Pattern == MachineCombinerPattern::FMULSUBS_OP1) { + Opc = AArch64::FNMSUBSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FNMSUBDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + } + case MachineCombinerPattern::FMULSUBS_OP2: + case MachineCombinerPattern::FMULSUBD_OP2: { + // FMUL I=A,B,0 + // FSUB R,C,I + // ==> FMSUB R,A,B,C (computes C - A*B) + // --- Create(FMSUB); + if (Pattern == MachineCombinerPattern::FMULSUBS_OP2) { + Opc = AArch64::FMSUBSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FMSUBDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::FMLSv1i32_indexed_OP2: + Opc = AArch64::FMLSv1i32_indexed; + RC = &AArch64::FPR32RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLSv1i64_indexed_OP2: + Opc = AArch64::FMLSv1i64_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + break; + + case MachineCombinerPattern::FMLSv2f32_OP2: + case MachineCombinerPattern::FMLSv2i32_indexed_OP2: + RC = &AArch64::FPR64RegClass; + if (Pattern == MachineCombinerPattern::FMLSv2i32_indexed_OP2) { + Opc = AArch64::FMLSv2i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv2f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLSv2f64_OP2: + case MachineCombinerPattern::FMLSv2i64_indexed_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLSv2i64_indexed_OP2) { + Opc = AArch64::FMLSv2i64_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv2f64; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + + case MachineCombinerPattern::FMLSv4f32_OP2: + case MachineCombinerPattern::FMLSv4i32_indexed_OP2: + RC = &AArch64::FPR128RegClass; + if (Pattern == MachineCombinerPattern::FMLSv4i32_indexed_OP2) { + Opc = AArch64::FMLSv4i32_indexed; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Indexed); + } else { + Opc = AArch64::FMLSv4f32; + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC, + FMAInstKind::Accumulator); + } + break; + } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); Index: lib/Target/AArch64/AArch64SelectionDAGInfo.h =================================================================== --- lib/Target/AArch64/AArch64SelectionDAGInfo.h +++ lib/Target/AArch64/AArch64SelectionDAGInfo.h @@ -25,6 +25,7 @@ SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, 
MachinePointerInfo DstPtrInfo) const override; + bool GenerateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override; }; } Index: lib/Target/AArch64/AArch64SelectionDAGInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -51,3 +51,9 @@ } return SDValue(); } +bool AArch64SelectionDAGInfo::GenerateFMAsInMachineCombiner( + CodeGenOpt::Level OptLevel) const { + if (OptLevel >= CodeGenOpt::Aggressive) + return true; + return false; +} Index: test/CodeGen/AArch64/arm64-fma-combines.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/arm64-fma-combines.ll @@ -0,0 +1,136 @@ +; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s +define void @foo_2d(double* %src) { +; CHECK-LABEL: %entry +; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} +; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} +entry: + %arrayidx1 = getelementptr inbounds double, double* %src, i64 5 + %arrayidx2 = getelementptr inbounds double, double* %src, i64 11 + %0 = bitcast double* %arrayidx1 to <2 x double>* + %1 = load double, double* %arrayidx2, align 8 + %2 = load double, double* %arrayidx1, align 8 + %fmul = fmul fast double %1, %1 + %fmul2 = fmul fast double %2, 0x3F94AFD6A052BF5B + %fadd = fadd fast double %fmul, %fmul2 + br label %for.body + +; CHECK-LABEL: %for.body +; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] +; CHECK: fmla.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0] +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next + %3 = load double, double* %arrayidx3, align 8 + %add = fadd fast double %3, %3 + %mul = fmul fast double %add, %fadd + %e1 = insertelement <2 x double> undef, double %add, i32 0 + %e2 = insertelement <2 x double> %e1, double %add, i32 1 + %add2 = fadd fast <2 x double> %e2, + %e3 = insertelement <2 x double> undef, double %mul, i32 0 + %e4 = insertelement <2 x double> %e3, double %mul, i32 1 + %mul2 = fmul fast <2 x double> %add2, + %e5 = insertelement <2 x double> undef, double %add, i32 0 + %e6 = insertelement <2 x double> %e5, double %add, i32 1 + %add3 = fadd fast <2 x double> %mul2, + %mulx = fmul fast <2 x double> %add2, %e2 + %addx = fadd fast <2 x double> %mulx, %e4 + %e7 = insertelement <2 x double> undef, double %mul, i32 0 + %e8 = insertelement <2 x double> %e7, double %mul, i32 1 + %e9 = fmul fast <2 x double> %addx, %add3 + store <2 x double> %e9, <2 x double>* %0, align 8 + %e10 = extractelement <2 x double> %add3, i32 0 + %mul3 = fmul fast double %mul, %e10 + %add4 = fadd fast double %mul3, %mul + store double %add4, double* %arrayidx2, align 8 + %exitcond = icmp eq i64 %indvars.iv.next, 25 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} +define void @foo_2s(float* %src) { +entry: + %arrayidx1 = getelementptr inbounds float, float* %src, i64 5 + %arrayidx2 = getelementptr inbounds float, float* %src, i64 11 + %0 = bitcast float* %arrayidx1 to <2 x float>* + br label %for.body + +; CHECK-LABEL: %for.body +; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] +; CHECK: fmla.s {{s[0-9]+}}, 
{{s[0-9]+}}, {{v[0-9]+}}[0] +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next + %1 = load float, float* %arrayidx3, align 8 + %add = fadd fast float %1, %1 + %mul = fmul fast float %add, %add + %e1 = insertelement <2 x float> undef, float %add, i32 0 + %e2 = insertelement <2 x float> %e1, float %add, i32 1 + %add2 = fadd fast <2 x float> %e2, + %e3 = insertelement <2 x float> undef, float %mul, i32 0 + %e4 = insertelement <2 x float> %e3, float %mul, i32 1 + %mul2 = fmul fast <2 x float> %add2, + %e5 = insertelement <2 x float> undef, float %add, i32 0 + %e6 = insertelement <2 x float> %e5, float %add, i32 1 + %add3 = fadd fast <2 x float> %mul2, + %mulx = fmul fast <2 x float> %add2, %e2 + %addx = fadd fast <2 x float> %mulx, %e4 + %e7 = insertelement <2 x float> undef, float %mul, i32 0 + %e8 = insertelement <2 x float> %e7, float %mul, i32 1 + %e9 = fmul fast <2 x float> %addx, %add3 + store <2 x float> %e9, <2 x float>* %0, align 8 + %e10 = extractelement <2 x float> %add3, i32 0 + %mul3 = fmul fast float %mul, %e10 + %add4 = fadd fast float %mul3, %mul + store float %add4, float* %arrayidx2, align 8 + %exitcond = icmp eq i64 %indvars.iv.next, 25 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} +define void @foo_4s(float* %src) { +entry: + %arrayidx1 = getelementptr inbounds float, float* %src, i64 5 + %arrayidx2 = getelementptr inbounds float, float* %src, i64 11 + %0 = bitcast float* %arrayidx1 to <4 x float>* + br label %for.body + +; CHECK-LABEL: %for.body +; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next + %1 = load float, float* %arrayidx3, align 8 + %add = fadd fast float %1, %1 + %mul = fmul fast float %add, %add + %e1 = insertelement <4 x float> undef, float %add, i32 0 + %e2 = insertelement <4 x float> %e1, float %add, i32 1 + %add2 = fadd fast <4 x float> %e2, + %e3 = insertelement <4 x float> undef, float %mul, i32 0 + %e4 = insertelement <4 x float> %e3, float %mul, i32 1 + %mul2 = fmul fast <4 x float> %add2, + %e5 = insertelement <4 x float> undef, float %add, i32 0 + %e6 = insertelement <4 x float> %e5, float %add, i32 1 + %add3 = fadd fast <4 x float> %mul2, + %mulx = fmul fast <4 x float> %add2, %e2 + %addx = fadd fast <4 x float> %mulx, %e4 + %e7 = insertelement <4 x float> undef, float %mul, i32 0 + %e8 = insertelement <4 x float> %e7, float %mul, i32 1 + %e9 = fmul fast <4 x float> %addx, %add3 + store <4 x float> %e9, <4 x float>* %0, align 8 + %e10 = extractelement <4 x float> %add3, i32 0 + %mul3 = fmul fast float %mul, %e10 + store float %mul3, float* %arrayidx2, align 8 + %exitcond = icmp eq i64 %indvars.iv.next, 25 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} Index: test/CodeGen/AArch64/arm64-fml-combines.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/arm64-fml-combines.ll @@ -0,0 +1,128 @@ +; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s +define void 
@foo_2d(double* %src) { +entry: + %arrayidx1 = getelementptr inbounds double, double* %src, i64 5 + %arrayidx2 = getelementptr inbounds double, double* %src, i64 11 + %0 = bitcast double* %arrayidx1 to <2 x double>* + br label %for.body + +; CHECK-LABEL: %for.body +; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; CHECK: fmls.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] +; CHECK: fmls.d {{d[0-9]+}}, {{d[0-9]+}}, {{v[0-9]+}}[0] +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %indvars.iv.next = sub nuw nsw i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds double, double* %src, i64 %indvars.iv.next + %1 = load double, double* %arrayidx3, align 8 + %add = fadd fast double %1, %1 + %mul = fmul fast double %add, %add + %e1 = insertelement <2 x double> undef, double %add, i32 0 + %e2 = insertelement <2 x double> %e1, double %add, i32 1 + %sub2 = fsub fast <2 x double> %e2, + %e3 = insertelement <2 x double> undef, double %mul, i32 0 + %e4 = insertelement <2 x double> %e3, double %mul, i32 1 + %mul2 = fmul fast <2 x double> %sub2, + %e5 = insertelement <2 x double> undef, double %add, i32 0 + %e6 = insertelement <2 x double> %e5, double %add, i32 1 + %sub3 = fsub fast <2 x double> , %mul2 + %mulx = fmul fast <2 x double> %sub2, %e2 + %subx = fsub fast <2 x double> %e4, %mulx + %e7 = insertelement <2 x double> undef, double %mul, i32 0 + %e8 = insertelement <2 x double> %e7, double %mul, i32 1 + %e9 = fmul fast <2 x double> %subx, %sub3 + store <2 x double> %e9, <2 x double>* %0, align 8 + %e10 = extractelement <2 x double> %sub3, i32 0 + %mul3 = fmul fast double %mul, %e10 + %sub4 = fsub fast double %mul, %mul3 + store double %sub4, double* %arrayidx2, align 8 + %exitcond = icmp eq i64 %indvars.iv.next, 25 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} +define void @foo_2s(float* %src) { +entry: + %arrayidx1 = getelementptr inbounds float, float* %src, i64 5 + %arrayidx2 = getelementptr inbounds float, float* %src, i64 11 + %0 = bitcast float* %arrayidx1 to <2 x float>* + br label %for.body + +; CHECK-LABEL: %for.body +; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; CHECK: fmls.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] +; CHECK: fmls.s {{s[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}[0] +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next + %1 = load float, float* %arrayidx3, align 8 + %add = fadd fast float %1, %1 + %mul = fmul fast float %add, %add + %e1 = insertelement <2 x float> undef, float %add, i32 0 + %e2 = insertelement <2 x float> %e1, float %add, i32 1 + %add2 = fsub fast <2 x float> %e2, + %e3 = insertelement <2 x float> undef, float %mul, i32 0 + %e4 = insertelement <2 x float> %e3, float %mul, i32 1 + %mul2 = fmul fast <2 x float> %add2, + %e5 = insertelement <2 x float> undef, float %add, i32 0 + %e6 = insertelement <2 x float> %e5, float %add, i32 1 + %add3 = fsub fast <2 x float> , %mul2 + %mulx = fmul fast <2 x float> %add2, %e2 + %addx = fsub fast <2 x float> %e4, %mulx + %e7 = insertelement <2 x float> undef, float %mul, i32 0 + %e8 = insertelement <2 x float> %e7, float %mul, i32 1 + %e9 = fmul fast <2 x float> %addx, %add3 + store <2 x float> %e9, <2 x float>* %0, align 8 + %e10 = extractelement <2 x float> %add3, i32 0 + %mul3 = fmul fast float %mul, %e10 + %add4 
= fsub fast float %mul, %mul3 + store float %add4, float* %arrayidx2, align 8 + %exitcond = icmp eq i64 %indvars.iv.next, 25 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} +define void @foo_4s(float* %src) { +entry: + %arrayidx1 = getelementptr inbounds float, float* %src, i64 5 + %arrayidx2 = getelementptr inbounds float, float* %src, i64 11 + %0 = bitcast float* %arrayidx1 to <4 x float>* + br label %for.body + +; CHECK-LABEL: %for.body +; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} +; CHECK: fmls.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0] +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds float, float* %src, i64 %indvars.iv.next + %1 = load float, float* %arrayidx3, align 8 + %add = fadd fast float %1, %1 + %mul = fmul fast float %add, %add + %e1 = insertelement <4 x float> undef, float %add, i32 0 + %e2 = insertelement <4 x float> %e1, float %add, i32 1 + %add2 = fadd fast <4 x float> %e2, + %e3 = insertelement <4 x float> undef, float %mul, i32 0 + %e4 = insertelement <4 x float> %e3, float %mul, i32 1 + %mul2 = fmul fast <4 x float> %add2, + %e5 = insertelement <4 x float> undef, float %add, i32 0 + %e6 = insertelement <4 x float> %e5, float %add, i32 1 + %add3 = fsub fast <4 x float> , %mul2 + %mulx = fmul fast <4 x float> %add2, %e2 + %addx = fsub fast <4 x float> %e4, %mulx + %e7 = insertelement <4 x float> undef, float %mul, i32 0 + %e8 = insertelement <4 x float> %e7, float %mul, i32 1 + %e9 = fmul fast <4 x float> %addx, %add3 + store <4 x float> %e9, <4 x float>* %0, align 8 + %e10 = extractelement <4 x float> %add3, i32 0 + %mul3 = fmul fast float %mul, %e10 + store float %mul3, float* %arrayidx2, align 8 + %exitcond = icmp eq i64 %indvars.iv.next, 25 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +}
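
For backend authors reading this patch: the two override points it introduces are SelectionDAGTargetInfo::GenerateFMAsInMachineCombiner, which tells DAGCombiner to leave fmul+fadd/fsub pairs alone so the machine combiner can decide whether to fuse them, and TargetInstrInfo::isThroughputPattern, which marks combiner patterns that improve throughput and may therefore be applied unconditionally inside loops. The sketch below is illustrative only: the MyTarget* class names are hypothetical placeholders, and the bodies simply mirror the AArch64 overrides added above.

// Hypothetical sketch (MyTarget* names are placeholders); it mirrors the
// AArch64 overrides from this patch rather than adding new behavior.
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetInstrInfo.h"

namespace llvm {

class MyTargetSelectionDAGInfo : public SelectionDAGTargetInfo {
public:
  // Defer FMA formation from DAGCombiner to the machine combiner at -O3 and
  // above, matching AArch64SelectionDAGInfo::GenerateFMAsInMachineCombiner.
  bool GenerateFMAsInMachineCombiner(CodeGenOpt::Level OptLevel) const override {
    return OptLevel >= CodeGenOpt::Aggressive;
  }
};

class MyTargetInstrInfo : public TargetInstrInfo {
public:
  // Report which combiner patterns are throughput wins; MachineCombiner
  // substitutes these unconditionally when the root block is inside a loop.
  bool isThroughputPattern(MachineCombinerPattern Pattern) const override {
    switch (Pattern) {
    case MachineCombinerPattern::FMULADDS_OP1:
    case MachineCombinerPattern::FMULADDS_OP2:
    case MachineCombinerPattern::FMULADDD_OP1:
    case MachineCombinerPattern::FMULADDD_OP2:
      return true;
    default:
      return false;
    }
  }
};

} // end namespace llvm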
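
As the new tests exercise, the FP combines are gated in three places: isCombineInstrCandidateFP only accepts FADD/FSUB roots when TargetOptions::UnsafeFPMath is set, AArch64SelectionDAGInfo::GenerateFMAsInMachineCombiner only returns true at CodeGenOpt::Aggressive (so DAGCombiner keeps forming FMAs at lower optimization levels), and within MachineCombiner the throughput patterns bypass the critical-path and resource-length checks only when MachineLoopInfo reports that the root's block is inside a loop.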