Index: include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- /dev/null
+++ include/llvm/CodeGen/MachineCombinerPattern.h
@@ -0,0 +1,29 @@
+//===-- llvm/CodeGen/MachineCombinerPattern.h - Instruction patterns supported
+// by the combiner ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the instruction patterns supported by the combiner.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MACHINECOMBINERPATTERN_H
+#define LLVM_CODEGEN_MACHINECOMBINERPATTERN_H
+
+namespace llvm {
+
+/// Enumeration of the instruction patterns supported by the machine combiner.
+///
+///
+namespace MachineCombinerPattern {
+// Forward declaration
+enum MC_PATTERN : int;
+} // end namespace MachineCombinerPattern
+} // end namespace llvm
+
+#endif
Index: include/llvm/CodeGen/MachineTraceMetrics.h
===================================================================
--- include/llvm/CodeGen/MachineTraceMetrics.h
+++ include/llvm/CodeGen/MachineTraceMetrics.h
@@ -264,8 +264,9 @@
     /// classes are included. For the caller to account for extra machine
     /// instructions, it must first resolve each instruction's scheduling class.
     unsigned getResourceLength(
-        ArrayRef<const MachineBasicBlock *> Extrablocks = None,
-        ArrayRef<const MCSchedClassDesc *> ExtraInstrs = None) const;
+        ArrayRef<const MachineBasicBlock *> Extrablocks = None,
+        ArrayRef<const MCSchedClassDesc *> ExtraInstrs = None,
+        ArrayRef<const MCSchedClassDesc *> RemoveInstrs = None) const;
 
     /// Return the length of the (data dependency) critical path through the
     /// trace.
@@ -286,6 +287,12 @@
     /// Return the Depth of a PHI instruction in a trace center block successor.
     /// The PHI does not have to be part of the trace.
     unsigned getPHIDepth(const MachineInstr *PHI) const;
+
+    /// A dependence is useful if the basic block of the defining instruction
+    /// is part of the trace of the user instruction. It is assumed that DefMI
+    /// dominates UseMI (see also isUsefulDominator).
+    bool isDepInTrace(const MachineInstr *DefMI,
+                      const MachineInstr *UseMI) const;
   };
 
   /// A trace ensemble is a collection of traces selected using the same
Index: include/llvm/CodeGen/Passes.h
===================================================================
--- include/llvm/CodeGen/Passes.h
+++ include/llvm/CodeGen/Passes.h
@@ -486,6 +486,10 @@
   /// inserting cmov instructions.
   extern char &EarlyIfConverterID;
 
+  /// This pass performs instruction combining using trace metrics to estimate
+  /// critical-path and resource depth.
+  extern char &MachineCombinerID;
+
   /// StackSlotColoring - This pass performs stack coloring and merging.
   /// It merges disjoint allocas to reduce the stack size.
   extern char &StackColoringID;
Index: include/llvm/CodeGen/TargetSchedule.h
===================================================================
--- include/llvm/CodeGen/TargetSchedule.h
+++ include/llvm/CodeGen/TargetSchedule.h
@@ -167,6 +167,7 @@
   /// if converter after moving it to TargetSchedModel).
   unsigned computeInstrLatency(const MachineInstr *MI,
                                bool UseDefaultDefLatency = true) const;
+  unsigned computeInstrLatency(unsigned Opcode) const;
 
   /// \brief Output dependency latency of a pair of defs of the same register.
   ///
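Note: MC_PATTERN is only forward-declared above; the target-specific definition
lives in AArch64MachineCombinerPattern.h, which the AArch64 part of this patch
includes but which is not reproduced in this excerpt. A minimal sketch of what
that header presumably contains, based on the enumerators the AArch64 code
below actually uses (the MC_NONE entry is an assumption):

namespace llvm {
namespace MachineCombinerPattern {
// Patterns the AArch64 combiner recognizes: a MUL feeding operand 1 or 2 of an
// ADD/SUB, in register and immediate forms, 32-bit (W) and 64-bit (X) variants.
enum MC_PATTERN : int {
  MC_NONE = 0,
  MC_MULADDW_OP1,  MC_MULADDW_OP2,  MC_MULADDX_OP1,  MC_MULADDX_OP2,
  MC_MULSUBW_OP1,  MC_MULSUBW_OP2,  MC_MULSUBX_OP1,  MC_MULSUBX_OP2,
  MC_MULADDWI_OP1, MC_MULADDXI_OP1, MC_MULSUBWI_OP1, MC_MULSUBXI_OP1
};
} // end namespace MachineCombinerPattern
} // end namespace llvm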
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -274,6 +274,7 @@
 void initializeBBVectorizePass(PassRegistry&);
 void initializeMachineFunctionPrinterPassPass(PassRegistry&);
 void initializeStackMapLivenessPass(PassRegistry&);
+void initializeMachineCombinerPass(PassRegistry &);
 void initializeLoadCombinePass(PassRegistry&);
 }
Index: include/llvm/Target/TargetInstrInfo.h
===================================================================
--- include/llvm/Target/TargetInstrInfo.h
+++ include/llvm/Target/TargetInstrInfo.h
@@ -15,9 +15,12 @@
 #define LLVM_TARGET_TARGETINSTRINFO_H
 
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/DFAPacketizer.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
 #include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
 
 namespace llvm {
 
@@ -563,6 +566,43 @@
                                       const SmallVectorImpl<unsigned> &Ops,
                                       MachineInstr* LoadMI) const;
 
+  /// hasPattern - return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in <Root>. All potential patterns are
+  /// listed in the <Pattern> array.
+  /// \param Root A binary instruction that could be combined with one of its
+  /// operands
+  /// \param Pattern Vector of possible combination patterns
+  virtual bool hasPattern(
+      MachineInstr &Root,
+      SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const {
+    return false;
+  }
+
+  /// genAlternativeCodeSequence - when hasPattern() finds a pattern
+  /// this function generates the instructions that could replace the
+  /// original code sequence. The client has to make the call whether
+  /// the actual replacement is beneficial or not.
+  /// \param Root - a binary instruction that could be combined with one of its
+  /// operands
+  /// \param P - a combination pattern for Root
+  /// \param InsInstrs - vector of new instructions that implement P
+  /// \param DelInstrs - old instructions, including Root, that could be
+  /// replaced by InsInstrs
+  /// \param InstrIdxForVirtReg - map of virtual register to instruction in
+  /// InsInstrs that defines it
+  virtual void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+    return;
+  }
+
+  virtual void insertMove(MachineInstr &Root, MachineOperand &Imm,
+                          unsigned NewVR) const {
+    return;
+  }
+
 protected:
   /// foldMemoryOperandImpl - Target-dependent implementation for
   /// foldMemoryOperand. Target-independent code in foldMemoryOperand will
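Note: a condensed sketch of how the generic pass added below is expected to
drive the two new hooks (abbreviated from combineInstructions() in
MachineCombiner.cpp later in this patch; the trace-metrics profitability checks
and the actual insertion/erasure of instructions are omitted here):

  SmallVector<MachineCombinerPattern::MC_PATTERN, 16> Patterns;
  if (TII->hasPattern(Root, Patterns)) {
    for (auto P : Patterns) {
      SmallVector<MachineInstr *, 16> InsInstrs, DelInstrs;
      DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
      TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
                                      InstrIdxForVirtReg);
      if (InsInstrs.empty())
        continue; // Target saw a pattern but could not build a replacement.
      // The caller now decides whether the new sequence is profitable and, if
      // so, inserts InsInstrs before Root and erases DelInstrs.
    }
  }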
Index: lib/CodeGen/CMakeLists.txt
===================================================================
--- lib/CodeGen/CMakeLists.txt
+++ lib/CodeGen/CMakeLists.txt
@@ -49,6 +49,7 @@
   MachineBranchProbabilityInfo.cpp
   MachineCSE.cpp
   MachineCodeEmitter.cpp
+  MachineCombiner.cpp
   MachineCopyPropagation.cpp
   MachineDominators.cpp
   MachineFunction.cpp
Index: lib/CodeGen/CodeGen.cpp
===================================================================
--- lib/CodeGen/CodeGen.cpp
+++ lib/CodeGen/CodeGen.cpp
@@ -41,6 +41,7 @@
   initializeMachineBlockPlacementPass(Registry);
   initializeMachineBlockPlacementStatsPass(Registry);
   initializeMachineCopyPropagationPass(Registry);
+  initializeMachineCombinerPass(Registry);
   initializeMachineCSEPass(Registry);
   initializeMachineDominatorTreePass(Registry);
   initializeMachinePostDominatorTreePass(Registry);
Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- /dev/null
+++ lib/CodeGen/MachineCombiner.cpp
@@ -0,0 +1,382 @@
+//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The machine combiner pass uses machine trace metrics to ensure that the
+// combined instructions do not lengthen the critical path or the resource
+// depth.
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "machine-combiner"
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+STATISTIC(NumInstCombined, "Number of machineinst combined");
+
+namespace {
+class MachineCombiner : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  const MCSchedModel *SchedModel;
+  MachineRegisterInfo *MRI;
+  MachineTraceMetrics *Traces;
+  MachineTraceMetrics::Ensemble *MinInstr;
+
+  TargetSchedModel TSchedModel;
+
+  /// OptSize - True if optimizing for code size.
+ bool OptSize; + +public: + static char ID; + MachineCombiner() : MachineFunctionPass(ID) { + initializeMachineCombinerPass(*PassRegistry::getPassRegistry()); + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + const char *getPassName() const override { return "Machine InstCombiner"; } + +private: + bool combineInstructions(MachineBasicBlock *); + MachineInstr *getOperandDef(const MachineOperand &MO); + bool preservesCriticalPathLen( + MachineBasicBlock *MBB, MachineInstr *Root, + MachineTraceMetrics::Trace BlockTrace, + SmallVector InsInstrs, + DenseMap &InstrIdxForVirtReg); + bool preservesResourceLen(MachineBasicBlock *MBB, + MachineTraceMetrics::Trace BlockTrace, + SmallVector InsInstrs, + SmallVector DelInstrs); +}; +} + +char MachineCombiner::ID = 0; +char &llvm::MachineCombinerID = MachineCombiner::ID; + +INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner", + "Machine InstCombiner", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) +INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner", + false, false) + +void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addPreserved(); + AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) { + MachineInstr *DefInstr = nullptr; + // We need a virtual register definition. + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) + DefInstr = MRI->getUniqueVRegDef(MO.getReg()); + // PHI's have no depth etc. + if (DefInstr && DefInstr->isPHI()) + DefInstr = nullptr; + return DefInstr; +} + +/// preservesCriticalPathlen - True when the new instruction sequence does +/// not lengthen the critical path. +/// The original code sequence ends in MI (Machine Instruction) Root. The new +/// code sequence ends in MI NewRoot. A necessary condition for the new sequence +/// to replace the old sequence is that is cannot lengthen the critical path. +/// This is decided by the formula (NewRootDepth + NewRootLatency) <= (RootDepth +/// + +/// RootLatency + RootSlack)). The slack is the number of cycles Root can be +/// delayed before the critical patch becomes longer. +bool MachineCombiner::preservesCriticalPathLen( + MachineBasicBlock *MBB, MachineInstr *Root, + MachineTraceMetrics::Trace BlockTrace, + SmallVector InsInstrs, + DenseMap &InstrIdxForVirtReg) { + + int Idx = -1; + assert(InsInstrs.size() < 16 && "Long unsupported pattern \n"); + unsigned InstrDepth[16]; + + // Foreach instruction in in the new sequence compute the depth based on the + // operands. Use the trace information when possible. For new operands which + // are tracked in the InstrIdxForVirtReg map depth is looked up in the local + // InstrDepth array. + for (auto *InstrPtr : InsInstrs) { // for each Use + unsigned IDepth = 0; + DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";); + for (unsigned i = 0, e = InstrPtr->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = InstrPtr->getOperand(i); + // Check for virtual register operand. 
+ if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) + continue; + if (!MO.isUse()) + continue; + unsigned DepthOp = 0; + unsigned LatencyOp = 0; + DenseMap::iterator II = + InstrIdxForVirtReg.find(MO.getReg()); + if (II != InstrIdxForVirtReg.end()) { + // Operand is new virtual register not in trace + assert(II->second >= 0 && II->second < 16 && "Bad Index"); + MachineInstr *DefInstr = InsInstrs[II->second]; + assert(DefInstr && + "There must be a definition for a new virtual register"); + DepthOp = InstrDepth[II->second]; + LatencyOp = TSchedModel.computeOperandLatency( + DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()), + InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg())); + } else { + MachineInstr *DefInstr = getOperandDef(MO); + if (DefInstr) { + DepthOp = BlockTrace.getInstrCycles(DefInstr).Depth; + LatencyOp = TSchedModel.computeOperandLatency( + DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()), + InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg())); + } + } + IDepth = std::max(IDepth, DepthOp + LatencyOp); + } + InstrDepth[++Idx] = IDepth; + } + + unsigned NewRootIdx = InsInstrs.size() - 1; + MachineInstr *NewRoot = InsInstrs[NewRootIdx]; + unsigned NewRootDepth = InstrDepth[NewRootIdx]; + unsigned NewRootLatency = 0; + // + // Compute latency of NewRoot as max of latency of defined operands + // + // Check each definition in NewRoot and compute the latency + for (unsigned i = 0, e = NewRoot->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = NewRoot->getOperand(i); + // Check for virtual register operand. + if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))) + continue; + if (!MO.isDef()) + continue; + // Get the first instruction that uses MO + MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg()); + RI++; + MachineInstr *UseMO = RI->getParent(); + unsigned LatencyOp = 0; + if (UseMO && BlockTrace.isDepInTrace(Root, UseMO)) { + LatencyOp = TSchedModel.computeOperandLatency( + NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO, + UseMO->findRegisterUseOperandIdx(MO.getReg())); + } else { + LatencyOp = TSchedModel.computeInstrLatency(NewRoot->getOpcode()); + } + NewRootLatency = std::max(NewRootLatency, LatencyOp); + } + + // Get depth, latency and slack of Root + unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth; + unsigned RootLatency = TSchedModel.computeInstrLatency(Root); + unsigned RootSlack = BlockTrace.getInstrSlack(Root); + + DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n"; + dbgs() << " NewRootDepth: " << NewRootDepth + << " NewRootLatency: " << NewRootLatency << "\n"; + dbgs() << " RootDepth: " << RootDepth << "RootLatency: " << RootLatency + << " RootSlack: " << RootSlack << "\n"; + dbgs() << " NewRootDepth + NewRootLatency " + << NewRootDepth + NewRootLatency << "\n"; + dbgs() << " RootDepth + RootLatency + RootSlack " + << RootDepth + RootLatency + RootSlack << "\n";); + + /// True when the new sequence does not lenghten the critical path. 
+ return ((NewRootDepth + NewRootLatency) <= + (RootDepth + RootLatency + RootSlack)); +} + +/// preservesResourceLen - True when the new instructions do not increase +/// resource length +bool MachineCombiner::preservesResourceLen( + MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace, + SmallVector InsInstrs, + SmallVector DelInstrs) { + + // Compute current resource length + + ArrayRef MBBarr(MBB); + unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr); + + // Compute new resource length + + // Deal with SC rather than Instructions. + SmallVector InsInstrsSC; + for (auto *InstrPtr : InsInstrs) { + unsigned Opc = InstrPtr->getOpcode(); + unsigned Idx = TII->get(Opc).getSchedClass(); + const MCSchedClassDesc *SC = SchedModel->getSchedClassDesc(Idx); + InsInstrsSC.push_back(SC); + } + + // Deal with SC rather than Instructions. + SmallVector DelInstrsSC; + for (auto *InstrPtr : DelInstrs) { + unsigned Opc = InstrPtr->getOpcode(); + unsigned Idx = TII->get(Opc).getSchedClass(); + const MCSchedClassDesc *SC = SchedModel->getSchedClassDesc(Idx); + DelInstrsSC.push_back(SC); + } + + ArrayRef MSCInsArr = makeArrayRef(InsInstrsSC); + ArrayRef MSCDelArr = makeArrayRef(DelInstrsSC); + + unsigned ResLenAfterCombine = + BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr); + + DEBUG(dbgs() << "RESOURCE DATA: \n"; + dbgs() << " resource len before: " << ResLenBeforeCombine + << " after: " << ResLenAfterCombine << "\n";); + + return ResLenAfterCombine <= ResLenBeforeCombine; +} +/// combineInstructions - substitute a slow code sequence with a faster one by +/// evaluating instruction combining pattern. +/// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction +/// combining based on machine trace metrics. Only combine a sequence of +/// instructions when this neither lengthens the critical path nor increases +/// resource pressure. When optimizing for codesize always combine when the new +/// sequence is shorter. +bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) { + bool Changed = false; + DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n"); + + auto BlockIter = MBB->begin(); + + while (BlockIter != MBB->end()) { + auto &MI = *BlockIter++; + + DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";); + SmallVector Pattern; + // The motivating example is: + // + // MUL Other MUL_op1 MUL_op2 Other + // \ / \ | / + // ADD/SUB => MADD/MSUB + // (=Root) (=NewRoot) + + // The original code always replace MUL + ADD/SUB by MADD. While this is + // usually benficial for code size it unfortunately can hurt performance + // when the ADD is on the critical path, but the MUL is not. With the + // substitution the MUL becomes part of the critical path (in form of the + // MADD) and can lengthen it on architectures where the MADD latency is + // longer than the ADD latency. + // + // For each instruction we check if it can be the root of a combiner + // pattern. Then for each pattern the new code sequence in form of MI is + // generated and evaluated. When the efficiency criteria (don't lengthen + // critical path, don't use more resources) is met the new sequence gets + // hooked up into the basic block before the old sequence is removed. + // + // The algorithm does not try to evaluate all patterns and pick the best. + // This is only an artificial restriction though. In practice there is + // mostly one pattern and hasPattern() can order patterns based on an + // internal cost heuristic. 
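  // Worked example of the check performed below, with made-up latencies (these
  // numbers are purely illustrative and not taken from any real schedule
  // model): assume MUL = 4 cycles, ADD = 1, MADD = 4, and RootSlack = 0.
  //  * If the MUL operands and the ADD's other operand C are ready at depth 0:
  //      old: RootDepth(ADD) = 4,  RootLatency = 1      -> 4 + 1 + 0  = 5
  //      new: NewRootDepth(MADD) = 0, NewRootLatency = 4 -> 0 + 4     = 4
  //    4 <= 5, so the MADD replacement is accepted.
  //  * If instead C only becomes available at depth 10:
  //      old: RootDepth = 10, RootLatency = 1           -> 10 + 1 + 0 = 11
  //      new: NewRootDepth = 10, NewRootLatency = 4     -> 10 + 4     = 14
  //    14 > 11, so the combine is rejected: the MUL is off the critical path,
  //    and folding it into the MADD would lengthen that path.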
+ + if (TII->hasPattern(MI, Pattern)) { + for (auto P : Pattern) { + SmallVector InsInstrs; + SmallVector DelInstrs; + DenseMap InstrIdxForVirtReg; + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB); + Traces->verifyAnalysis(); + TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs, + InstrIdxForVirtReg); + // Found pattern, but did not generate alternative sequence. + // This can happen e.g. when an immediate could not be materialized + // in a single instruction. + if (!InsInstrs.size()) + continue; + // Substitute when we optimize for codesize and the new sequence has + // fewer instructions OR + // the new sequence neither lenghten the critical path nor increases + // resource pressure. + if ((OptSize && (InsInstrs.size() < DelInstrs.size())) || + (preservesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, + InstrIdxForVirtReg) && + preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) { + for (auto *InstrPtr : InsInstrs) + MBB->insert((MachineBasicBlock::iterator) & MI, + (MachineInstr *)InstrPtr); + for (auto *InstrPtr : DelInstrs) + InstrPtr->eraseFromParent(); + + Changed = true; + ++NumInstCombined; + + Traces->invalidate(MBB); + Traces->verifyAnalysis(); + // Eagerly stop after the first pattern fired + break; + } else { + // Cleanup instructions of the alternative code sequence. There is no + // use for them. + for (auto *InstrPtr : InsInstrs) { + MachineFunction *MF = MBB->getParent(); + MF->DeleteMachineInstr((MachineInstr *)InstrPtr); + } + } + InstrIdxForVirtReg.clear(); + } + } + } + + return Changed; +} + +bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) { + TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); + const TargetSubtargetInfo &STI = + MF.getTarget().getSubtarget(); + SchedModel = STI.getSchedModel(); + TSchedModel.init(*SchedModel, &STI, TII); + MRI = &MF.getRegInfo(); + Traces = &getAnalysis(); + MinInstr = 0; + + OptSize = MF.getFunction()->getAttributes().hasAttribute( + AttributeSet::FunctionIndex, Attribute::OptimizeForSize); + + DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n'); + if (!TSchedModel.hasInstrSchedModel()) { + DEBUG(dbgs() << " Skipping pass: no machine model available\n"); + return false; + } + + bool Changed = false; + + // Try to combine instructions. + for (auto &MBB : MF) + Changed |= combineInstructions(&MBB); + + return Changed; +} Index: lib/CodeGen/MachineScheduler.cpp =================================================================== --- lib/CodeGen/MachineScheduler.cpp +++ lib/CodeGen/MachineScheduler.cpp @@ -40,6 +40,9 @@ cl::desc("Force top-down list scheduling")); cl::opt ForceBottomUp("misched-bottomup", cl::Hidden, cl::desc("Force bottom-up list scheduling")); +cl::opt +DumpCriticalPathLength("misched-dcpl", cl::Hidden, + cl::desc("Print critical path length to stdout")); } #ifndef NDEBUG @@ -451,6 +454,11 @@ else dbgs() << "End"; dbgs() << " RegionInstrs: " << NumRegionInstrs << " Remaining: " << RemainingInstrs << "\n"); + if (DumpCriticalPathLength) { + errs() << MF->getName(); + errs() << ":BB# " << MBB->getNumber(); + errs() << " " << MBB->getName() << " \n"; + } // Schedule a region: possibly reorder instructions. // This invalidates 'RegionEnd' and 'I'. 
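Note: with this patch applied, the new pass and the critical-path dump can be
exercised roughly as follows. Both options are the hidden cl::opts added in
this patch, -debug-only requires an asserts build, and the triple/cpu values
follow the test added below:

  llc -mtriple=aarch64-none-linux-gnu -mcpu=cyclone -aarch64-mcr=true \
      -misched-dcpl -debug-only=machine-combiner foo.ll -o foo.s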
@@ -2460,7 +2468,10 @@ if ((*I)->getDepth() > Rem.CriticalPath) Rem.CriticalPath = (*I)->getDepth(); } - DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n'); + DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n'); + if (DumpCriticalPathLength) { + errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n"; + } if (EnableCyclicPath) { Rem.CyclicCritPath = DAG->computeCyclicCriticalPath(); @@ -2902,7 +2913,10 @@ if ((*I)->getDepth() > Rem.CriticalPath) Rem.CriticalPath = (*I)->getDepth(); } - DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n'); + DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n'); + if (DumpCriticalPathLength) { + errs() << "Critical Path(PGS-RR ): " << Rem.CriticalPath << " \n"; + } } /// Apply a set of heursitics to a new candidate for PostRA scheduling. Index: lib/CodeGen/MachineTraceMetrics.cpp =================================================================== --- lib/CodeGen/MachineTraceMetrics.cpp +++ lib/CodeGen/MachineTraceMetrics.cpp @@ -1169,6 +1169,7 @@ return DepCycle; } +/// When bottom is set include instructions in current block in estimate. unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const { // Find the limiting processor resource. // Numbers have been pre-scaled to be comparable. @@ -1185,7 +1186,9 @@ // Convert to cycle count. PRMax = TE.MTM.getCycles(PRMax); + /// All instructions before current block unsigned Instrs = TBI.InstrDepth; + // plus instructions in current block if (Bottom) Instrs += TE.MTM.BlockInfo[getBlockNum()].InstrCount; if (unsigned IW = TE.MTM.SchedModel.getIssueWidth()) @@ -1194,10 +1197,10 @@ return std::max(Instrs, PRMax); } - -unsigned MachineTraceMetrics::Trace:: -getResourceLength(ArrayRef Extrablocks, - ArrayRef ExtraInstrs) const { +unsigned MachineTraceMetrics::Trace::getResourceLength( + ArrayRef Extrablocks, + ArrayRef ExtraInstrs, + ArrayRef RemoveInstrs) const { // Add up resources above and below the center block. ArrayRef PRDepths = TE.getProcResourceDepths(getBlockNum()); ArrayRef PRHeights = TE.getProcResourceHeights(getBlockNum()); @@ -1218,20 +1221,52 @@ PRCycles += (PI->Cycles * TE.MTM.SchedModel.getResourceFactor(K)); } } + for (unsigned I = 0; I != RemoveInstrs.size(); ++I) { + const MCSchedClassDesc *SC = RemoveInstrs[I]; + if (!SC->isValid()) + continue; + for (TargetSchedModel::ProcResIter + PI = TE.MTM.SchedModel.getWriteProcResBegin(SC), + PE = TE.MTM.SchedModel.getWriteProcResEnd(SC); + PI != PE; ++PI) { + if (PI->ProcResourceIdx != K) + continue; + PRCycles -= (PI->Cycles * TE.MTM.SchedModel.getResourceFactor(K)); + } + } PRMax = std::max(PRMax, PRCycles); } // Convert to cycle count. PRMax = TE.MTM.getCycles(PRMax); + // Instrs: #instructions in current trace outside current block. unsigned Instrs = TBI.InstrDepth + TBI.InstrHeight; + // Add instruction count from the extra blocks. for (unsigned i = 0, e = Extrablocks.size(); i != e; ++i) Instrs += TE.MTM.getResources(Extrablocks[i])->InstrCount; + /// FIXME: what about the new instructions? Since they will be + /// in the current block they get excluded here. Ok. So why + /// are they included in the calculation for PRMax above? + /// FIX proposal: + Instrs += ExtraInstrs.size(); + Instrs -= RemoveInstrs.size(); if (unsigned IW = TE.MTM.SchedModel.getIssueWidth()) Instrs /= IW; // Assume issue width 1 without a schedule model. 
return std::max(Instrs, PRMax); } +bool MachineTraceMetrics::Trace::isDepInTrace(const MachineInstr *DefMI, + const MachineInstr *UseMI) const { + if (DefMI->getParent() == UseMI->getParent()) + return true; + + const TraceBlockInfo &DepTBI = TE.BlockInfo[DefMI->getParent()->getNumber()]; + const TraceBlockInfo &TBI = TE.BlockInfo[UseMI->getParent()->getNumber()]; + + return DepTBI.isUsefulDominator(TBI); +} + void MachineTraceMetrics::Ensemble::print(raw_ostream &OS) const { OS << getName() << " ensemble:\n"; for (unsigned i = 0, e = BlockInfo.size(); i != e; ++i) { Index: lib/CodeGen/TargetSchedule.cpp =================================================================== --- lib/CodeGen/TargetSchedule.cpp +++ lib/CodeGen/TargetSchedule.cpp @@ -225,6 +225,28 @@ return DefMI->isTransient() ? 0 : TII->defaultDefLatency(&SchedModel, DefMI); } +unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const { + assert(hasInstrSchedModel() && "Only call this function with a SchedModel"); + + unsigned SCIdx = TII->get(Opcode).getSchedClass(); + const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SCIdx); + unsigned Latency = 0; + + if (SCDesc->isValid() && !SCDesc->isVariant()) { + for (unsigned DefIdx = 0, DefEnd = SCDesc->NumWriteLatencyEntries; + DefIdx != DefEnd; ++DefIdx) { + // Lookup the definition's write latency in SubtargetInfo. + const MCWriteLatencyEntry *WLEntry = + STI->getWriteLatencyEntry(SCDesc, DefIdx); + Latency = std::max(Latency, capLatency(WLEntry->Cycles)); + } + return Latency; + } + + assert(Latency && "No MI sched latency"); + return 0; +} + unsigned TargetSchedModel::computeInstrLatency(const MachineInstr *MI, bool UseDefaultDefLatency) const { Index: lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- lib/Target/AArch64/AArch64InstrFormats.td +++ lib/Target/AArch64/AArch64InstrFormats.td @@ -1327,14 +1327,15 @@ } multiclass MulAccum { + // MADD/MSUB generation is decided by MachineCombiner.cpp def Wrrr : BaseMulAccum, + [/*(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))*/]>, Sched<[WriteIM32, ReadIM, ReadIM, ReadIMA]> { let Inst{31} = 0; } def Xrrr : BaseMulAccum, + [/*(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))*/]>, Sched<[WriteIM64, ReadIM, ReadIM, ReadIMA]> { let Inst{31} = 1; } Index: lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.h +++ lib/Target/AArch64/AArch64InstrInfo.h @@ -17,6 +17,7 @@ #include "AArch64.h" #include "AArch64RegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/MachineCombinerPattern.h" #define GET_INSTRINFO_HEADER #include "AArch64GenInstrInfo.inc" @@ -153,6 +154,22 @@ bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, int CmpValue, const MachineRegisterInfo *MRI) const override; + /// hasPattern - return true when there is potentially a faster code sequence + /// for an instruction chain ending in . All potential pattern a listed + /// in the array. 
+ virtual bool hasPattern( + MachineInstr &Root, + SmallVectorImpl &Pattern) const; + + /// genAlternativeCodeSequence - when hasPattern() finds a pattern + /// this function generates the instructions that could replace the + /// original code sequence + virtual void genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) + const; private: void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL, Index: lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.cpp +++ lib/Target/AArch64/AArch64InstrInfo.cpp @@ -14,6 +14,7 @@ #include "AArch64InstrInfo.h" #include "AArch64Subtarget.h" #include "MCTargetDesc/AArch64AddressingModes.h" +#include "AArch64MachineCombinerPattern.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" @@ -652,17 +653,12 @@ return true; } -/// optimizeCompareInstr - Convert the instruction supplying the argument to the -/// comparison into one that sets the zero bit in the flags register. -bool AArch64InstrInfo::optimizeCompareInstr( - MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, - int CmpValue, const MachineRegisterInfo *MRI) const { - - // Replace SUBSWrr with SUBWrr if NZCV is not used. - int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true); - if (Cmp_NZCV != -1) { +/// convertFlagSettingOpcode - return opcode that does not +/// set flags when possible. The caller is responsible to do +/// the actual substitution and legality checking. +static unsigned convertFlagSettingOpcode(MachineInstr *MI) { unsigned NewOpc; - switch (CmpInstr->getOpcode()) { + switch (MI->getOpcode()) { default: return false; case AArch64::ADDSWrr: NewOpc = AArch64::ADDWrr; break; @@ -682,7 +678,22 @@ case AArch64::SUBSXrs: NewOpc = AArch64::SUBXrs; break; case AArch64::SUBSXrx: NewOpc = AArch64::SUBXrx; break; } + return NewOpc; +} + +/// optimizeCompareInstr - Convert the instruction supplying the argument to the +/// comparison into one that sets the zero bit in the flags register. +bool AArch64InstrInfo::optimizeCompareInstr( + MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask, + int CmpValue, const MachineRegisterInfo *MRI) const { + // Replace SUBSWrr with SUBWrr if NZCV is not used. + int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true); + if (Cmp_NZCV != -1) { + unsigned Opc = CmpInstr->getOpcode(); + unsigned NewOpc = convertFlagSettingOpcode(CmpInstr); + if (NewOpc == Opc) + return false; const MCInstrDesc &MCID = get(NewOpc); CmpInstr->setDesc(MCID); CmpInstr->RemoveOperand(Cmp_NZCV); @@ -2087,3 +2098,417 @@ NopInst.setOpcode(AArch64::HINT); NopInst.addOperand(MCOperand::CreateImm(0)); } +// +// True when Opc sets flag +static bool isCombineInstrSettingFlag(unsigned Opc) { + switch (Opc) { + case AArch64::ADDSWrr: + case AArch64::ADDSWri: + case AArch64::ADDSXrr: + case AArch64::ADDSXri: + case AArch64::SUBSWrr: + case AArch64::SUBSXrr: + // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
+ case AArch64::SUBSWri: + case AArch64::SUBSXri: + return true; + default: + break; + } + return false; +} +// +// 32b Opcodes that can be combined with a MUL +static bool isCombineInstrCandidate32(unsigned Opc) { + switch (Opc) { + case AArch64::ADDWrr: + case AArch64::ADDWri: + case AArch64::SUBWrr: + case AArch64::ADDSWrr: + case AArch64::ADDSWri: + case AArch64::SUBSWrr: + // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. + case AArch64::SUBWri: + case AArch64::SUBSWri: + return true; + default: + break; + } + return false; +} +// +// 64b Opcodes that can be combined with a MUL +static bool isCombineInstrCandidate64(unsigned Opc) { + switch (Opc) { + case AArch64::ADDXrr: + case AArch64::ADDXri: + case AArch64::SUBXrr: + case AArch64::ADDSXrr: + case AArch64::ADDSXri: + case AArch64::SUBSXrr: + // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. + case AArch64::SUBXri: + case AArch64::SUBSXri: + return true; + default: + break; + } + return false; +} +// +// Opcodes that can be combined with a MUL +static bool isCombineInstrCandidate(unsigned Opc) { + return (isCombineInstrCandidate32(Opc) || isCombineInstrCandidate64(Opc)); +} + +static bool canCombineWithMUL(MachineBasicBlock &MBB, MachineOperand &MO, + unsigned MulOpc, unsigned ZeroReg) { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineInstr *MI = nullptr; + // We need a virtual register definition. + if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) + MI = MRI.getUniqueVRegDef(MO.getReg()); + // And it needs to be in the trace (otherwise, it won't have a depth). + if (!MI || MI->getParent() != &MBB || (unsigned)MI->getOpcode() != MulOpc) + return false; + + assert(MI->getNumOperands() >= 4 && MI->getOperand(0).isReg() && + MI->getOperand(1).isReg() && MI->getOperand(2).isReg() && + MI->getOperand(3).isReg() && "MAdd/MSub must have a least 4 regs"); + + // The third input reg must be zero. + if (MI->getOperand(3).getReg() != ZeroReg) + return false; + + // Must only used by the user we combine with. + if (!MRI.hasOneNonDBGUse(MI->getOperand(0).getReg())) + return false; + + return true; +} + +/// hasPattern - return true when there is potentially a faster code sequence +/// for an instruction chain ending in . All potential pattern a listed +/// in the array. +bool AArch64InstrInfo::hasPattern( + MachineInstr &Root, + SmallVectorImpl &Pattern) const { + unsigned Opc = Root.getOpcode(); + MachineBasicBlock &MBB = *Root.getParent(); + bool Found = false; + + if (!isCombineInstrCandidate(Opc)) + return 0; + // if (Root.isCompare()) { + if (isCombineInstrSettingFlag(Opc)) { + int Cmp_NZCV = Root.findRegisterDefOperandIdx(AArch64::NZCV, true); + // When NZCV is live bail out. + if (Cmp_NZCV == -1) + return 0; + unsigned NewOpc = convertFlagSettingOpcode(&Root); + // When opcode can't change bail out. + // CHECKME: do we miss any cases were we miss + // opcode conversion? 
+ if (NewOpc == Opc) + return 0; + Opc = NewOpc; + } + + switch (Opc) { + default: + break; + case AArch64::ADDWrr: + assert(Root.getOperand(1).isReg() && Root.getOperand(2).isReg() && + "ADDWrr does not have register operands"); + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, + AArch64::WZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULADDW_OP1); + Found = true; + } + if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, + AArch64::WZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULADDW_OP2); + Found = true; + } + break; + case AArch64::ADDXrr: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, + AArch64::XZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULADDX_OP1); + Found = true; + } + if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, + AArch64::XZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULADDX_OP2); + Found = true; + } + break; + case AArch64::SUBWrr: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, + AArch64::WZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULSUBW_OP1); + Found = true; + } + if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDWrrr, + AArch64::WZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULSUBW_OP2); + Found = true; + } + break; + case AArch64::SUBXrr: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, + AArch64::XZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULSUBX_OP1); + Found = true; + } + if (canCombineWithMUL(MBB, Root.getOperand(2), AArch64::MADDXrrr, + AArch64::XZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULSUBX_OP2); + Found = true; + } + break; + case AArch64::ADDWri: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, + AArch64::WZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULADDWI_OP1); + Found = true; + } + break; + case AArch64::ADDXri: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, + AArch64::XZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULADDXI_OP1); + Found = true; + } + break; + case AArch64::SUBWri: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDWrrr, + AArch64::WZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULSUBWI_OP1); + Found = true; + } + break; + case AArch64::SUBXri: + if (canCombineWithMUL(MBB, Root.getOperand(1), AArch64::MADDXrrr, + AArch64::XZR)) { + Pattern.push_back(MachineCombinerPattern::MC_MULSUBXI_OP1); + Found = true; + } + break; + } + return Found; +} + +static MachineInstr *genMadd(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl &InsInstrs, + unsigned IdxMulOpd, unsigned MaddOpc) { + assert(IdxMulOpd == 1 || IdxMulOpd == 2); + + unsigned IdxOtherOpd = IdxMulOpd == 1 ? 
2 : 1; + MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); + MachineOperand R = Root.getOperand(0); + MachineOperand A = MUL->getOperand(1); + MachineOperand B = MUL->getOperand(2); + MachineOperand C = Root.getOperand(IdxOtherOpd); + MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc)) + .addOperand(R) + .addOperand(A) + .addOperand(B) + .addOperand(C); + // (potentially) insert the MADD + InsInstrs.push_back(MIB); + return MUL; +} + +static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl &InsInstrs, + unsigned IdxMulOpd, unsigned MaddOpc, + unsigned VR) { + assert(IdxMulOpd == 1 || IdxMulOpd == 2); + + MachineInstr *MUL = MRI.getUniqueVRegDef(Root.getOperand(IdxMulOpd).getReg()); + MachineOperand R = Root.getOperand(0); + MachineOperand A = MUL->getOperand(1); + MachineOperand B = MUL->getOperand(2); + MachineInstrBuilder MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MaddOpc)) + .addOperand(R) + .addOperand(A) + .addOperand(B) + .addReg(VR); + // (potentially) insert the MADD + InsInstrs.push_back(MIB); + return MUL; +} +/// genAlternativeCodeSequence - when hasPattern() finds a pattern +/// this function generates the instructions that could replace the +/// original code sequence +void AArch64InstrInfo::genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern::MC_PATTERN Pattern, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) + const { + MachineBasicBlock &MBB = *Root.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineFunction &MF = *MBB.getParent(); + const TargetInstrInfo *TII = MF.getTarget().getInstrInfo(); + + MachineInstr *MUL; + unsigned Opc; + switch (Pattern) { + default: + // signal error. + break; + case MachineCombinerPattern::MC_MULADDW_OP1: + case MachineCombinerPattern::MC_MULADDX_OP1: + // MUL I=A,B,0 + // ADD R,I,C + // ==> MADD R,A,B,C + // --- Create(MADD); + Opc = Pattern == MachineCombinerPattern::MC_MULADDW_OP1 ? AArch64::MADDWrrr + : AArch64::MADDXrrr; + MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 1, Opc); + break; + case MachineCombinerPattern::MC_MULADDW_OP2: + case MachineCombinerPattern::MC_MULADDX_OP2: + // MUL I=A,B,0 + // ADD R,C,I + // ==> MADD R,A,B,C + // --- Create(MADD); + Opc = Pattern == MachineCombinerPattern::MC_MULADDW_OP2 ? 
AArch64::MADDWrrr + : AArch64::MADDXrrr; + MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc); + break; + case MachineCombinerPattern::MC_MULADDWI_OP1: + case MachineCombinerPattern::MC_MULADDXI_OP1: + // MUL I=A,B,0 + // ADD R,I,Imm + // ==> ORR V, ZR, Imm + // ==> MADD R,A,B,V + // --- Create(MADD); + { + const TargetRegisterClass *RC = + MRI.getRegClass(Root.getOperand(1).getReg()); + unsigned NewVR = MRI.createVirtualRegister(RC); + unsigned BitSize, OrrOpc, ZeroReg; + if (Pattern == MachineCombinerPattern::MC_MULADDWI_OP1) { + BitSize = 32; + OrrOpc = AArch64::ORRWri; + ZeroReg = AArch64::WZR; + Opc = AArch64::MADDWrrr; + } else { + OrrOpc = AArch64::ORRXri; + BitSize = 64; + ZeroReg = AArch64::XZR; + Opc = AArch64::MADDXrrr; + } + uint64_t Imm = Root.getOperand(2).getImm(); + + if (Root.getOperand(3).isImm()) { + unsigned val = Root.getOperand(3).getImm(); + Imm = Imm << val; + } + uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + + if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc)) + .addOperand(MachineOperand::CreateReg(NewVR, RegState::Define)) + .addReg(ZeroReg) + .addImm(Encoding); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR,0)); + MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR); + } + } + break; + case MachineCombinerPattern::MC_MULSUBW_OP1: + case MachineCombinerPattern::MC_MULSUBX_OP1: { + // MUL I=A,B,0 + // SUB R,I, C + // ==> SUB V, 0, C + // ==> MADD R,A,B,V // = -C + A*B + // --- Create(MADD); + const TargetRegisterClass *RC = + MRI.getRegClass(Root.getOperand(1).getReg()); + unsigned NewVR = MRI.createVirtualRegister(RC); + unsigned SubOpc, ZeroReg; + if (Pattern == MachineCombinerPattern::MC_MULSUBW_OP1) { + SubOpc = AArch64::SUBWrr; + ZeroReg = AArch64::WZR; + Opc = AArch64::MADDWrrr; + } else { + SubOpc = AArch64::SUBXrr; + ZeroReg = AArch64::XZR; + Opc = AArch64::MADDXrrr; + } + // SUB NewVR, 0, C + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(SubOpc)) + .addOperand(MachineOperand::CreateReg(NewVR, RegState::Define)) + .addReg(ZeroReg) + .addOperand(Root.getOperand(2)); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR,0)); + MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR); + } break; + case MachineCombinerPattern::MC_MULSUBW_OP2: + case MachineCombinerPattern::MC_MULSUBX_OP2: + // MUL I=A,B,0 + // SUB R,C,I + // ==> MSUB R,A,B,C (computes C - A*B) + // --- Create(MSUB); + Opc = Pattern == MachineCombinerPattern::MC_MULSUBW_OP2 ? 
AArch64::MSUBWrrr + : AArch64::MSUBXrrr; + MUL = genMadd(MF, MRI, TII, Root, InsInstrs, 2, Opc); + break; + case MachineCombinerPattern::MC_MULSUBWI_OP1: + case MachineCombinerPattern::MC_MULSUBXI_OP1: { + // MUL I=A,B,0 + // SUB R,I, Imm + // ==> ORR V, ZR, -Imm + // ==> MADD R,A,B,V // = -Imm + A*B + // --- Create(MADD); + const TargetRegisterClass *RC = + MRI.getRegClass(Root.getOperand(1).getReg()); + unsigned NewVR = MRI.createVirtualRegister(RC); + unsigned BitSize, OrrOpc, ZeroReg; + if (Pattern == MachineCombinerPattern::MC_MULSUBWI_OP1) { + BitSize = 32; + OrrOpc = AArch64::ORRWri; + ZeroReg = AArch64::WZR; + Opc = AArch64::MADDWrrr; + } else { + OrrOpc = AArch64::ORRXri; + BitSize = 64; + ZeroReg = AArch64::XZR; + Opc = AArch64::MADDXrrr; + } + int Imm = Root.getOperand(2).getImm(); + if (Root.getOperand(3).isImm()) { + unsigned val = Root.getOperand(3).getImm(); + Imm = Imm << val; + } + uint64_t UImm = -Imm << (64 - BitSize) >> (64 - BitSize); + uint64_t Encoding; + if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) { + MachineInstrBuilder MIB1 = + BuildMI(MF, Root.getDebugLoc(), TII->get(OrrOpc)) + .addOperand(MachineOperand::CreateReg(NewVR, RegState::Define)) + .addReg(ZeroReg) + .addImm(Encoding); + InsInstrs.push_back(MIB1); + InstrIdxForVirtReg.insert(std::make_pair(NewVR,0)); + MUL = genMaddR(MF, MRI, TII, Root, InsInstrs, 1, Opc, NewVR); + } + } break; + } + // (potentially) delete the MUL and ADD/SUB + DelInstrs.push_back(MUL); + DelInstrs.push_back(&Root); + + return; +} Index: lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetMachine.cpp +++ lib/Target/AArch64/AArch64TargetMachine.cpp @@ -24,6 +24,10 @@ EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"), cl::init(true), cl::Hidden); +static cl::opt EnableMCR("aarch64-mcr", + cl::desc("Enable the machine combiner pass"), + cl::init(true), cl::Hidden); + static cl::opt EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"), cl::init(true), cl::Hidden); @@ -176,6 +180,8 @@ bool AArch64PassConfig::addILPOpts() { if (EnableCCMP) addPass(createAArch64ConditionalCompares()); + if (EnableMCR) + addPass(&MachineCombinerID); addPass(&EarlyIfConverterID); if (EnableStPairSuppress) addPass(createAArch64StorePairSuppressPass()); Index: test/CodeGen/AArch64/aarch64-neon-mul-div.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/aarch64-neon-mul-div.ll @@ -0,0 +1,784 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -mcpu=cyclone | FileCheck %s +; arm64 has its own copy of this because of the intrinsics + +define <8 x i8> @mul8xi8(<8 x i8> %A, <8 x i8> %B) { +; CHECK-LABEL: mul8xi8: +; CHECK: mul {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = mul <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @mul16xi8(<16 x i8> %A, <16 x i8> %B) { +; CHECK-LABEL: mul16xi8: +; CHECK: mul {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = mul <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <4 x i16> @mul4xi16(<4 x i16> %A, <4 x i16> %B) { +; CHECK-LABEL: mul4xi16: +; CHECK: mul {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = mul <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @mul8xi16(<8 x i16> %A, <8 x i16> %B) { +; CHECK-LABEL: mul8xi16: +; CHECK: mul {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = mul <8 x i16> %A, %B; + ret 
<8 x i16> %tmp3 +} + +define <2 x i32> @mul2xi32(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: mul2xi32: +; CHECK: mul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = mul <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @mul4x32(<4 x i32> %A, <4 x i32> %B) { +; CHECK-LABEL: mul4x32: +; CHECK: mul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = mul <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + +define <1 x i64> @mul1xi64(<1 x i64> %A, <1 x i64> %B) { +; CHECK-LABEL: mul1xi64: +; CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}} + %tmp3 = mul <1 x i64> %A, %B; + ret <1 x i64> %tmp3 +} + +define <2 x i64> @mul2xi64(<2 x i64> %A, <2 x i64> %B) { +; CHECK-LABEL: mul2xi64: +; CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}} +; CHECK: mul x{{[0-9]+}}, x{{[0-9]+}}, x{{[0-9]+}} + %tmp3 = mul <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + + define <2 x float> @mul2xfloat(<2 x float> %A, <2 x float> %B) { +; CHECK-LABEL: mul2xfloat: +; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = fmul <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @mul4xfloat(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: mul4xfloat: +; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = fmul <4 x float> %A, %B; + ret <4 x float> %tmp3 +} +define <2 x double> @mul2xdouble(<2 x double> %A, <2 x double> %B) { +; CHECK-LABEL: mul2xdouble: +; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = fmul <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + + + define <2 x float> @div2xfloat(<2 x float> %A, <2 x float> %B) { +; CHECK-LABEL: div2xfloat: +; CHECK: fdiv {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = fdiv <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @div4xfloat(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: div4xfloat: +; CHECK: fdiv {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = fdiv <4 x float> %A, %B; + ret <4 x float> %tmp3 +} +define <2 x double> @div2xdouble(<2 x double> %A, <2 x double> %B) { +; CHECK-LABEL: div2xdouble: +; CHECK: fdiv {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = fdiv <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + +define <1 x i8> @sdiv1x8(<1 x i8> %A, <1 x i8> %B) { +; CHECK-LABEL: sdiv1x8: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <1 x i8> %A, %B; + ret <1 x i8> %tmp3 +} + +define <8 x i8> @sdiv8x8(<8 x i8> %A, <8 x i8> %B) { +; CHECK-LABEL: sdiv8x8: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @sdiv16x8(<16 x i8> %A, <16 x i8> %B) { +; CHECK-LABEL: sdiv16x8: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, 
{{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <1 x i16> @sdiv1x16(<1 x i16> %A, <1 x i16> %B) { +; CHECK-LABEL: sdiv1x16: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <1 x i16> %A, %B; + ret <1 x i16> %tmp3 +} + +define <4 x i16> @sdiv4x16(<4 x i16> %A, <4 x i16> %B) { +; CHECK-LABEL: sdiv4x16: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @sdiv8x16(<8 x i16> %A, <8 x i16> %B) { +; CHECK-LABEL: sdiv8x16: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <1 x i32> @sdiv1x32(<1 x i32> %A, <1 x i32> %B) { +; CHECK-LABEL: sdiv1x32: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <1 x i32> %A, %B; + ret <1 x i32> %tmp3 +} + +define <2 x i32> @sdiv2x32(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: sdiv2x32: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @sdiv4x32(<4 x i32> %A, <4 x i32> %B) { +; CHECK-LABEL: sdiv4x32: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = sdiv <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + +define <1 x i64> @sdiv1x64(<1 x i64> %A, <1 x i64> %B) { +; CHECK-LABEL: sdiv1x64: +; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = sdiv <1 x i64> %A, %B; + ret <1 x i64> %tmp3 +} + +define <2 x i64> @sdiv2x64(<2 x i64> %A, <2 x i64> %B) { +; CHECK-LABEL: sdiv2x64: +; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = sdiv <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + +define <1 x i8> @udiv1x8(<1 x i8> %A, <1 x i8> %B) { +; CHECK-LABEL: udiv1x8: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <1 x i8> %A, %B; + ret <1 x i8> %tmp3 +} + +define <8 x i8> @udiv8x8(<8 x i8> %A, <8 x i8> %B) { +; CHECK-LABEL: udiv8x8: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> 
@udiv16x8(<16 x i8> %A, <16 x i8> %B) { +; CHECK-LABEL: udiv16x8: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <1 x i16> @udiv1x16(<1 x i16> %A, <1 x i16> %B) { +; CHECK-LABEL: udiv1x16: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <1 x i16> %A, %B; + ret <1 x i16> %tmp3 +} + +define <4 x i16> @udiv4x16(<4 x i16> %A, <4 x i16> %B) { +; CHECK-LABEL: udiv4x16: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @udiv8x16(<8 x i16> %A, <8 x i16> %B) { +; CHECK-LABEL: udiv8x16: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <1 x i32> @udiv1x32(<1 x i32> %A, <1 x i32> %B) { +; CHECK-LABEL: udiv1x32: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <1 x i32> %A, %B; + ret <1 x i32> %tmp3 +} + +define <2 x i32> @udiv2x32(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: udiv2x32: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @udiv4x32(<4 x i32> %A, <4 x i32> %B) { +; CHECK-LABEL: udiv4x32: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = udiv <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + +define <1 x i64> @udiv1x64(<1 x i64> %A, <1 x i64> %B) { +; CHECK-LABEL: udiv1x64: +; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = udiv <1 x i64> %A, %B; + ret <1 x i64> %tmp3 +} + +define <2 x i64> @udiv2x64(<2 x i64> %A, <2 x i64> %B) { +; CHECK-LABEL: udiv2x64: +; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = udiv <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + +define <1 x i8> @srem1x8(<1 x i8> %A, <1 x i8> %B) { +; CHECK-LABEL: srem1x8: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = 
srem <1 x i8> %A, %B; + ret <1 x i8> %tmp3 +} + +define <8 x i8> @srem8x8(<8 x i8> %A, <8 x i8> %B) { +; CHECK-LABEL: srem8x8: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @srem16x8(<16 x i8> %A, <16 x i8> %B) { +; CHECK-LABEL: srem16x8: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <1 x i16> @srem1x16(<1 x i16> %A, <1 x i16> %B) { +; CHECK-LABEL: srem1x16: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <1 x i16> %A, %B; + ret <1 x i16> %tmp3 +} + +define <4 x i16> @srem4x16(<4 x i16> %A, <4 x i16> %B) { +; CHECK-LABEL: srem4x16: +; CHECK: sdiv 
{{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @srem8x16(<8 x i16> %A, <8 x i16> %B) { +; CHECK-LABEL: srem8x16: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <1 x i32> @srem1x32(<1 x i32> %A, <1 x i32> %B) { +; CHECK-LABEL: srem1x32: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <1 x i32> %A, %B; + ret <1 x i32> %tmp3 +} + +define <2 x i32> @srem2x32(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: srem2x32: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @srem4x32(<4 x i32> %A, <4 x i32> %B) { +; CHECK-LABEL: srem4x32: +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: sdiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = srem <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + +define <1 x i64> @srem1x64(<1 x i64> %A, <1 x i64> %B) { +; CHECK-LABEL: srem1x64: +; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = srem <1 x i64> %A, %B; + ret <1 x i64> %tmp3 +} + +define <2 x i64> @srem2x64(<2 x i64> %A, <2 x i64> %B) { +; CHECK-LABEL: srem2x64: +; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +; CHECK: sdiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = srem <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + +define <1 x i8> 
@urem1x8(<1 x i8> %A, <1 x i8> %B) { +; CHECK-LABEL: urem1x8: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <1 x i8> %A, %B; + ret <1 x i8> %tmp3 +} + +define <8 x i8> @urem8x8(<8 x i8> %A, <8 x i8> %B) { +; CHECK-LABEL: urem8x8: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @urem16x8(<16 x i8> %A, <16 x i8> %B) { +; CHECK-LABEL: urem16x8: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <1 x i16> @urem1x16(<1 x i16> %A, <1 x i16> %B) { +; CHECK-LABEL: urem1x16: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, 
{{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <1 x i16> %A, %B; + ret <1 x i16> %tmp3 +} + +define <4 x i16> @urem4x16(<4 x i16> %A, <4 x i16> %B) { +; CHECK-LABEL: urem4x16: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @urem8x16(<8 x i16> %A, <8 x i16> %B) { +; CHECK-LABEL: urem8x16: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <1 x i32> @urem1x32(<1 x i32> %A, <1 x i32> %B) { +; CHECK-LABEL: urem1x32: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <1 x i32> %A, %B; + ret <1 x i32> %tmp3 +} + +define <2 x i32> @urem2x32(<2 x i32> %A, <2 x i32> %B) { +; CHECK-LABEL: urem2x32: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @urem4x32(<4 x i32> %A, <4 x i32> %B) { +; CHECK-LABEL: urem4x32: +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: udiv {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; CHECK: msub {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} + %tmp3 = urem <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + +define <1 x i64> @urem1x64(<1 x i64> %A, <1 x i64> %B) { +; CHECK-LABEL: urem1x64: +; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = urem <1 x i64> %A, %B; + ret <1 x i64> %tmp3 +} + +define <2 x i64> @urem2x64(<2 x i64> %A, <2 x i64> %B) { +; CHECK-LABEL: urem2x64: +; CHECK: udiv {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +; CHECK: udiv 
{{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} +; CHECK: msub {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}} + %tmp3 = urem <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + +define <2 x float> @frem2f32(<2 x float> %A, <2 x float> %B) { +; CHECK-LABEL: frem2f32: +; CHECK: bl fmodf +; CHECK: bl fmodf + %tmp3 = frem <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @frem4f32(<4 x float> %A, <4 x float> %B) { +; CHECK-LABEL: frem4f32: +; CHECK: bl fmodf +; CHECK: bl fmodf +; CHECK: bl fmodf +; CHECK: bl fmodf + %tmp3 = frem <4 x float> %A, %B; + ret <4 x float> %tmp3 +} + +define <1 x double> @frem1d64(<1 x double> %A, <1 x double> %B) { +; CHECK-LABEL: frem1d64: +; CHECK: bl fmod + %tmp3 = frem <1 x double> %A, %B; + ret <1 x double> %tmp3 +} + +define <2 x double> @frem2d64(<2 x double> %A, <2 x double> %B) { +; CHECK-LABEL: frem2d64: +; CHECK: bl fmod +; CHECK: bl fmod + %tmp3 = frem <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + +declare <8 x i8> @llvm.arm64.neon.pmul.v8i8(<8 x i8>, <8 x i8>) +declare <16 x i8> @llvm.arm64.neon.pmul.v16i8(<16 x i8>, <16 x i8>) + +define <8 x i8> @poly_mulv8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK-LABEL: poly_mulv8i8: + %prod = call <8 x i8> @llvm.arm64.neon.pmul.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) + ret <8 x i8> %prod +} + +define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK-LABEL: poly_mulv16i8: + %prod = call <16 x i8> @llvm.arm64.neon.pmul.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) + ret <16 x i8> %prod +} + +declare <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i16> @test_sqdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK-LABEL: test_sqdmulh_v4i16: + %prod = call <4 x i16> @llvm.arm64.neon.sqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + ret <4 x i16> %prod +} + +define <8 x i16> @test_sqdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK-LABEL: test_sqdmulh_v8i16: + %prod = call <8 x i16> @llvm.arm64.neon.sqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + ret <8 x i16> %prod +} + +define <2 x i32> @test_sqdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK-LABEL: test_sqdmulh_v2i32: + %prod = call <2 x i32> @llvm.arm64.neon.sqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + ret <2 x i32> %prod +} + +define <4 x i32> @test_sqdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK-LABEL: test_sqdmulh_v4i32: + %prod = call <4 x i32> @llvm.arm64.neon.sqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + ret <4 x i32> %prod +} + +declare <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i16> @test_sqrdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK-LABEL: test_sqrdmulh_v4i16: + %prod = call <4 x i16> @llvm.arm64.neon.sqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + ret <4 x i16> %prod +} + +define <8 x i16> @test_sqrdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK-LABEL: test_sqrdmulh_v8i16: + %prod = call <8 x i16> @llvm.arm64.neon.sqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + ret <8 x i16> %prod +} + +define <2 x i32> @test_sqrdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmulh_v2i32: + %prod = 
call <2 x i32> @llvm.arm64.neon.sqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + ret <2 x i32> %prod +} + +define <4 x i32> @test_sqrdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmulh_v4i32: + %prod = call <4 x i32> @llvm.arm64.neon.sqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + ret <4 x i32> %prod +} + +declare <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @fmulx_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK-LABEL: fmulx_v2f32: +; Using registers other than v0, v1 and v2 is possible, but would be odd. + %val = call <2 x float> @llvm.arm64.neon.fmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs) + ret <2 x float> %val +} + +define <4 x float> @fmulx_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK-LABEL: fmulx_v4f32: +; Using registers other than v0, v1 and v2 is possible, but would be odd. + %val = call <4 x float> @llvm.arm64.neon.fmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs) + ret <4 x float> %val +} + +define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK-LABEL: fmulx_v2f64: +; Using registers other than v0, v1 and v2 is possible, but would be odd. + %val = call <2 x double> @llvm.arm64.neon.fmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs) + ret <2 x double> %val +} + Index: test/CodeGen/AArch64/early-ifcvt.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/early-ifcvt.ll @@ -0,0 +1,549 @@ +; RUN: llc < %s -stress-early-ifcvt -mcpu=cyclone | FileCheck %s +target triple = "aarch64-apple-ios" + +; CHECK: mm2 +define i32 @mm2(i32* nocapture %p, i32 %n) nounwind uwtable readonly ssp { +entry: + br label %do.body + +; CHECK: do.body +; Loop body has no branches before the backedge.
+; CHECK-NOT: LBB +do.body: + %max.0 = phi i32 [ 0, %entry ], [ %max.1, %do.cond ] + %min.0 = phi i32 [ 0, %entry ], [ %min.1, %do.cond ] + %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.cond ] + %p.addr.0 = phi i32* [ %p, %entry ], [ %incdec.ptr, %do.cond ] + %incdec.ptr = getelementptr inbounds i32* %p.addr.0, i64 1 + %0 = load i32* %p.addr.0, align 4 + %cmp = icmp sgt i32 %0, %max.0 + br i1 %cmp, label %do.cond, label %if.else + +if.else: + %cmp1 = icmp slt i32 %0, %min.0 + %.min.0 = select i1 %cmp1, i32 %0, i32 %min.0 + br label %do.cond + +do.cond: + %max.1 = phi i32 [ %0, %do.body ], [ %max.0, %if.else ] + %min.1 = phi i32 [ %min.0, %do.body ], [ %.min.0, %if.else ] +; CHECK: cbnz + %dec = add i32 %n.addr.0, -1 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %do.end, label %do.body + +do.end: + %sub = sub nsw i32 %max.1, %min.1 + ret i32 %sub +} + +; CHECK-LABEL: fold_inc_true_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinc w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @fold_inc_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %inc = add nsw i32 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %y, %eq_bb ], [ %inc, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inc_true_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinc x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @fold_inc_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inc = add nsw i64 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %y, %eq_bb ], [ %inc, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_inc_false_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinc w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @fold_inc_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %inc = add nsw i32 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %inc, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inc_false_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinc x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @fold_inc_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inc = add nsw i64 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %inc, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_inv_true_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinv w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @fold_inv_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %inv = xor i32 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %y, %eq_bb ], [ %inv, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inv_true_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinv x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @fold_inv_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inv = xor i64 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %y, %eq_bb ], [ %inv, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_inv_false_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinv w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @fold_inv_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp 
eq i32 %c, 1 + %inv = xor i32 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %inv, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inv_false_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinv x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @fold_inv_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inv = xor i64 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %inv, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_neg_true_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csneg w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @fold_neg_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %neg = sub nsw i32 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %y, %eq_bb ], [ %neg, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_neg_true_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csneg x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @fold_neg_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %neg = sub nsw i64 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %y, %eq_bb ], [ %neg, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_neg_false_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csneg w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @fold_neg_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %neg = sub nsw i32 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %neg, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_neg_false_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csneg x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @fold_neg_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %neg = sub nsw i64 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %neg, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: cbnz_32 +; CHECK: {{subs.*wzr,|cmp}} w2, #0 +; CHECK-NEXT: csel w0, w0, w1, eq +; CHECK-NEXT: ret +define i32 @cbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: cbnz_64 +; CHECK: {{subs.*xzr,|cmp}} x2, #0 +; CHECK-NEXT: csel x0, x0, x1, eq +; CHECK-NEXT: ret +define i64 @cbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: cbz_32 +; CHECK: {{subs.*wzr,|cmp}} w2, #0 +; CHECK-NEXT: csel +; CHECK-NEXT: ret +define i32 @cbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp ne i32 %c, 0 + br i1 %tobool, label %ne_bb, label %done + +ne_bb: + br label %done + +done: + %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: cbz_64 +; CHECK: {{subs.*xzr,|cmp}} x2, #0 +; CHECK-NEXT: csel +; CHECK-NEXT: ret +define i64 @cbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp ne i64 %c, 0 + br i1 %tobool, label %ne_bb, label %done + +ne_bb: + br label %done + 
+done: + %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: tbnz_32 +; CHECK: {{ands.*xzr,|tst}} w2, #0x80 +; CHECK-NEXT: csel +; CHECK-NEXT: ret +define i32 @tbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %mask = and i32 %c, 128 + %tobool = icmp eq i32 %mask, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: tbnz_64 +; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000 +; CHECK-NEXT: csel +; CHECK-NEXT: ret +define i64 @tbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %mask = and i64 %c, 9223372036854775808 + %tobool = icmp eq i64 %mask, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: tbz_32 +; CHECK: {{ands.*xzr,|tst}} w2, #0x80 +; CHECK-NEXT: csel +; CHECK-NEXT: ret +define i32 @tbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %mask = and i32 %c, 128 + %tobool = icmp ne i32 %mask, 0 + br i1 %tobool, label %ne_bb, label %done + +ne_bb: + br label %done + +done: + %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: tbz_64 +; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000 +; CHECK-NEXT: csel +; CHECK-NEXT: ret +define i64 @tbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %mask = and i64 %c, 9223372036854775808 + %tobool = icmp ne i64 %mask, 0 + br i1 %tobool, label %ne_bb, label %done + +ne_bb: + br label %done + +done: + %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ] + ret i64 %cond +} + +; This function from 175.vpr folds an ADDWri into a CSINC. +; Remember to clear the kill flag on the ADDWri. +define i32 @get_ytrack_to_xtracks() nounwind ssp { +entry: + br label %for.body + +for.body: + %x0 = load i32* undef, align 4 + br i1 undef, label %if.then.i146, label %is_sbox.exit155 + +if.then.i146: + %add8.i143 = add nsw i32 0, %x0 + %rem.i144 = srem i32 %add8.i143, %x0 + %add9.i145 = add i32 %rem.i144, 1 + br label %is_sbox.exit155 + +is_sbox.exit155: ; preds = %if.then.i146, %for.body + %seg_offset.0.i151 = phi i32 [ %add9.i145, %if.then.i146 ], [ undef, %for.body ] + %idxprom15.i152 = sext i32 %seg_offset.0.i151 to i64 + %arrayidx18.i154 = getelementptr inbounds i32* null, i64 %idxprom15.i152 + %x1 = load i32* %arrayidx18.i154, align 4 + br i1 undef, label %for.body51, label %for.body + +for.body51: ; preds = %is_sbox.exit155 + call fastcc void @get_switch_type(i32 %x1, i32 undef, i16 signext undef, i16 signext undef, i16* undef) + unreachable +} +declare fastcc void @get_switch_type(i32, i32, i16 signext, i16 signext, i16* nocapture) nounwind ssp + +define i32 @instcombine_dont_form_muladd(i16* %tmp40, i16* %tmp43, i32 %conv227, + i32 %conv219) { +entry: +br label %for.body231 + +for.body231: + %p.1754 = phi i16* [ %tmp40, %entry ], [ %incdec.ptr263, %for.body231 ] + %r.1753 = phi i16* [ %tmp43, %entry ], [ %incdec.ptr257, %for.body231 ] + %c.0752 = phi i32 [ 0, %entry ], [ %phitmp712, %for.body231 ] + %k.2751 = phi i32 [ %conv227, %entry ], [ %dec265, %for.body231 ] + %incdec.ptr233 = getelementptr i16* %r.1753, i64 1 + %tmp47 = load i16* %r.1753, align 2 + %conv234 = zext i16 %tmp47 to i32 + %mul235 = mul i32 %conv234, %conv219 + %add237 = add i32 %mul235, %c.0752 + %conv238 = trunc i32 %add237 to i16 + %incdec.ptr239 = getelementptr i16* %p.1754, i64 1 + store i16 %conv238, i16* %p.1754, align 2 + %incdec.ptr241 = getelementptr i16* %r.1753, i64 2 + %tmp48 = load i16* 
%incdec.ptr233, align 2 + %conv242 = zext i16 %tmp48 to i32 + %mul243 = mul i32 %conv242, %conv219 + %shr244 = lshr i32 %add237, 16 + %add245 = add i32 %shr244, %mul243 + %conv246 = trunc i32 %add245 to i16 + %incdec.ptr247 = getelementptr i16* %p.1754, i64 2 + store i16 %conv246, i16* %incdec.ptr239, align 2 + %incdec.ptr249 = getelementptr i16* %r.1753, i64 3 + %tmp49 = load i16* %incdec.ptr241, align 2 + %conv250 = zext i16 %tmp49 to i32 + %mul251 = mul i32 %conv250, %conv219 + %shr252 = lshr i32 %add245, 16 + %add253 = add i32 %shr252, %mul251 + %conv254 = trunc i32 %add253 to i16 + %incdec.ptr255 = getelementptr i16* %p.1754, i64 3 + store i16 %conv254, i16* %incdec.ptr247, align 2 + %incdec.ptr257 = getelementptr i16* %r.1753, i64 4 + %tmp50 = load i16* %incdec.ptr249, align 2 + %conv258 = zext i16 %tmp50 to i32 + %mul259 = mul i32 %conv258, %conv219 + %shr260 = lshr i32 %add253, 16 + %add261 = add i32 %shr260, %mul259 + %conv262 = trunc i32 %add261 to i16 + %incdec.ptr263 = getelementptr i16* %p.1754, i64 4 + store i16 %conv262, i16* %incdec.ptr255, align 2 + %dec265 = add i32 %k.2751, -1 + %phitmp712 = lshr i32 %add261, 16 + %cmp229 = icmp eq i32 %dec265, 0 + br i1 %cmp229, label %return, label %for.body231 + +return: + ret i32 %phitmp712 +} + +define i32 @instcombine_form_muladd(double * %d, double %d2, i16* %tmp40, + i16* %tmp43, i32 %conv227, i32 %conv219) { +entry: +br label %for.body231 + +for.body231: + %p.1754 = phi i16* [ %tmp40, %entry ], [ %incdec.ptr263, %for.body231 ] + %r.1753 = phi i16* [ %tmp43, %entry ], [ %incdec.ptr257, %for.body231 ] + %d.1 = phi double* [ %d, %entry ], [ %incdec.d, %for.body231 ] + %c.0752 = phi i32 [ 0, %entry ], [ %phitmp712, %for.body231 ] + %k.2751 = phi i32 [ %conv227, %entry ], [ %dec265, %for.body231 ] + + %d.2 = getelementptr double* %d.1, i64 1 + %d.load = load double* %d.1, align 8 + %d.load2 = load double* %d.2, align 8 + %d.div = fdiv double %d.load, %d.load2 + %d.mul = fmul double %d.div, %d2 + store double %d.mul, double* %d.2 + + %incdec.ptr233 = getelementptr i16* %r.1753, i64 1 + %tmp47 = load i16* %r.1753, align 2 + %conv234 = zext i16 %tmp47 to i32 + %mul235 = mul i32 %conv234, %conv219 + %add237 = add i32 %mul235, %c.0752 + +; Combine the multiply add sequence above. We don't lengthen the +; critical path. The floating point dependence chain is longer than the +; multiply-add chain. 
+; CHECK-LABEL: instcombine_form_muladd +; CHECK: madd + + + %conv238 = trunc i32 %add237 to i16 + %incdec.ptr239 = getelementptr i16* %p.1754, i64 1 + store i16 %conv238, i16* %p.1754, align 2 + %incdec.ptr241 = getelementptr i16* %r.1753, i64 2 + %tmp48 = load i16* %incdec.ptr233, align 2 + %conv242 = zext i16 %tmp48 to i32 + %mul243 = mul i32 %conv242, %conv219 + %shr244 = lshr i32 %add237, 16 + %add245 = add i32 %shr244, %mul243 + %conv246 = trunc i32 %add245 to i16 + %incdec.ptr247 = getelementptr i16* %p.1754, i64 2 + store i16 %conv246, i16* %incdec.ptr239, align 2 + %incdec.ptr249 = getelementptr i16* %r.1753, i64 3 + %tmp49 = load i16* %incdec.ptr241, align 2 + %conv250 = zext i16 %tmp49 to i32 + %mul251 = mul i32 %conv250, %conv219 + %shr252 = lshr i32 %add245, 16 + %add253 = add i32 %shr252, %mul251 + %conv254 = trunc i32 %add253 to i16 + %incdec.ptr255 = getelementptr i16* %p.1754, i64 3 + store i16 %conv254, i16* %incdec.ptr247, align 2 + %incdec.ptr257 = getelementptr i16* %r.1753, i64 4 + %tmp50 = load i16* %incdec.ptr249, align 2 + %conv258 = zext i16 %tmp50 to i32 + %mul259 = mul i32 %conv258, %conv219 + %shr260 = lshr i32 %add253, 16 + %add261 = add i32 %shr260, %mul259 + %conv262 = trunc i32 %add261 to i16 + %incdec.ptr263 = getelementptr i16* %p.1754, i64 4 + store i16 %conv262, i16* %incdec.ptr255, align 2 + %dec265 = add i32 %k.2751, -1 + %phitmp712 = lshr i32 %add261, 16 + %cmp229 = icmp eq i32 %dec265, 0 + %incdec.d = getelementptr double* %d.1, i64 2 + br i1 %cmp229, label %return, label %for.body231 + +return: + ret i32 %phitmp712 +}
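Note on the two tests above: the comment in instcombine_form_muladd states the profitability rule they exercise. The mul/add pair is rewritten into a madd only when the combined instruction does not lengthen the critical path of the surrounding trace; here the floating-point divide/multiply chain is the longer dependence chain, so the rewrite is expected. The sketch below only illustrates that comparison and is not part of the patch; the function and parameter names (doesNotLengthenCriticalPath, NewRootDepth, NewRootLatency, RootDepth, RootLatency, RootSlack) are hypothetical.

// Illustrative sketch only (hypothetical names, values assumed to come from
// trace depth and scheduling-model latency queries). The combined
// instruction is acceptable when its result becomes available no later than
// the result of the original sequence, allowing for any slack before the
// next instruction on the critical path.
static bool doesNotLengthenCriticalPath(unsigned NewRootDepth,
                                        unsigned NewRootLatency,
                                        unsigned RootDepth,
                                        unsigned RootLatency,
                                        unsigned RootSlack) {
  // Cycle at which the new (combined) root's result is ready.
  unsigned NewRootCycle = NewRootDepth + NewRootLatency;
  // Cycle at which the original root's result is ready, plus available slack.
  unsigned OldRootCycle = RootDepth + RootLatency + RootSlack;
  return NewRootCycle <= OldRootCycle;
}

By the same rule, instcombine_dont_form_muladd is presumably the case where the multiply-add chain itself is the critical path, so no madd should be formed there.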