Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -22,6 +22,7 @@
 #include "SIInstrInfo.h"
 #include "SIISelLowering.h"
 #include "SIFrameLowering.h"
+#include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
@@ -317,6 +318,11 @@
   /// the given LDS memory size is the only constraint.
   unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
 
+  unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
+    const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    return getOccupancyWithLocalMemSize(MFI->getLDSSize(), *MF.getFunction());
+  }
+
   bool hasFP16Denormals() const {
     return FP64FP16Denormals;
   }
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -24,6 +24,7 @@
 #endif
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
+#include "GCNIterativeScheduler.h"
 #include "GCNSchedStrategy.h"
 #include "R600MachineScheduler.h"
 #include "SIMachineScheduler.h"
@@ -155,6 +156,20 @@
   return DAG;
 }
 
+static ScheduleDAGInstrs *
+createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+  auto DAG = new GCNIterativeScheduler(C,
+    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
+  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+  return DAG;
+}
+
+static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
+  return new GCNIterativeScheduler(C,
+    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
+}
+
 static MachineSchedRegistry
 R600SchedRegistry("r600", "Run R600's custom scheduler",
                   createR600MachineScheduler);
@@ -168,6 +183,16 @@
                              "Run GCN scheduler to maximize occupancy",
                              createGCNMaxOccupancyMachineScheduler);
 
+static MachineSchedRegistry
+IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
+  "Run GCN scheduler to maximize occupancy (experimental)",
+  createIterativeGCNMaxOccupancyMachineScheduler);
+
+static MachineSchedRegistry
+GCNMinRegSchedRegistry("gcn-minreg",
+  "Run GCN iterative scheduler for minimal register usage (experimental)",
+  createMinRegScheduler);
+
 static StringRef computeDataLayout(const Triple &TT) {
   if (TT.getArch() == Triple::r600) {
     // 32-bit pointers.
Index: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
@@ -94,6 +94,9 @@
   SIShrinkInstructions.cpp
   SITypeRewriter.cpp
   SIWholeQuadMode.cpp
+  GCNIterativeScheduler.cpp
+  GCNMinRegStrategy.cpp
+  GCNRegPressure.cpp
  ${GLOBAL_ISEL_BUILD_FILES}
  )
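
The registries above hook the new schedulers into the generic -misched machine-scheduler switch, so no further plumbing is needed to try them. For illustration, the invocations mirror the RUN lines added to the tests at the end of this patch (input.ll is a placeholder file name):

  llc -march=amdgcn -mcpu=tonga -misched=gcn-minreg input.ll
  llc -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental input.ll
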
Index: llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.h
+++ llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -0,0 +1,118 @@
+//===--------- GCNIterativeScheduler.h - GCN Scheduler -*- C++ -*----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
+
+#include "GCNRegPressure.h"
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class GCNIterativeScheduler : public ScheduleDAGMILive {
+  typedef ScheduleDAGMILive BaseClass;
+public:
+  enum StrategyKind {
+    SCHEDULE_MINREGONLY,
+    SCHEDULE_MINREGFORCED,
+    SCHEDULE_LEGACYMAXOCCUPANCY
+  };
+
+  GCNIterativeScheduler(MachineSchedContext *C,
+                        StrategyKind S);
+
+  void schedule() override;
+
+  void enterRegion(MachineBasicBlock *BB,
+                   MachineBasicBlock::iterator Begin,
+                   MachineBasicBlock::iterator End,
+                   unsigned RegionInstrs) override;
+
+  void finalizeSchedule() override;
+
+protected:
+
+  typedef ArrayRef<const SUnit*> ScheduleRef;
+
+  struct TentativeSchedule {
+    std::vector<MachineInstr*> Schedule;
+    GCNRegPressure MaxPressure;
+  };
+
+  struct Region {
+    // Fields except for BestSchedule are supposed to reflect the current IR
+    // state. `const` fields are to emphasize they shouldn't change for any
+    // schedule.
+    MachineBasicBlock::iterator Begin;
+    // End is either a boundary instruction or end of basic block
+    const MachineBasicBlock::iterator End;
+    const unsigned NumRegionInstrs;
+    GCNRegPressure MaxPressure;
+
+    // best schedule for the region so far (not scheduled yet)
+    std::unique_ptr<TentativeSchedule> BestSchedule;
+  };
+
+  SpecificBumpPtrAllocator<Region> Alloc;
+  std::vector<Region*> Regions;
+
+  MachineSchedContext *Context;
+  const StrategyKind Strategy;
+  mutable GCNUpwardRPTracker UPTracker;
+
+  class BuildDAG;
+  class OverrideLegacyStrategy;
+
+  template <typename Range>
+  GCNRegPressure getSchedulePressure(const Region &R,
+                                     Range &&Schedule) const;
+
+  GCNRegPressure getRegionPressure(MachineBasicBlock::iterator Begin,
+                                   MachineBasicBlock::iterator End) const;
+
+  GCNRegPressure getRegionPressure(const Region &R) const {
+    return getRegionPressure(R.Begin, R.End);
+  }
+
+  void setBestSchedule(Region &R,
+                       ScheduleRef Schedule,
+                       const GCNRegPressure &MaxRP = GCNRegPressure());
+
+  void scheduleBest(Region &R);
+
+  std::vector<MachineInstr*> detachSchedule(ScheduleRef Schedule) const;
+
+  void sortRegionsByPressure(unsigned TargetOcc);
+
+  template <typename Range>
+  void scheduleRegion(Region &R, Range &&Schedule,
+                      const GCNRegPressure &MaxRP = GCNRegPressure());
+
+  unsigned tryMaximizeOccupancy(unsigned TargetOcc =
+                                std::numeric_limits<unsigned>::max());
+
+  void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true);
+  void scheduleMinReg(bool force = false);
+
+  void printRegions(raw_ostream &OS) const;
+  void printSchedResult(raw_ostream &OS,
+                        const Region *R,
+                        const GCNRegPressure &RP) const;
+  void printSchedRP(raw_ostream &OS,
+                    const GCNRegPressure &Before,
+                    const GCNRegPressure &After) const;
+};
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
Index: llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ llvm/trunk/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -0,0 +1,528 @@
+//===--------------------- GCNIterativeScheduler.cpp - --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNIterativeScheduler.h"
+#include "GCNSchedStrategy.h"
+#include "SIMachineFunctionInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+namespace llvm {
+  std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
+                                               const ScheduleDAG &DAG);
+}
+
+// Shim accessors for the different schedule containers.
+static inline MachineInstr *getMachineInstr(MachineInstr *MI) {
+  return MI;
+}
+static inline MachineInstr *getMachineInstr(const SUnit *SU) {
+  return SU->getInstr();
+}
+static inline MachineInstr *getMachineInstr(const SUnit &SU) {
+  return SU.getInstr();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+static void printRegion(raw_ostream &OS,
+                        MachineBasicBlock::iterator Begin,
+                        MachineBasicBlock::iterator End,
+                        const LiveIntervals *LIS,
+                        unsigned MaxInstNum =
+                          std::numeric_limits<unsigned>::max()) {
+  auto BB = Begin->getParent();
+  OS << BB->getParent()->getName() << ":BB#" << BB->getNumber()
+     << ' ' << BB->getName() << ":\n";
+  auto I = Begin;
+  MaxInstNum = std::max(MaxInstNum, 1u);
+  for (; I != End && MaxInstNum; ++I, --MaxInstNum) {
+    if (!I->isDebugValue() && LIS)
+      OS << LIS->getInstructionIndex(*I);
+    OS << '\t' << *I;
+  }
+  if (I != End) {
+    OS << "\t...\n";
+    I = std::prev(End);
+    if (!I->isDebugValue() && LIS)
+      OS << LIS->getInstructionIndex(*I);
+    OS << '\t' << *I;
+  }
+  if (End != BB->end()) { // print boundary inst if present
+    OS << "----\n";
+    if (LIS) OS << LIS->getInstructionIndex(*End) << '\t';
+    OS << *End;
+  }
+}
+
+LLVM_DUMP_METHOD
+static void printLivenessInfo(raw_ostream &OS,
+                              MachineBasicBlock::iterator Begin,
+                              MachineBasicBlock::iterator End,
+                              const LiveIntervals *LIS) {
+  const auto BB = Begin->getParent();
+  const auto &MRI = BB->getParent()->getRegInfo();
+
+  const auto LiveIns = getLiveRegsBefore(*Begin, *LIS);
+  OS << "LIn RP: ";
+  getRegPressure(MRI, LiveIns).print(OS);
+
+  const auto BottomMI = End == BB->end() ? std::prev(End) : End;
+  const auto LiveOuts = getLiveRegsAfter(*BottomMI, *LIS);
+  OS << "LOt RP: ";
+  getRegPressure(MRI, LiveOuts).print(OS);
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printRegions(raw_ostream &OS) const {
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+  for (const auto R : Regions) {
+    OS << "Region to schedule ";
+    printRegion(OS, R->Begin, R->End, LIS, 1);
+    printLivenessInfo(OS, R->Begin, R->End, LIS);
+    OS << "Max RP: ";
+    R->MaxPressure.print(OS, &ST);
+  }
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printSchedResult(raw_ostream &OS,
+                                             const Region *R,
+                                             const GCNRegPressure &RP) const {
+  OS << "\nAfter scheduling ";
+  printRegion(OS, R->Begin, R->End, LIS);
+  printSchedRP(OS, R->MaxPressure, RP);
+  OS << '\n';
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
+                                         const GCNRegPressure &Before,
+                                         const GCNRegPressure &After) const {
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+  OS << "RP before: ";
+  Before.print(OS, &ST);
+  OS << "RP after: ";
+  After.print(OS, &ST);
+}
+
+#endif
+
+// DAG builder helper
+class GCNIterativeScheduler::BuildDAG {
+  GCNIterativeScheduler &Sch;
+  SmallVector<SUnit*, 8> TopRoots;
+public:
+  BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
+    : Sch(_Sch) {
+    auto BB = R.Begin->getParent();
+    Sch.BaseClass::startBlock(BB);
+    Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
+
+    Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
+                        /*TrackLaneMask*/true);
+    Sch.Topo.InitDAGTopologicalSorting();
+
+    SmallVector<SUnit*, 8> BotRoots;
+    Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
+  }
+  ~BuildDAG() {
+    Sch.BaseClass::exitRegion();
+    Sch.BaseClass::finishBlock();
+  }
+  ArrayRef<const SUnit*> getTopRoots() const {
+    return TopRoots;
+  }
+};
+
+class GCNIterativeScheduler::OverrideLegacyStrategy {
+  GCNIterativeScheduler &Sch;
+  Region &Rgn;
+  std::unique_ptr<MachineSchedStrategy> SaveSchedImpl;
+  GCNRegPressure SaveMaxRP;
+public:
+  OverrideLegacyStrategy(Region &R,
+                         MachineSchedStrategy &OverrideStrategy,
+                         GCNIterativeScheduler &_Sch)
+    : Sch(_Sch)
+    , Rgn(R)
+    , SaveSchedImpl(std::move(_Sch.SchedImpl))
+    , SaveMaxRP(R.MaxPressure) {
+    Sch.SchedImpl.reset(&OverrideStrategy);
+    auto BB = R.Begin->getParent();
+    Sch.BaseClass::startBlock(BB);
+    Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
+  }
+  ~OverrideLegacyStrategy() {
+    Sch.BaseClass::exitRegion();
+    Sch.BaseClass::finishBlock();
+    Sch.SchedImpl.release();
+    Sch.SchedImpl = std::move(SaveSchedImpl);
+  }
+  void schedule() {
+    assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
+    DEBUG(dbgs() << "\nScheduling ";
+          printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
+    Sch.BaseClass::schedule();
+
+    // Unfortunately placeDebugValues incorrectly modifies RegionEnd, restore
+    Sch.RegionEnd = Rgn.End;
+    //assert(Rgn.End == Sch.RegionEnd);
+    Rgn.Begin = Sch.RegionBegin;
+    Rgn.MaxPressure.clear();
+  }
+  void restoreOrder() {
+    assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
+    // DAG SUnits are stored using original region's order
+    // so just use SUnits as the restoring schedule
+    Sch.scheduleRegion(Rgn, Sch.SUnits, SaveMaxRP);
+  }
+};
+
+// just a stub to make the base class happy
+class SchedStrategyStub : public MachineSchedStrategy {
+public:
+  bool shouldTrackPressure() const override { return false; }
+  bool shouldTrackLaneMasks() const override { return false; }
+  void initialize(ScheduleDAGMI *DAG) override {}
+  SUnit *pickNode(bool &IsTopNode) override { return nullptr; }
+  void schedNode(SUnit *SU, bool IsTopNode) override {}
+  void releaseTopNode(SUnit *SU) override {}
+  void releaseBottomNode(SUnit *SU) override {}
+};
+
+GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
+                                             StrategyKind S)
+  : BaseClass(C, llvm::make_unique<SchedStrategyStub>())
+  , Context(C)
+  , Strategy(S)
+  , UPTracker(*LIS) {
+}
+
+// returns max pressure for a region
+GCNRegPressure
+GCNIterativeScheduler::getRegionPressure(MachineBasicBlock::iterator Begin,
+                                         MachineBasicBlock::iterator End)
+  const {
+  // For the purpose of pressure tracking the bottom inst of the region should
+  // also be processed. End is either BB end, BB terminator inst or sched
+  // boundary inst.
+  auto const BBEnd = Begin->getParent()->end();
+  auto const BottomMI = End == BBEnd ? std::prev(End) : End;
+
+  // scheduleRegions walks bottom to top, so it's likely we just get the next
+  // instruction to track
+  auto AfterBottomMI = std::next(BottomMI);
+  if (AfterBottomMI == BBEnd ||
+      &*AfterBottomMI != UPTracker.getLastTrackedMI()) {
+    UPTracker.reset(*BottomMI);
+  } else {
+    assert(UPTracker.isValid());
+  }
+
+  for (auto I = BottomMI; I != Begin; --I)
+    UPTracker.recede(*I);
+
+  UPTracker.recede(*Begin);
+
+  assert(UPTracker.isValid() ||
+         (dbgs() << "Tracked region ",
+          printRegion(dbgs(), Begin, End, LIS), false));
+  return UPTracker.moveMaxPressure();
+}
+
+// returns max pressure for a tentative schedule
+template <typename Range> GCNRegPressure
+GCNIterativeScheduler::getSchedulePressure(const Region &R,
+                                           Range &&Schedule) const {
+  auto const BBEnd = R.Begin->getParent()->end();
+  GCNUpwardRPTracker RPTracker(*LIS);
+  if (R.End != BBEnd) {
+    // R.End points to the boundary instruction but the
+    // schedule doesn't include it
+    RPTracker.reset(*R.End);
+    RPTracker.recede(*R.End);
+  } else {
+    // R.End doesn't point to the boundary instruction
+    RPTracker.reset(*std::prev(BBEnd));
+  }
+  for (auto I = Schedule.end(), B = Schedule.begin(); I != B;) {
+    RPTracker.recede(*getMachineInstr(*--I));
+  }
+  return RPTracker.moveMaxPressure();
+}
+
+void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overridden
+                                        MachineBasicBlock::iterator Begin,
+                                        MachineBasicBlock::iterator End,
+                                        unsigned NumRegionInstrs) {
+  BaseClass::enterRegion(BB, Begin, End, NumRegionInstrs);
+  if (NumRegionInstrs > 2) {
+    Regions.push_back(
+      new (Alloc.Allocate())
+      Region { Begin, End, NumRegionInstrs,
+               getRegionPressure(Begin, End), nullptr });
+  }
+}
+
+void GCNIterativeScheduler::schedule() { // overridden
+  // do nothing
+  DEBUG(
+    printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
+    if (!Regions.empty() && Regions.back()->Begin == RegionBegin) {
+      dbgs() << "Max RP: ";
+      Regions.back()->MaxPressure.print(dbgs(),
+                                        &MF.getSubtarget<SISubtarget>());
+    }
+    dbgs() << '\n';
+  );
+}
+
+void GCNIterativeScheduler::finalizeSchedule() { // overridden
+  if (Regions.empty())
+    return;
+  switch (Strategy) {
+  case SCHEDULE_MINREGONLY: scheduleMinReg(); break;
+  case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break;
+  case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break;
+  }
+}
+
+// Detach schedule from SUnits and interleave it with debug values.
+// Returned schedule becomes independent of DAG state.
+std::vector<MachineInstr*>
+GCNIterativeScheduler::detachSchedule(ScheduleRef Schedule) const {
+  std::vector<MachineInstr*> Res;
+  Res.reserve(Schedule.size() * 2);
+
+  if (FirstDbgValue)
+    Res.push_back(FirstDbgValue);
+
+  const auto DbgB = DbgValues.begin(), DbgE = DbgValues.end();
+  for (auto SU : Schedule) {
+    Res.push_back(SU->getInstr());
+    const auto &D = std::find_if(DbgB, DbgE, [SU](decltype(*DbgB) &P) {
+      return P.second == SU->getInstr();
+    });
+    if (D != DbgE)
+      Res.push_back(D->first);
+  }
+  return Res;
+}
+
+void GCNIterativeScheduler::setBestSchedule(Region &R,
+                                            ScheduleRef Schedule,
+                                            const GCNRegPressure &MaxRP) {
+  R.BestSchedule.reset(
+    new TentativeSchedule{ detachSchedule(Schedule), MaxRP });
+}
+
+void GCNIterativeScheduler::scheduleBest(Region &R) {
+  assert(R.BestSchedule.get() && "No schedule specified");
+  scheduleRegion(R, R.BestSchedule->Schedule, R.BestSchedule->MaxPressure);
+  R.BestSchedule.reset();
+}
+
+// minimal required region scheduler, works for ranges of SUnit*,
+// SUnits or MachineInstr*
+template <typename Range>
+void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
+                                           const GCNRegPressure &MaxRP) {
+  assert(RegionBegin == R.Begin && RegionEnd == R.End);
+  assert(LIS != nullptr);
+#ifndef NDEBUG
+  const auto SchedMaxRP = getSchedulePressure(R, Schedule);
+#endif
+  auto BB = R.Begin->getParent();
+  auto Top = R.Begin;
+  for (const auto &I : Schedule) {
+    auto MI = getMachineInstr(I);
+    if (MI != &*Top) {
+      BB->remove(MI);
+      BB->insert(Top, MI);
+      if (!MI->isDebugValue())
+        LIS->handleMove(*MI, true);
+    }
+    if (!MI->isDebugValue()) {
+      // Reset read-undef flags and update them later.
+      for (auto &Op : MI->operands())
+        if (Op.isReg() && Op.isDef())
+          Op.setIsUndef(false);
+
+      RegisterOperands RegOpers;
+      RegOpers.collect(*MI, *TRI, MRI, /*ShouldTrackLaneMasks*/true,
+                                       /*IgnoreDead*/false);
+      // Adjust liveness and add missing dead+read-undef flags.
+      auto SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+      RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+    }
+    Top = std::next(MI->getIterator());
+  }
+  RegionBegin = getMachineInstr(Schedule.front());
+
+  // Schedule consisting of MachineInstr* is considered 'detached'
+  // and already interleaved with debug values
+  if (!std::is_same<decltype(*Schedule.begin()), MachineInstr*>::value) {
+    placeDebugValues();
+    // Unfortunately placeDebugValues incorrectly modifies RegionEnd, restore
+    //assert(R.End == RegionEnd);
+    RegionEnd = R.End;
+  }
+
+  R.Begin = RegionBegin;
+  R.MaxPressure = MaxRP;
+
+#ifndef NDEBUG
+  const auto RegionMaxRP = getRegionPressure(R);
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+#endif
+  assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP))
+         || (dbgs() << "Max RP mismatch!!!\n"
+                       "RP for schedule (calculated): ",
+             SchedMaxRP.print(dbgs(), &ST),
+             dbgs() << "RP for schedule (reported): ",
+             MaxRP.print(dbgs(), &ST),
+             dbgs() << "RP after scheduling: ",
+             RegionMaxRP.print(dbgs(), &ST),
+             false));
+}
+
+// Sort recorded regions by pressure - highest at the front
+void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+  std::sort(Regions.begin(), Regions.end(),
+    [&ST, TargetOcc](const Region *R1, const Region *R2) {
+      return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc);
+    });
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Legacy MaxOccupancy Strategy
+
+// Tries to increase occupancy by applying the minreg scheduler to a sequence
+// of the most demanding regions. Obtained schedules are saved as BestSchedule
+// for a region.
+// TargetOcc is the best achievable occupancy for a kernel.
+// Returns the better occupancy on success or the current occupancy on failure.
+// BestSchedules aren't deleted on failure.
+unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
+  // TODO: assert Regions are sorted descending by pressure
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+  const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+  DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
+               << ", current = " << Occ << '\n');
+
+  auto NewOcc = TargetOcc;
+  for (auto R : Regions) {
+    if (R->MaxPressure.getOccupancy(ST) >= NewOcc)
+      break;
+
+    DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
+          printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
+
+    BuildDAG DAG(*R, *this);
+    const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
+    const auto MaxRP = getSchedulePressure(*R, MinSchedule);
+    DEBUG(dbgs() << "Occupancy improvement attempt:\n";
+          printSchedRP(dbgs(), R->MaxPressure, MaxRP));
+
+    NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
+    if (NewOcc <= Occ)
+      break;
+
+    setBestSchedule(*R, MinSchedule, MaxRP);
+  }
+  DEBUG(dbgs() << "New occupancy = " << NewOcc
+               << ", prev occupancy = " << Occ << '\n');
+  return std::max(NewOcc, Occ);
+}
+
+void GCNIterativeScheduler::scheduleLegacyMaxOccupancy(
+  bool TryMaximizeOccupancy) {
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+  auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+
+  sortRegionsByPressure(TgtOcc);
+  auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+
+  if (TryMaximizeOccupancy && Occ < TgtOcc)
+    Occ = tryMaximizeOccupancy(TgtOcc);
+
+  // This is really weird, but for some magic reason scheduling the regions
+  // twice gives a performance improvement
+  const int NumPasses = Occ < TgtOcc ? 2 : 1;
+
+  TgtOcc = std::min(Occ, TgtOcc);
+  DEBUG(dbgs() << "Scheduling using default scheduler, "
+                  "target occupancy = " << TgtOcc << '\n');
+  GCNMaxOccupancySchedStrategy LStrgy(Context);
+
+  for (int I = 0; I < NumPasses; ++I) {
+    // running the first pass with TargetOccupancy = 0 mimics the previous
+    // scheduling approach and is performance magic
+    LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc);
+    for (auto R : Regions) {
+      OverrideLegacyStrategy Ovr(*R, LStrgy, *this);
+
+      Ovr.schedule();
+      const auto RP = getRegionPressure(*R);
+      DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+
+      if (RP.getOccupancy(ST) < TgtOcc) {
+        DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+        if (R->BestSchedule.get() &&
+            R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
+          DEBUG(dbgs() << ", scheduling minimal register\n");
+          scheduleBest(*R);
+        } else {
+          DEBUG(dbgs() << ", restoring\n");
+          Ovr.restoreOrder();
+          assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
+        }
+      }
+    }
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Minimal Register Strategy
+
+void GCNIterativeScheduler::scheduleMinReg(bool force) {
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+  const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+  sortRegionsByPressure(TgtOcc);
+
+  auto MaxPressure = Regions.front()->MaxPressure;
+  for (auto R : Regions) {
+    if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc))
+      break;
+
+    BuildDAG DAG(*R, *this);
+    const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
+
+    const auto RP = getSchedulePressure(*R, MinSchedule);
+    DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
+      dbgs() << "\nWarning: Pressure becomes worse after minreg!";
+      printSchedRP(dbgs(), R->MaxPressure, RP);
+    });
+
+    if (!force && MaxPressure.less(ST, RP, TgtOcc))
+      break;
+
+    scheduleRegion(*R, MinSchedule, RP);
+    DEBUG(printSchedResult(dbgs(), R, RP));
+
+    MaxPressure = RP;
+  }
+}
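
Both strategies above price a region or a tentative schedule through GCNUpwardRPTracker. A minimal sketch of the intended driving pattern, condensed from getRegionPressure (computeMaxRP and the free-standing form are illustrative, not part of the patch):

  // Walk a region bottom-up and return its maximum register pressure.
  static GCNRegPressure computeMaxRP(MachineBasicBlock::iterator Begin,
                                     MachineBasicBlock::iterator End,
                                     const LiveIntervals &LIS) {
    GCNUpwardRPTracker RPT(LIS);
    // Seed the live set just below the bottom instruction from LiveIntervals.
    auto BottomMI = End == Begin->getParent()->end() ? std::prev(End) : End;
    RPT.reset(*BottomMI);
    for (auto I = BottomMI; I != Begin; --I)
      RPT.recede(*I);                // update the live set moving upward
    RPT.recede(*Begin);
    return RPT.moveMaxPressure();    // returns and clears the running maximum
  }
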
Index: llvm/trunk/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ llvm/trunk/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -0,0 +1,266 @@
+//===----------------------- GCNMinRegStrategy.cpp - ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+class GCNMinRegScheduler {
+  struct Candidate : ilist_node<Candidate> {
+    const SUnit *SU;
+    int Priority;
+
+    Candidate(const SUnit *SU_, int Priority_ = 0)
+      : SU(SU_), Priority(Priority_) {}
+  };
+
+  SpecificBumpPtrAllocator<Candidate> Alloc;
+  typedef simple_ilist<Candidate> Queue;
+  Queue RQ; // Ready queue
+
+  std::vector<unsigned> NumPreds;
+
+  bool isScheduled(const SUnit *SU) const {
+    assert(!SU->isBoundaryNode());
+    return NumPreds[SU->NodeNum] == std::numeric_limits<unsigned>::max();
+  }
+
+  void setIsScheduled(const SUnit *SU) {
+    assert(!SU->isBoundaryNode());
+    NumPreds[SU->NodeNum] = std::numeric_limits<unsigned>::max();
+  }
+
+  unsigned getNumPreds(const SUnit *SU) const {
+    assert(!SU->isBoundaryNode());
+    assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
+    return NumPreds[SU->NodeNum];
+  }
+
+  unsigned decNumPreds(const SUnit *SU) {
+    assert(!SU->isBoundaryNode());
+    assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
+    return --NumPreds[SU->NodeNum];
+  }
+
+  void initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits);
+
+  int getReadySuccessors(const SUnit *SU) const;
+  int getNotReadySuccessors(const SUnit *SU) const;
+
+  template <typename Calc>
+  unsigned findMax(unsigned Num, Calc C);
+
+  Candidate* pickCandidate();
+
+  void bumpPredsPriority(const SUnit *SchedSU, int Priority);
+  void releaseSuccessors(const SUnit* SU, int Priority);
+
+public:
+  std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots,
+                                     const ScheduleDAG &DAG);
+};
+
+void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) {
+  NumPreds.resize(SUnits.size());
+  for (unsigned I = 0; I < SUnits.size(); ++I)
+    NumPreds[I] = SUnits[I].NumPredsLeft;
+}
+
+int GCNMinRegScheduler::getReadySuccessors(const SUnit *SU) const {
+  unsigned NumSchedSuccs = 0;
+  for (auto SDep : SU->Succs) {
+    bool wouldBeScheduled = true;
+    for (auto PDep : SDep.getSUnit()->Preds) {
+      auto PSU = PDep.getSUnit();
+      assert(!PSU->isBoundaryNode());
+      if (PSU != SU && !isScheduled(PSU)) {
+        wouldBeScheduled = false;
+        break;
+      }
+    }
+    NumSchedSuccs += wouldBeScheduled ? 1 : 0;
+  }
+  return NumSchedSuccs;
+}
+
+int GCNMinRegScheduler::getNotReadySuccessors(const SUnit *SU) const {
+  return SU->Succs.size() - getReadySuccessors(SU);
+}
+
+template <typename Calc>
+unsigned GCNMinRegScheduler::findMax(unsigned Num, Calc C) {
+  assert(!RQ.empty() && Num <= RQ.size());
+  typedef decltype(C(*RQ.begin())) T;
+  T Max = std::numeric_limits<T>::min();
+  unsigned NumMax = 0;
+  for (auto I = RQ.begin(); Num; --Num) {
+    T Cur = C(*I);
+    if (Cur >= Max) {
+      if (Cur > Max) {
+        Max = Cur;
+        NumMax = 1;
+      } else
+        ++NumMax;
+      auto &Cand = *I++;
+      RQ.remove(Cand);
+      RQ.push_front(Cand);
+      continue;
+    }
+    ++I;
+  }
+  return NumMax;
+}
+
+GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() {
+  do {
+    unsigned Num = RQ.size();
+    if (Num == 1) break;
+
+    DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num
+                 << '\n');
+    Num = findMax(Num, [=](const Candidate &C) { return C.Priority; });
+    if (Num == 1) break;
+
+    DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among "
+                 << Num << '\n');
+    Num = findMax(Num, [=](const Candidate &C) {
+      auto SU = C.SU;
+      int Res = getNotReadySuccessors(SU);
+      DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would leave non-ready "
+                   << Res << " successors, metric = " << -Res << '\n');
+      return -Res;
+    });
+    if (Num == 1) break;
+
+    DEBUG(dbgs() << "\nSelecting most producing candidate among "
+                 << Num << '\n');
+    Num = findMax(Num, [=](const Candidate &C) {
+      auto SU = C.SU;
+      auto Res = getReadySuccessors(SU);
+      DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready "
+                   << Res << " successors, metric = " << Res << '\n');
+      return Res;
+    });
+    if (Num == 1) break;
+
+    Num = Num ? Num : RQ.size();
+    DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order"
+                    " among " << Num << '\n');
+    Num = findMax(Num, [=](const Candidate &C) {
+      return -(int64_t)C.SU->NodeNum;
+    });
+    assert(Num == 1);
+  } while (false);
+
+  return &RQ.front();
+}
+
+void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) {
+  SmallPtrSet<const SUnit*, 32> Set;
+  for (const auto &S : SchedSU->Succs) {
+    if (S.getSUnit()->isBoundaryNode() || isScheduled(S.getSUnit()) ||
+        S.getKind() != SDep::Data)
+      continue;
+    for (const auto &P : S.getSUnit()->Preds) {
+      auto PSU = P.getSUnit();
+      assert(!PSU->isBoundaryNode());
+      if (PSU != SchedSU && !isScheduled(PSU)) {
+        Set.insert(PSU);
+      }
+    }
+  }
+  SmallVector<const SUnit*, 32> Worklist(Set.begin(), Set.end());
+  while (!Worklist.empty()) {
+    auto SU = Worklist.pop_back_val();
+    assert(!SU->isBoundaryNode());
+    for (const auto &P : SU->Preds) {
+      if (!P.getSUnit()->isBoundaryNode() && !isScheduled(P.getSUnit()) &&
+          Set.insert(P.getSUnit()).second)
+        Worklist.push_back(P.getSUnit());
+    }
+  }
+  DEBUG(dbgs() << "Set the priority of the ready-queue predecessors of SU("
+               << SchedSU->NodeNum << ")'s non-ready successors to "
+               << Priority << ": ");
+  const auto SetEnd = Set.end();
+  for (auto &C : RQ) {
+    if (Set.find(C.SU) != SetEnd) {
+      C.Priority = Priority;
+      DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
+    }
+  }
+  DEBUG(dbgs() << '\n');
+}
+
+void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) {
+  for (const auto &S : SU->Succs) {
+    auto SuccSU = S.getSUnit();
+    if (S.isWeak())
+      continue;
+    assert(SuccSU->isBoundaryNode() || getNumPreds(SuccSU) > 0);
+    if (!SuccSU->isBoundaryNode() && decNumPreds(SuccSU) == 0)
+      RQ.push_front(*new (Alloc.Allocate()) Candidate(SuccSU, Priority));
+  }
+}
+
+std::vector<const SUnit*>
+GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots,
+                             const ScheduleDAG &DAG) {
+  const auto &SUnits = DAG.SUnits;
+  std::vector<const SUnit*> Schedule;
+  Schedule.reserve(SUnits.size());
+
+  initNumPreds(SUnits);
+
+  int StepNo = 0;
+
+  for (auto SU : TopRoots) {
+    RQ.push_back(*new (Alloc.Allocate()) Candidate(SU, StepNo));
+  }
+  releaseSuccessors(&DAG.EntrySU, StepNo);
+
+  while (!RQ.empty()) {
+    DEBUG(
+      dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n"
+                "Ready queue:";
+      for (auto &C : RQ)
+        dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')';
+      dbgs() << '\n';
+    );
+
+    auto C = pickCandidate();
+    assert(C);
+    RQ.remove(*C);
+    auto SU = C->SU;
+    DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+
+    releaseSuccessors(SU, StepNo);
+    Schedule.push_back(SU);
+    setIsScheduled(SU);
+
+    if (getReadySuccessors(SU) == 0)
+      bumpPredsPriority(SU, StepNo);
+
+    ++StepNo;
+  }
+  assert(SUnits.size() == Schedule.size());
+
+  return Schedule;
+}
+
+namespace llvm {
+std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
+                                             const ScheduleDAG &DAG) {
+  GCNMinRegScheduler S;
+  return S.schedule(TopRoots, DAG);
+}
+}
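
makeMinRegSchedule is this file's only entry point. Inside GCNIterativeScheduler it is driven through the BuildDAG RAII helper, which enters a region, builds the scheduling graph, and collects the top roots. A sketch of the calling pattern, condensed from tryMaximizeOccupancy above (R is a recorded Region, and the occupancy check is a simplification of the logic there):

  BuildDAG DAG(*R, *this);          // enter region, build graph (RAII)
  const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
  const auto MaxRP = getSchedulePressure(*R, MinSchedule);
  if (MaxRP.getOccupancy(ST) > Occ) // keep only if it improves occupancy
    setBestSchedule(*R, MinSchedule, MaxRP);
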
Index: llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.h
+++ llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.h
@@ -0,0 +1,170 @@
+//===---------------------- GCNRegPressure.h -*- C++ -*--------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
+
+#include "AMDGPUSubtarget.h"
+
+#include <limits>
+
+namespace llvm {
+
+struct GCNRegPressure {
+  enum RegKind {
+    SGPR32,
+    SGPR_TUPLE,
+    VGPR32,
+    VGPR_TUPLE,
+    TOTAL_KINDS
+  };
+
+  GCNRegPressure() {
+    clear();
+  }
+
+  bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; }
+
+  void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); }
+
+  unsigned getSGPRNum() const { return Value[SGPR32]; }
+  unsigned getVGPRNum() const { return Value[VGPR32]; }
+
+  unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
+  unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
+
+  unsigned getOccupancy(const SISubtarget &ST) const {
+    return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()),
+                    ST.getOccupancyWithNumVGPRs(getVGPRNum()));
+  }
+
+  void inc(unsigned Reg,
+           LaneBitmask PrevMask,
+           LaneBitmask NewMask,
+           const MachineRegisterInfo &MRI);
+
+  bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const {
+    return getOccupancy(ST) > O.getOccupancy(ST);
+  }
+
+  bool less(const SISubtarget &ST, const GCNRegPressure& O,
+            unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
+
+  bool operator==(const GCNRegPressure &O) const {
+    return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value);
+  }
+
+  bool operator!=(const GCNRegPressure &O) const {
+    return !(*this == O);
+  }
+
+  void print(raw_ostream &OS, const SISubtarget *ST=nullptr) const;
+  void dump() const { print(dbgs()); }
+
+private:
+  unsigned Value[TOTAL_KINDS];
+
+  static unsigned getRegKind(unsigned Reg, const MachineRegisterInfo &MRI);
+
+  friend GCNRegPressure max(const GCNRegPressure &P1,
+                            const GCNRegPressure &P2);
+};
+
+inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
+  GCNRegPressure Res;
+  for (unsigned I = 0; I < GCNRegPressure::TOTAL_KINDS; ++I)
+    Res.Value[I] = std::max(P1.Value[I], P2.Value[I]);
+  return Res;
+}
+
+class GCNRPTracker {
+public:
+  typedef DenseMap<unsigned, LaneBitmask> LiveRegSet;
+
+protected:
+  LiveRegSet LiveRegs;
+  GCNRegPressure CurPressure, MaxPressure;
+  const MachineInstr *LastTrackedMI = nullptr;
+  mutable const MachineRegisterInfo *MRI = nullptr;
+  GCNRPTracker() {}
+public:
+  // live regs for the current state
+  const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
+  const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
+
+  // returns MaxPressure, resetting it
+  decltype(MaxPressure) moveMaxPressure() {
+    auto Res = MaxPressure;
+    MaxPressure.clear();
+    return Res;
+  }
+  decltype(LiveRegs) moveLiveRegs() {
+    return std::move(LiveRegs);
+  }
+};
+
+class GCNUpwardRPTracker : public GCNRPTracker {
+  const LiveIntervals &LIS;
+  LaneBitmask getDefRegMask(const MachineOperand &MO) const;
+  LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
+public:
+  GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+  // reset tracker to the point just below MI
+  // filling live regs upon this point using LIS
+  void reset(const MachineInstr &MI);
+
+  // move to the state just above the MI
+  void recede(const MachineInstr &MI);
+
+  // checks whether the tracker's state after receding MI corresponds
+  // to the one reported by LIS
+  bool isValid() const;
+};
+
+LaneBitmask getLiveLaneMask(unsigned Reg,
+                            SlotIndex SI,
+                            const LiveIntervals &LIS,
+                            const MachineRegisterInfo &MRI);
+
+GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI,
+                                     const LiveIntervals &LIS,
+                                     const MachineRegisterInfo &MRI);
+
+inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,
+                                                 const LiveIntervals &LIS) {
+  return getLiveRegs(LIS.getInstructionIndex(MI).getDeadSlot(), LIS,
+                     MI.getParent()->getParent()->getRegInfo());
+}
+
+inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI,
+                                                  const LiveIntervals &LIS) {
+  return getLiveRegs(LIS.getInstructionIndex(MI).getBaseIndex(), LIS,
+                     MI.getParent()->getParent()->getRegInfo());
+}
+
+template <typename Range>
+GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
+                              Range &&LiveRegs) {
+  GCNRegPressure Res;
+  for (const auto &RM : LiveRegs)
+    Res.inc(RM.first, LaneBitmask::getNone(), RM.second, MRI);
+  return Res;
+}
+
+void printLivesAt(SlotIndex SI,
+                  const LiveIntervals &LIS,
+                  const MachineRegisterInfo &MRI);
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
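
The free functions at the bottom of the header compose into a one-off pressure query: getLiveRegsBefore pulls the live set at an instruction out of LiveIntervals, getRegPressure folds it into a GCNRegPressure, and getOccupancy converts that into a wave count. A short sketch of the pattern (the same one printLivenessInfo in GCNIterativeScheduler.cpp uses; MI, LIS and the SISubtarget ST are assumed to be in scope):

  const auto &MRI = MI.getParent()->getParent()->getRegInfo();
  const auto LiveIns = getLiveRegsBefore(MI, LIS); // DenseMap<unsigned, LaneBitmask>
  const GCNRegPressure RP = getRegPressure(MRI, LiveIns);
  const unsigned Occ = RP.getOccupancy(ST);        // SGPR/VGPR-limited waves
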
Index: llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ llvm/trunk/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -0,0 +1,355 @@
+//===------------------------- GCNRegPressure.cpp - -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNRegPressure.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void llvm::printLivesAt(SlotIndex SI,
+                        const LiveIntervals &LIS,
+                        const MachineRegisterInfo &MRI) {
+  dbgs() << "Live regs at " << SI << ": "
+         << *LIS.getInstructionFromIndex(SI);
+  unsigned Num = 0;
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+    const unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+    if (MRI.reg_nodbg_empty(Reg))
+      continue;
+    const auto &LI = LIS.getInterval(Reg);
+    if (LI.hasSubRanges()) {
+      bool firstTime = true;
+      for (const auto &S : LI.subranges()) {
+        if (!S.liveAt(SI)) continue;
+        if (firstTime) {
+          dbgs() << "  " << PrintReg(Reg, MRI.getTargetRegisterInfo())
+                 << '\n';
+          firstTime = false;
+        }
+        dbgs() << "  " << S << '\n';
+        ++Num;
+      }
+    } else if (LI.liveAt(SI)) {
+      dbgs() << "  " << LI << '\n';
+      ++Num;
+    }
+  }
+  if (!Num) dbgs() << "  <none>\n";
+}
+
+static bool isEqual(const GCNRPTracker::LiveRegSet &S1,
+                    const GCNRPTracker::LiveRegSet &S2) {
+  if (S1.size() != S2.size())
+    return false;
+
+  for (const auto &P : S1) {
+    auto I = S2.find(P.first);
+    if (I == S2.end() || I->second != P.second)
+      return false;
+  }
+  return true;
+}
+
+static GCNRPTracker::LiveRegSet
+stripEmpty(const GCNRPTracker::LiveRegSet &LR) {
+  GCNRPTracker::LiveRegSet Res;
+  for (const auto &P : LR) {
+    if (P.second.any())
+      Res.insert(P);
+  }
+  return Res;
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// GCNRegPressure
+
+unsigned GCNRegPressure::getRegKind(unsigned Reg,
+                                    const MachineRegisterInfo &MRI) {
+  assert(TargetRegisterInfo::isVirtualRegister(Reg));
+  const auto RC = MRI.getRegClass(Reg);
+  auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
+  return STI->isSGPRClass(RC) ?
+    (RC->getSize() == 4 ? SGPR32 : SGPR_TUPLE) :
+    (RC->getSize() == 4 ? VGPR32 : VGPR_TUPLE);
+}
+
+void GCNRegPressure::inc(unsigned Reg,
+                         LaneBitmask PrevMask,
+                         LaneBitmask NewMask,
+                         const MachineRegisterInfo &MRI) {
+  if (NewMask == PrevMask)
+    return;
+
+  int Sign = 1;
+  if (NewMask < PrevMask) {
+    std::swap(NewMask, PrevMask);
+    Sign = -1;
+  }
+#ifndef NDEBUG
+  const auto MaxMask = MRI.getMaxLaneMaskForVReg(Reg);
+#endif
+  switch (auto Kind = getRegKind(Reg, MRI)) {
+  case SGPR32:
+  case VGPR32:
+    assert(PrevMask.none() && NewMask == MaxMask);
+    Value[Kind] += Sign;
+    break;
+
+  case SGPR_TUPLE:
+  case VGPR_TUPLE:
+    assert(NewMask < MaxMask || NewMask == MaxMask);
+    assert(PrevMask < NewMask);
+
+    Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] +=
+      Sign * countPopulation((~PrevMask & NewMask).getAsInteger());
+
+    if (PrevMask.none()) {
+      assert(NewMask.any());
+      Value[Kind] += Sign * MRI.getPressureSets(Reg).getWeight();
+    }
+    break;
+
+  default: llvm_unreachable("Unknown register kind");
+  }
+}
+
+bool GCNRegPressure::less(const SISubtarget &ST,
+                          const GCNRegPressure& O,
+                          unsigned MaxOccupancy) const {
+  const auto SGPROcc = std::min(MaxOccupancy,
+                                ST.getOccupancyWithNumSGPRs(getSGPRNum()));
+  const auto VGPROcc = std::min(MaxOccupancy,
+                                ST.getOccupancyWithNumVGPRs(getVGPRNum()));
+  const auto OtherSGPROcc = std::min(MaxOccupancy,
+                                ST.getOccupancyWithNumSGPRs(O.getSGPRNum()));
+  const auto OtherVGPROcc = std::min(MaxOccupancy,
+                                ST.getOccupancyWithNumVGPRs(O.getVGPRNum()));
+
+  const auto Occ = std::min(SGPROcc, VGPROcc);
+  const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
+  if (Occ != OtherOcc)
+    return Occ > OtherOcc;
+
+  bool SGPRImportant = SGPROcc < VGPROcc;
+  const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc;
+
+  // if the two pressures disagree on what is more important, compare VGPRs
+  if (SGPRImportant != OtherSGPRImportant) {
+    SGPRImportant = false;
+  }
+
+  // compare tuple (large register) pressure
+  bool SGPRFirst = SGPRImportant;
+  for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) {
+    if (SGPRFirst) {
+      auto SW = getSGPRTuplesWeight();
+      auto OtherSW = O.getSGPRTuplesWeight();
+      if (SW != OtherSW)
+        return SW < OtherSW;
+    } else {
+      auto VW = getVGPRTuplesWeight();
+      auto OtherVW = O.getVGPRTuplesWeight();
+      if (VW != OtherVW)
+        return VW < OtherVW;
+    }
+  }
+  return SGPRImportant ? (getSGPRNum() < O.getSGPRNum()):
+                         (getVGPRNum() < O.getVGPRNum());
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
+  OS << "VGPRs: " << getVGPRNum();
+  if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
+  OS << ", SGPRs: " << getSGPRNum();
+  if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')';
+  OS << ", LVGPR WT: " << getVGPRTuplesWeight()
+     << ", LSGPR WT: " << getSGPRTuplesWeight();
+  if (ST) OS << " -> Occ: " << getOccupancy(*ST);
+  OS << '\n';
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// GCNRPTracker
+
+LaneBitmask llvm::getLiveLaneMask(unsigned Reg,
+                                  SlotIndex SI,
+                                  const LiveIntervals &LIS,
+                                  const MachineRegisterInfo &MRI) {
+  assert(!MRI.reg_nodbg_empty(Reg));
+  LaneBitmask LiveMask;
+  const auto &LI = LIS.getInterval(Reg);
+  if (LI.hasSubRanges()) {
+    for (const auto &S : LI.subranges())
+      if (S.liveAt(SI)) {
+        LiveMask |= S.LaneMask;
+        assert(LiveMask < MRI.getMaxLaneMaskForVReg(Reg) ||
+               LiveMask == MRI.getMaxLaneMaskForVReg(Reg));
+      }
+  } else if (LI.liveAt(SI)) {
+    LiveMask = MRI.getMaxLaneMaskForVReg(Reg);
+  }
+  return LiveMask;
+}
+
+GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
+                                           const LiveIntervals &LIS,
+                                           const MachineRegisterInfo &MRI) {
+  GCNRPTracker::LiveRegSet LiveRegs;
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+    auto Reg = TargetRegisterInfo::index2VirtReg(I);
+    if (MRI.reg_nodbg_empty(Reg))
+      continue;
+    auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
+    if (LiveMask.any())
+      LiveRegs[Reg] = LiveMask;
+  }
+  return LiveRegs;
+}
+
+void GCNUpwardRPTracker::reset(const MachineInstr &MI) {
+  MRI = &MI.getParent()->getParent()->getRegInfo();
+  LiveRegs = getLiveRegsAfter(MI, LIS);
+  MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+}
+
+LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const {
+  assert(MO.isDef() && MO.isReg() &&
+         TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+  // We don't rely on read-undef flag because in case of tentative schedule
+  // tracking it isn't set correctly yet. This works correctly however since
+  // use mask has been tracked before using LIS.
+  return MO.getSubReg() == 0 ?
+    MRI->getMaxLaneMaskForVReg(MO.getReg()) :
+    MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
+}
+
+LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const {
+  assert(MO.isUse() && MO.isReg() &&
+         TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+
+  if (auto SubReg = MO.getSubReg())
+    return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
+
+  auto MaxMask = MRI->getMaxLaneMaskForVReg(MO.getReg());
+  if (MaxMask.getAsInteger() == 1) // cannot have subregs
+    return MaxMask;
+
+  // For a tentative schedule LIS isn't updated yet but livemask should remain
+  // the same on any schedule. Subreg defs can be reordered but they all must
+  // dominate uses anyway.
+  auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
+  return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI);
+}
+
+void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
+  assert(MRI && "call reset first");
+
+  LastTrackedMI = &MI;
+
+  if (MI.isDebugValue())
+    return;
+
+  // process all defs first to ensure early clobbers are handled correctly
+  // iterating over operands() to catch implicit defs
+  for (const auto &MO : MI.operands()) {
+    if (!MO.isReg() || !MO.isDef() ||
+        !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      continue;
+
+    auto Reg = MO.getReg();
+    auto &LiveMask = LiveRegs[Reg];
+    auto PrevMask = LiveMask;
+    LiveMask &= ~getDefRegMask(MO);
+    CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+  }
+
+  // then all uses
+  for (const auto &MO : MI.uses()) {
+    if (!MO.isReg() || !MO.readsReg() ||
+        !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      continue;
+
+    auto Reg = MO.getReg();
+    auto &LiveMask = LiveRegs[Reg];
+    auto PrevMask = LiveMask;
+    LiveMask |= getUsedRegMask(MO);
+    CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+  }
+
+  MaxPressure = max(MaxPressure, CurPressure);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
+                           const GCNRPTracker::LiveRegSet &TrackedLR,
+                           const TargetRegisterInfo *TRI) {
+  for (auto const &P : TrackedLR) {
+    auto I = LISLR.find(P.first);
+    if (I == LISLR.end()) {
+      dbgs() << "  " << PrintReg(P.first, TRI)
+             << ":L" << PrintLaneMask(P.second)
+             << " isn't found in LIS reported set\n";
+    }
+    else if (I->second != P.second) {
+      dbgs() << "  " << PrintReg(P.first, TRI)
+             << " masks don't match: LIS reported "
+             << PrintLaneMask(I->second)
+             << ", tracked "
+             << PrintLaneMask(P.second)
+             << '\n';
+    }
+  }
+  for (auto const &P : LISLR) {
+    auto I = TrackedLR.find(P.first);
+    if (I == TrackedLR.end()) {
+      dbgs() << "  " << PrintReg(P.first, TRI)
+             << ":L" << PrintLaneMask(P.second)
+             << " isn't found in tracked set\n";
+    }
+  }
+}
+
+bool GCNUpwardRPTracker::isValid() const {
+  const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
+  const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI);
+  const auto TrackedLR = stripEmpty(LiveRegs);
+
+  if (!isEqual(LISLR, TrackedLR)) {
+    dbgs() << "\nGCNUpwardRPTracker error: Tracked and"
+              " LIS reported livesets mismatch:\n";
+    printLivesAt(SI, LIS, *MRI);
+    reportMismatch(LISLR, TrackedLR, MRI->getTargetRegisterInfo());
+    return false;
+  }
+
+  auto LISPressure = getRegPressure(*MRI, LISLR);
+  if (LISPressure != CurPressure) {
+    dbgs() << "GCNUpwardRPTracker error: Pressure sets different\nTracked: ";
+    CurPressure.print(dbgs());
+    dbgs() << "LIS rpt: ";
+    LISPressure.print(dbgs());
+    return false;
+  }
+  return true;
+}
+
+#endif
Index: llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -55,6 +55,8 @@
   SUnit *pickNode(bool &IsTopNode) override;
 
   void initialize(ScheduleDAGMI *DAG) override;
+
+  void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
 };
 
 class GCNScheduleDAGMILive : public ScheduleDAGMILive {
Index: llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -45,8 +45,6 @@
 
   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
 
-  if (MF != &DAG->MF)
-    TargetOccupancy = 0;
   MF = &DAG->MF;
 
   const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
@@ -531,7 +529,7 @@
   Stage++;
 
   GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
-  S.TargetOccupancy = MinOccupancy;
+  S.setTargetOccupancy(MinOccupancy);
 
   MachineBasicBlock *MBB = nullptr;
   for (auto Region : Regions) {
Index: llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
@@ -1,4 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
 
 ; We expect a two digit VGPR usage here, not a three digit.
 ; CHECK: NumVgprs: {{[0-9][0-9]$}}
Index: llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -0,0 +1,288 @@
+; RUN: llc -march=amdgcn -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+
+; SI: NumSgprs: {{[1-9]$}}
+; SI: NumVgprs: {{[1-9]$}}
+
+; stores may alias loads
+; VI: NumSgprs: {{[1-5][0-9]$}}
+; VI: NumVgprs: {{[1-3][0-9]$}}
+
+define void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
+bb:
+  %adr.a.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20004
+  %adr.b.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20252
+  %adr.c.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20508
+  %adr.a.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20772
+  %adr.b.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21020
+  %adr.c.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21276
+  %adr.a.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21540
+  %adr.b.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21788
+  %adr.c.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22044
+  %adr.a.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22308
+  %adr.b.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22556
+  %adr.c.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22812
+  %adr.a.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23076
+  %adr.b.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23324
+  %adr.c.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23580
+  %adr.a.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23844
+  %adr.b.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24092
+  %adr.c.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24348
+  %adr.a.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24612
+  %adr.b.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24860
+  %adr.c.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25116
+  %adr.a.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25380
+  %adr.b.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25628
+  %adr.c.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25884
+  %adr.a.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26148
+  %adr.b.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26396
+  %adr.c.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26652
+  %adr.a.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26916
+  %adr.b.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27164
+  %adr.c.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27420
+  %adr.a.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27684
+  %adr.b.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27932
+  %adr.c.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28188
+  %adr.a.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28452
+  %adr.b.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28700
+  %adr.c.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28956
+  %adr.a.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29220
+  %adr.b.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29468
+  %adr.c.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29724
+  %adr.a.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29988
+  %adr.b.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30236
+  %adr.c.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30492
+  %adr.a.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30756
+  %adr.b.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31004
+  %adr.c.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31260
+  %adr.a.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31524
+  %adr.b.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31772
+  %adr.c.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32028
+  %adr.a.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32292
+  %adr.b.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32540
+  %adr.c.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32796
+  %adr.a.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33060
+  %adr.b.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33308
+  %adr.c.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33564
+  %adr.a.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33828
+  %adr.b.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34076
+  %adr.c.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34332
+  %adr.a.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34596
+  %adr.b.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34844
+  %adr.c.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35100
+  %adr.a.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35364
+  %adr.b.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35612
+  %adr.c.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35868
+  %adr.a.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36132
+  %adr.b.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36380
+  %adr.c.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36636
+  %adr.a.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36900
+  %adr.b.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37148
+  %adr.c.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37404
+  %adr.a.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37668
+  %adr.b.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37916
+  %adr.c.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38172
+  %adr.a.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38436
+  %adr.b.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38684
+  %adr.c.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38940
+  %adr.a.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39204
+  %adr.b.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39452
+  %adr.c.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39708
+  %adr.a.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39972
+  %adr.b.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40220
+  %adr.c.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40476
+  %adr.a.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40740
+  %adr.b.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40988
+  %adr.c.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41244
+  %adr.a.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41508
+  %adr.b.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41756
+  %adr.c.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42012
+  %adr.a.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42276
+  %adr.b.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42524
+  %adr.c.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42780
+  %a.0 = load float, float addrspace(3)* %adr.a.0, align 4
+  %b.0 = load float, float addrspace(3)* %adr.b.0, align 4
+  %c.0 = load float, float addrspace(3)* %adr.c.0, align 4
+  %a.1 = load float, float addrspace(3)* %adr.a.1, align 4
+  %b.1 = load float, float addrspace(3)* %adr.b.1, align 4
+  %c.1 = load float, float addrspace(3)* %adr.c.1, align 4
+  %a.2 = load float, float addrspace(3)* %adr.a.2, align 4
+  %b.2 = load float, float addrspace(3)* %adr.b.2, align 4
+  %c.2 = load float, float addrspace(3)* %adr.c.2, align 4
+  %a.3 = load float, float addrspace(3)* %adr.a.3, align 4
+  %b.3 = load float, float addrspace(3)* %adr.b.3, align 4
+  %c.3 = load float, float addrspace(3)* %adr.c.3, align 4
+  %a.4 = load float, float addrspace(3)* %adr.a.4, align 4
+  %b.4 = load float, float addrspace(3)* %adr.b.4, align 4
+  %c.4 = load float, float addrspace(3)* %adr.c.4, align 4
+  %a.5 = load float, float addrspace(3)* %adr.a.5, align 4
+  %b.5 = load float, float addrspace(3)* %adr.b.5, align 4
+  %c.5 = load float, float addrspace(3)* %adr.c.5, align 4
+  %a.6 = load float, float addrspace(3)* %adr.a.6, align 4
+  %b.6 = load float, float addrspace(3)* %adr.b.6, align 4
+  %c.6 = load float, float addrspace(3)* %adr.c.6, align 4
+  %a.7 = load float, float addrspace(3)* %adr.a.7, align 4
+  %b.7 = load float, float addrspace(3)* %adr.b.7, align 4
+  %c.7 = load float, float addrspace(3)* %adr.c.7, align 4
+  %a.8 = load float, float addrspace(3)* %adr.a.8, align 4
+  %b.8 = load float, float addrspace(3)* %adr.b.8, align 4
+  %c.8 = load float, float addrspace(3)* %adr.c.8, align 4
+  %a.9 = load float, float addrspace(3)* %adr.a.9, align 4
+  %b.9 = load float, float addrspace(3)* %adr.b.9, align 4
+  %c.9 = load float, float addrspace(3)* %adr.c.9, align 4
+  %a.10 = load float, float addrspace(3)* %adr.a.10, align 4
+  %b.10 = load float, float addrspace(3)* %adr.b.10, align 4
+  %c.10 = load float, float addrspace(3)* %adr.c.10, align 4
+  %a.11 = load float, float addrspace(3)* %adr.a.11, align 4
+  %b.11 = load float, float addrspace(3)* %adr.b.11, align 4
+  %c.11 = load float, float addrspace(3)* %adr.c.11, align 4
%a.12 = load float, float addrspace(3)* %adr.a.12, align 4 + %b.12 = load float, float addrspace(3)* %adr.b.12, align 4 + %c.12 = load float, float addrspace(3)* %adr.c.12, align 4 + %a.13 = load float, float addrspace(3)* %adr.a.13, align 4 + %b.13 = load float, float addrspace(3)* %adr.b.13, align 4 + %c.13 = load float, float addrspace(3)* %adr.c.13, align 4 + %a.14 = load float, float addrspace(3)* %adr.a.14, align 4 + %b.14 = load float, float addrspace(3)* %adr.b.14, align 4 + %c.14 = load float, float addrspace(3)* %adr.c.14, align 4 + %a.15 = load float, float addrspace(3)* %adr.a.15, align 4 + %b.15 = load float, float addrspace(3)* %adr.b.15, align 4 + %c.15 = load float, float addrspace(3)* %adr.c.15, align 4 + %a.16 = load float, float addrspace(3)* %adr.a.16, align 4 + %b.16 = load float, float addrspace(3)* %adr.b.16, align 4 + %c.16 = load float, float addrspace(3)* %adr.c.16, align 4 + %a.17 = load float, float addrspace(3)* %adr.a.17, align 4 + %b.17 = load float, float addrspace(3)* %adr.b.17, align 4 + %c.17 = load float, float addrspace(3)* %adr.c.17, align 4 + %a.18 = load float, float addrspace(3)* %adr.a.18, align 4 + %b.18 = load float, float addrspace(3)* %adr.b.18, align 4 + %c.18 = load float, float addrspace(3)* %adr.c.18, align 4 + %a.19 = load float, float addrspace(3)* %adr.a.19, align 4 + %b.19 = load float, float addrspace(3)* %adr.b.19, align 4 + %c.19 = load float, float addrspace(3)* %adr.c.19, align 4 + %a.20 = load float, float addrspace(3)* %adr.a.20, align 4 + %b.20 = load float, float addrspace(3)* %adr.b.20, align 4 + %c.20 = load float, float addrspace(3)* %adr.c.20, align 4 + %a.21 = load float, float addrspace(3)* %adr.a.21, align 4 + %b.21 = load float, float addrspace(3)* %adr.b.21, align 4 + %c.21 = load float, float addrspace(3)* %adr.c.21, align 4 + %a.22 = load float, float addrspace(3)* %adr.a.22, align 4 + %b.22 = load float, float addrspace(3)* %adr.b.22, align 4 + %c.22 = load float, float addrspace(3)* %adr.c.22, align 4 + %a.23 = load float, float addrspace(3)* %adr.a.23, align 4 + %b.23 = load float, float addrspace(3)* %adr.b.23, align 4 + %c.23 = load float, float addrspace(3)* %adr.c.23, align 4 + %a.24 = load float, float addrspace(3)* %adr.a.24, align 4 + %b.24 = load float, float addrspace(3)* %adr.b.24, align 4 + %c.24 = load float, float addrspace(3)* %adr.c.24, align 4 + %a.25 = load float, float addrspace(3)* %adr.a.25, align 4 + %b.25 = load float, float addrspace(3)* %adr.b.25, align 4 + %c.25 = load float, float addrspace(3)* %adr.c.25, align 4 + %a.26 = load float, float addrspace(3)* %adr.a.26, align 4 + %b.26 = load float, float addrspace(3)* %adr.b.26, align 4 + %c.26 = load float, float addrspace(3)* %adr.c.26, align 4 + %a.27 = load float, float addrspace(3)* %adr.a.27, align 4 + %b.27 = load float, float addrspace(3)* %adr.b.27, align 4 + %c.27 = load float, float addrspace(3)* %adr.c.27, align 4 + %a.28 = load float, float addrspace(3)* %adr.a.28, align 4 + %b.28 = load float, float addrspace(3)* %adr.b.28, align 4 + %c.28 = load float, float addrspace(3)* %adr.c.28, align 4 + %a.29 = load float, float addrspace(3)* %adr.a.29, align 4 + %b.29 = load float, float addrspace(3)* %adr.b.29, align 4 + %c.29 = load float, float addrspace(3)* %adr.c.29, align 4 + %res.0 = tail call float @llvm.fmuladd.f32(float %a.0, float %b.0, float %c.0) + %res.1 = tail call float @llvm.fmuladd.f32(float %a.1, float %b.1, float %c.1) + %res.2 = tail call float @llvm.fmuladd.f32(float %a.2, float %b.2, float %c.2) + %res.3 = tail 
call float @llvm.fmuladd.f32(float %a.3, float %b.3, float %c.3) + %res.4 = tail call float @llvm.fmuladd.f32(float %a.4, float %b.4, float %c.4) + %res.5 = tail call float @llvm.fmuladd.f32(float %a.5, float %b.5, float %c.5) + %res.6 = tail call float @llvm.fmuladd.f32(float %a.6, float %b.6, float %c.6) + %res.7 = tail call float @llvm.fmuladd.f32(float %a.7, float %b.7, float %c.7) + %res.8 = tail call float @llvm.fmuladd.f32(float %a.8, float %b.8, float %c.8) + %res.9 = tail call float @llvm.fmuladd.f32(float %a.9, float %b.9, float %c.9) + %res.10 = tail call float @llvm.fmuladd.f32(float %a.10, float %b.10, float %c.10) + %res.11 = tail call float @llvm.fmuladd.f32(float %a.11, float %b.11, float %c.11) + %res.12 = tail call float @llvm.fmuladd.f32(float %a.12, float %b.12, float %c.12) + %res.13 = tail call float @llvm.fmuladd.f32(float %a.13, float %b.13, float %c.13) + %res.14 = tail call float @llvm.fmuladd.f32(float %a.14, float %b.14, float %c.14) + %res.15 = tail call float @llvm.fmuladd.f32(float %a.15, float %b.15, float %c.15) + %res.16 = tail call float @llvm.fmuladd.f32(float %a.16, float %b.16, float %c.16) + %res.17 = tail call float @llvm.fmuladd.f32(float %a.17, float %b.17, float %c.17) + %res.18 = tail call float @llvm.fmuladd.f32(float %a.18, float %b.18, float %c.18) + %res.19 = tail call float @llvm.fmuladd.f32(float %a.19, float %b.19, float %c.19) + %res.20 = tail call float @llvm.fmuladd.f32(float %a.20, float %b.20, float %c.20) + %res.21 = tail call float @llvm.fmuladd.f32(float %a.21, float %b.21, float %c.21) + %res.22 = tail call float @llvm.fmuladd.f32(float %a.22, float %b.22, float %c.22) + %res.23 = tail call float @llvm.fmuladd.f32(float %a.23, float %b.23, float %c.23) + %res.24 = tail call float @llvm.fmuladd.f32(float %a.24, float %b.24, float %c.24) + %res.25 = tail call float @llvm.fmuladd.f32(float %a.25, float %b.25, float %c.25) + %res.26 = tail call float @llvm.fmuladd.f32(float %a.26, float %b.26, float %c.26) + %res.27 = tail call float @llvm.fmuladd.f32(float %a.27, float %b.27, float %c.27) + %res.28 = tail call float @llvm.fmuladd.f32(float %a.28, float %b.28, float %c.28) + %res.29 = tail call float @llvm.fmuladd.f32(float %a.29, float %b.29, float %c.29) + %adr.res.0 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 0 + %adr.res.1 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 2 + %adr.res.2 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 4 + %adr.res.3 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 6 + %adr.res.4 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 8 + %adr.res.5 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 10 + %adr.res.6 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 12 + %adr.res.7 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 14 + %adr.res.8 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 16 + %adr.res.9 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 18 + %adr.res.10 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 20 + %adr.res.11 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 22 + %adr.res.12 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 24 + %adr.res.13 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 26 + %adr.res.14 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 28 + %adr.res.15 = getelementptr inbounds float, float addrspace(1)* 
%out_arg, i64 30 + %adr.res.16 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 32 + %adr.res.17 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 34 + %adr.res.18 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 36 + %adr.res.19 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 38 + %adr.res.20 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 40 + %adr.res.21 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 42 + %adr.res.22 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 44 + %adr.res.23 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 46 + %adr.res.24 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 48 + %adr.res.25 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 50 + %adr.res.26 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 52 + %adr.res.27 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 54 + %adr.res.28 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 56 + %adr.res.29 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 58 + store float %res.0, float addrspace(1)* %adr.res.0, align 4 + store float %res.1, float addrspace(1)* %adr.res.1, align 4 + store float %res.2, float addrspace(1)* %adr.res.2, align 4 + store float %res.3, float addrspace(1)* %adr.res.3, align 4 + store float %res.4, float addrspace(1)* %adr.res.4, align 4 + store float %res.5, float addrspace(1)* %adr.res.5, align 4 + store float %res.6, float addrspace(1)* %adr.res.6, align 4 + store float %res.7, float addrspace(1)* %adr.res.7, align 4 + store float %res.8, float addrspace(1)* %adr.res.8, align 4 + store float %res.9, float addrspace(1)* %adr.res.9, align 4 + store float %res.10, float addrspace(1)* %adr.res.10, align 4 + store float %res.11, float addrspace(1)* %adr.res.11, align 4 + store float %res.12, float addrspace(1)* %adr.res.12, align 4 + store float %res.13, float addrspace(1)* %adr.res.13, align 4 + store float %res.14, float addrspace(1)* %adr.res.14, align 4 + store float %res.15, float addrspace(1)* %adr.res.15, align 4 + store float %res.16, float addrspace(1)* %adr.res.16, align 4 + store float %res.17, float addrspace(1)* %adr.res.17, align 4 + store float %res.18, float addrspace(1)* %adr.res.18, align 4 + store float %res.19, float addrspace(1)* %adr.res.19, align 4 + store float %res.20, float addrspace(1)* %adr.res.20, align 4 + store float %res.21, float addrspace(1)* %adr.res.21, align 4 + store float %res.22, float addrspace(1)* %adr.res.22, align 4 + store float %res.23, float addrspace(1)* %adr.res.23, align 4 + store float %res.24, float addrspace(1)* %adr.res.24, align 4 + store float %res.25, float addrspace(1)* %adr.res.25, align 4 + store float %res.26, float addrspace(1)* %adr.res.26, align 4 + store float %res.27, float addrspace(1)* %adr.res.27, align 4 + store float %res.28, float addrspace(1)* %adr.res.28, align 4 + store float %res.29, float addrspace(1)* %adr.res.29, align 4 + ret void +} +declare float @llvm.fmuladd.f32(float, float, float) #0 +attributes #0 = { nounwind readnone }