Index: lib/CodeGen/MachinePipeliner.cpp
===================================================================
--- lib/CodeGen/MachinePipeliner.cpp
+++ lib/CodeGen/MachinePipeliner.cpp
@@ -119,6 +119,8 @@
 #include <utility>
 #include <vector>
 
+/* #define USE_DFAPacketizer_P 1 */
+
 using namespace llvm;
 
 #define DEBUG_TYPE "pipeliner"
@@ -601,12 +603,18 @@
   /// Virtual register information.
   MachineRegisterInfo &MRI;
 
+#ifdef USE_DFAPacketizer_P
   std::unique_ptr<DFAPacketizer> Resources;
-
+#endif /* USE_DFAPacketizer_P */
 public:
+#ifdef USE_DFAPacketizer_P
   SMSchedule(MachineFunction *mf)
       : ST(mf->getSubtarget()), MRI(mf->getRegInfo()),
         Resources(ST.getInstrInfo()->CreateTargetScheduleState(ST)) {}
+#else
+  SMSchedule(MachineFunction *mf)
+      : ST(mf->getSubtarget()), MRI(mf->getRegInfo()) {}
+#endif /* USE_DFAPacketizer_P */
 
   void reset() {
     ScheduledInstrs.clear();
@@ -1309,6 +1317,7 @@
 
 } // end anonymous namespace
 
+#ifdef USE_DFAPacketizer_P
 /// Calculate the resource constrained minimum initiation interval for the
 /// specified loop. We use the DFA to model the resources needed for
 /// each instruction, and we ignore dependences. A different DFA is created
@@ -1377,6 +1386,19 @@
   Resources.clear();
   return Resmii;
 }
+#else
+unsigned SwingSchedulerDAG::calculateResMII() {
+  // Consider only the issue width.
+  MachineBasicBlock *MBB = Loop.getHeader();
+  unsigned size = 0;
+  for (MachineBasicBlock::iterator I = MBB->getFirstNonPHI(),
+                                   E = MBB->getFirstTerminator();
+       I != E; ++I) {
+    size++;
+  }
+  return (size + SchedModel.getIssueWidth() - 1) / SchedModel.getIssueWidth();
+}
+#endif /* USE_DFAPacketizer_P */
 
 /// Calculate the recurrence-constrainted minimum initiation interval.
 /// Iterate over each circuit. Compute the delay(c) and distance(c)
@@ -3483,6 +3505,7 @@
     M->apply(this);
 }
 
+#ifdef USE_DFAPacketizer_P
 /// Try to schedule the node at the specified StartCycle and continue
 /// until the node is schedule or the EndCycle is reached. This function
 /// returns true if the node is scheduled. This routine may search either
@@ -3536,6 +3559,131 @@
   }
   return false;
 }
+#else
+static void clearResources(std::vector<unsigned> &ResourceTable) {
+  for (unsigned Idx = 0; Idx < ResourceTable.size(); ++Idx) {
+    ResourceTable[Idx] = 0;
+  }
+}
+
+static bool canReserveResources(const TargetSubtargetInfo *STI,
+                                const MCSchedModel &SchedModel,
+                                std::vector<unsigned> &ResourceTable,
+                                SUnit *SU) {
+  unsigned SchedClass = SU->getInstr()->getDesc().getSchedClass();
+  const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(SchedClass);
+  unsigned Issues = 0;
+  for (unsigned Idx = 0; Idx < ResourceTable.size(); ++Idx) {
+    Issues += ResourceTable[Idx];
+  }
+  if (Issues >= SchedModel.IssueWidth)
+    return false;
+
+  // TODO: This check is not accurate. It needs information about the
+  // sub-units of each ProcResGroup, as proposed in
+  // "[TargetSchedule] Expose sub-units of a ProcResGroup in MCProcResourceDesc"
+  // (https://reviews.llvm.org/D43023), and it should also track resource
+  // usage with ResourceCycles.
+  bool Check = false;
+  for (const MCWriteProcResEntry &PRE :
+       make_range(STI->getWriteProcResBegin(SC),
+                  STI->getWriteProcResEnd(SC))) {
+    unsigned Idx = PRE.ProcResourceIdx;
+    if (SchedModel.getProcResource(Idx)->NumUnits > ResourceTable[Idx]) {
+      return true;
+    }
+    Check = true;
+  }
+  if (!Check) {
+    ResourceTable[0]++;
+    return true;
+  }
+  return false;
+}
+
+static void reserveResources(const TargetSubtargetInfo *STI,
+                             const MCSchedModel &SchedModel,
+                             std::vector<unsigned> &ResourceTable,
+                             SUnit *SU) {
+  unsigned SchedClass = SU->getInstr()->getDesc().getSchedClass();
+  const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(SchedClass);
+  bool Check = false;
+  for (const MCWriteProcResEntry &PRE :
+       make_range(STI->getWriteProcResBegin(SC),
+                  STI->getWriteProcResEnd(SC))) {
+    unsigned Idx = PRE.ProcResourceIdx;
+    if (SchedModel.getProcResource(Idx)->NumUnits > ResourceTable[Idx]) {
+      ResourceTable[Idx]++;
+      return;
+    }
+    Check = true;
+  }
+  if (!Check) {
+    ResourceTable[0]++;
+    return;
+  }
+  llvm_unreachable("reserveResources");
+}
+
+bool SMSchedule::insert(SUnit *SU, int StartCycle, int EndCycle, int II) {
+  const TargetSubtargetInfo *STI = &ST;
+  std::vector<unsigned> ResourceTable;
+  const MCSchedModel &SchedModel = ST.getSchedModel();
+  {
+    unsigned NumRes = SchedModel.getNumProcResourceKinds();
+    for (unsigned Idx = 0; Idx < NumRes; ++Idx) {
+      ResourceTable.push_back(0);
+    }
+  }
+  bool forward = true;
+  if (StartCycle > EndCycle)
+    forward = false;
+
+  // The terminating condition depends on the direction.
+  int termCycle = forward ? EndCycle + 1 : EndCycle - 1;
+  for (int curCycle = StartCycle; curCycle != termCycle;
+       forward ? ++curCycle : --curCycle) {
+
+    // Add the already scheduled instructions at the specified cycle.
+    clearResources(ResourceTable);
+    for (int checkCycle = FirstCycle + ((curCycle - FirstCycle) % II);
+         checkCycle <= LastCycle; checkCycle += II) {
+      std::deque<SUnit *> &cycleInstrs = ScheduledInstrs[checkCycle];
+
+      for (std::deque<SUnit *>::iterator I = cycleInstrs.begin(),
+                                         E = cycleInstrs.end();
+           I != E; ++I) {
+        if (ST.getInstrInfo()->isZeroCost((*I)->getInstr()->getOpcode()))
+          continue;
+        assert(canReserveResources(STI, SchedModel, ResourceTable, (*I)) &&
+               "These instructions have already been scheduled.");
+        reserveResources(STI, SchedModel, ResourceTable, (*I));
+      }
+    }
+    if (ST.getInstrInfo()->isZeroCost(SU->getInstr()->getOpcode()) ||
+        canReserveResources(STI, SchedModel, ResourceTable, SU)) {
+      DEBUG({
+        dbgs() << "\tinsert at cycle " << curCycle << " ";
+        SU->getInstr()->dump();
+      });
+
+      ScheduledInstrs[curCycle].push_back(SU);
+      InstrToCycle.insert(std::make_pair(SU, curCycle));
+      if (curCycle > LastCycle)
+        LastCycle = curCycle;
+      if (curCycle < FirstCycle)
+        FirstCycle = curCycle;
+      return true;
+    }
+    DEBUG({
+      dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
+      SU->getInstr()->dump();
+    });
+  }
+  return false;
+}
+#endif /* USE_DFAPacketizer_P */
 
 // Return the cycle of the earliest scheduled instruction in the chain.
 int SMSchedule::earliestCycleInChain(const SDep &Dep) {
Index: lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.h
+++ lib/Target/AArch64/AArch64InstrInfo.h
@@ -293,6 +293,16 @@
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+
+  bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+                   MachineInstr *&CmpInst) const override;
+
+  unsigned reduceLoopCount(MachineBasicBlock &MBB,
+                           MachineInstr *IndVar, MachineInstr &Cmp,
+                           SmallVectorImpl<MachineOperand> &Cond,
+                           SmallVectorImpl<MachineInstr *> &PrevInsts,
+                           unsigned Iter, unsigned MaxIter) const override;
+
   bool
   reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
   bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
@@ -309,6 +319,7 @@
   bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
                       unsigned &SrcReg2, int &CmpMask,
                       int &CmpValue) const override;
+
   /// optimizeCompareInstr - Convert the instruction supplying the argument to
   /// the comparison into one that sets the zero bit in the flags register.
   bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -5063,3 +5063,57 @@
 
   return It;
 }
+
+bool AArch64InstrInfo::analyzeLoop(MachineLoop &L,
+                                   MachineInstr *&IndVarInst,
+                                   MachineInstr *&CmpInst) const {
+  MachineBasicBlock *LoopEnd = L.getBottomBlock();
+  MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator();
+  MachineBasicBlock::iterator E = LoopEnd->getFirstNonDebugInstr();
+  MachineInstr *BccMI = nullptr;
+  MachineInstr *CompMI = nullptr;
+  MachineInstr *CopyMI = nullptr;
+  MachineInstr *AddMI = nullptr;
+  for (; I != E; --I) {
+    if (!BccMI && I->getOpcode() == AArch64::Bcc) {
+      BccMI = &*I;
+      AArch64CC::CondCode CC =
+          (AArch64CC::CondCode)BccMI->getOperand(0).getImm();
+      if (CC != AArch64CC::LT)
+        return true;
+    } else if (BccMI && !CompMI && I->getOpcode() == AArch64::SUBSXrr) {
+      CompMI = &*I;
+    } else if (CompMI && !CopyMI && I->getOpcode() == AArch64::COPY) {
+      if (CompMI->getOperand(1).getReg() == I->getOperand(1).getReg()) {
+        CopyMI = &*I;
+      }
+    } else if (CopyMI && !AddMI && I->getOpcode() == AArch64::ADDXri) {
+      if (CompMI->getOperand(1).getReg() == I->getOperand(0).getReg()) {
+        AddMI = &*I;
+      }
+    } else if (AddMI && I->isPHI()) {
+      if (I->getOperand(0).getReg() == AddMI->getOperand(1).getReg() &&
+          I->getOperand(3).getReg() == CopyMI->getOperand(0).getReg()) {
+        IndVarInst = AddMI;
+        CmpInst = CompMI;
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+unsigned AArch64InstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
+                                           MachineInstr *IndVar, MachineInstr &Cmp,
+                                           SmallVectorImpl<MachineOperand> &Cond,
+                                           SmallVectorImpl<MachineInstr *> &PrevInsts,
+                                           unsigned Iter, unsigned MaxIter) const {
+  MachineInstr *CompMI = nullptr;
+  for (auto I = MBB.instr_rbegin(), E = MBB.instr_rend(); I != E; ++I) {
+    if (I->getOpcode() == AArch64::SUBSXrr) {
+      CompMI = &*I;
+    }
+  }
+  unsigned LoopCount = CompMI->getOperand(1).getReg();
+  Cond.push_back(MachineOperand::CreateImm(AArch64CC::LT));
+  return LoopCount;
+}
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -488,6 +488,8 @@
     // be register coaleascer friendly.
     addPass(&PeepholeOptimizerID);
   }
+  if (TM->getOptLevel() >= CodeGenOpt::Default)
+    addPass(&MachinePipelinerID);
 }
 
 void AArch64PassConfig::addPostRegAlloc() {
Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -77,12 +77,6 @@
 unsigned getCMovFromCond(CondCode CC, unsigned RegBytes,
                          bool HasMemoryOperand = false);
 
-// Turn jCC opcode into condition code.
-CondCode getCondFromBranchOpc(unsigned Opc);
-
-// Turn setCC opcode into condition code.
-CondCode getCondFromSETOpc(unsigned Opc);
-
 // Turn CMov opcode into condition code.
 CondCode getCondFromCMovOpc(unsigned Opc);
 
@@ -366,6 +360,16 @@
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+
+  bool analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+                   MachineInstr *&CmpInst) const override;
+
+  unsigned reduceLoopCount(MachineBasicBlock &MBB,
+                           MachineInstr *IndVar, MachineInstr &Cmp,
+                           SmallVectorImpl<MachineOperand> &Cond,
+                           SmallVectorImpl<MachineInstr *> &PrevInsts,
+                           unsigned Iter, unsigned MaxIter) const override;
+
   bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
                        unsigned, unsigned, int &, int &, int &) const override;
   void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -5782,7 +5782,7 @@
   return false;
 }
 
-X86::CondCode X86::getCondFromBranchOpc(unsigned BrOpc) {
+static X86::CondCode getCondFromBranchOpc(unsigned BrOpc) {
   switch (BrOpc) {
   default: return X86::COND_INVALID;
   case X86::JE_1:  return X86::COND_E;
@@ -5805,7 +5805,7 @@
 }
 
 /// Return condition code of a SET opcode.
-X86::CondCode X86::getCondFromSETOpc(unsigned Opc) {
+static X86::CondCode getCondFromSETOpc(unsigned Opc) {
   switch (Opc) {
   default: return X86::COND_INVALID;
   case X86::SETAr:  case X86::SETAm:  return X86::COND_A;
@@ -6130,7 +6130,7 @@
       if (!I->isBranch())
        assert(0 && "Can't find the branch to replace!");
 
-      X86::CondCode CC = X86::getCondFromBranchOpc(I->getOpcode());
+      X86::CondCode CC = getCondFromBranchOpc(I->getOpcode());
       assert(BranchCond.size() == 1);
       if (CC != BranchCond[0].getImm())
        continue;
@@ -6237,7 +6237,7 @@
   }
 
   // Handle conditional branches.
-  X86::CondCode BranchCode = X86::getCondFromBranchOpc(I->getOpcode());
+  X86::CondCode BranchCode = getCondFromBranchOpc(I->getOpcode());
   if (BranchCode == X86::COND_INVALID)
     return true; // Can't handle indirect branch.
 
@@ -6433,7 +6433,7 @@
     if (I->isDebugValue())
       continue;
     if (I->getOpcode() != X86::JMP_1 &&
-        X86::getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+        getCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
       break;
     // Remove the branch.
     I->eraseFromParent();
@@ -6710,12 +6710,102 @@
     return;
   }
 
-  if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
-    // FIXME: We use a fatal error here because historically LLVM has tried
-    // lower some of these physreg copies and we want to ensure we get
-    // reasonable bug reports if someone encounters a case no other testing
-    // found. This path should be removed after the LLVM 7 release.
-    report_fatal_error("Unable to copy EFLAGS physical register!");
+  bool FromEFLAGS = SrcReg == X86::EFLAGS;
+  bool ToEFLAGS = DestReg == X86::EFLAGS;
+  int Reg = FromEFLAGS ? DestReg : SrcReg;
+  bool is32 = X86::GR32RegClass.contains(Reg);
+  bool is64 = X86::GR64RegClass.contains(Reg);
+
+  if ((FromEFLAGS || ToEFLAGS) && (is32 || is64)) {
+    int Mov = is64 ? X86::MOV64rr : X86::MOV32rr;
+    int Push = is64 ? X86::PUSH64r : X86::PUSH32r;
+    int PushF = is64 ? X86::PUSHF64 : X86::PUSHF32;
+    int Pop = is64 ? X86::POP64r : X86::POP32r;
+    int PopF = is64 ? X86::POPF64 : X86::POPF32;
+    int AX = is64 ? X86::RAX : X86::EAX;
+
+    if (!Subtarget.hasLAHFSAHF()) {
+      assert(Subtarget.is64Bit() &&
+             "Not having LAHF/SAHF only happens on 64-bit.");
+      // Moving EFLAGS to / from another register requires a push and a pop.
+      // Notice that we have to adjust the stack if we don't want to clobber
+      // the first frame index. See X86FrameLowering.cpp - usesTheStack.
+      if (FromEFLAGS) {
+        BuildMI(MBB, MI, DL, get(PushF));
+        BuildMI(MBB, MI, DL, get(Pop), DestReg);
+      }
+      if (ToEFLAGS) {
+        BuildMI(MBB, MI, DL, get(Push))
+            .addReg(SrcReg, getKillRegState(KillSrc));
+        BuildMI(MBB, MI, DL, get(PopF));
+      }
+      return;
+    }
+
+    // The flags need to be saved, but saving EFLAGS with PUSHF/POPF is
+    // inefficient. Instead:
+    //   - Save the overflow flag OF into AL using SETO, and restore it using a
+    //     signed 8-bit addition of AL and INT8_MAX.
+    //   - Save/restore the bottom 8 EFLAGS bits (CF, PF, AF, ZF, SF) to/from AH
+    //     using LAHF/SAHF.
+    //   - When RAX/EAX is live and isn't the destination register, make sure it
+    //     isn't clobbered by PUSH/POP'ing it before and after saving/restoring
+    //     the flags.
+    // This approach is ~2.25x faster than using PUSHF/POPF.
+    //
+    // This is still somewhat inefficient because we don't know which flags are
+    // actually live inside EFLAGS. Were we able to do a single SETcc instead of
+    // SETO+LAHF / ADDB+SAHF the code could be 1.02x faster.
+    //
+    // PUSHF/POPF is also potentially incorrect because it affects other flags
+    // such as TF/IF/DF, which LLVM doesn't model.
+    //
+    // Notice that we have to adjust the stack if we don't want to clobber the
+    // first frame index.
+    // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment.
+
+    const TargetRegisterInfo &TRI = getRegisterInfo();
+    MachineBasicBlock::LivenessQueryResult LQR =
+        MBB.computeRegisterLiveness(&TRI, AX, MI);
+    // We do not want to save and restore AX if we do not have to.
+    // Moreover, if we do so whereas AX is dead, we would need to set
+    // an undef flag on the use of AX, otherwise the verifier will
+    // complain that we read an undef value.
+    // We do not want to change the behavior of the machine verifier
+    // as this is usually wrong to read an undef value.
+    if (MachineBasicBlock::LQR_Unknown == LQR) {
+      LivePhysRegs LPR(TRI);
+      LPR.addLiveOuts(MBB);
+      MachineBasicBlock::iterator I = MBB.end();
+      while (I != MI) {
+        --I;
+        LPR.stepBackward(*I);
+      }
+      // AX contains the top most register in the aliasing hierarchy.
+      // It may not be live, but one of its aliases may be.
+      for (MCRegAliasIterator AI(AX, &TRI, true);
+           AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI)
+        LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live
+                                : MachineBasicBlock::LQR_Dead;
+    }
+    bool AXDead = (Reg == AX) || (MachineBasicBlock::LQR_Dead == LQR);
+    if (!AXDead)
+      BuildMI(MBB, MI, DL, get(Push)).addReg(AX, getKillRegState(true));
+    if (FromEFLAGS) {
+      BuildMI(MBB, MI, DL, get(X86::SETOr), X86::AL);
+      BuildMI(MBB, MI, DL, get(X86::LAHF));
+      BuildMI(MBB, MI, DL, get(Mov), Reg).addReg(AX);
+    }
+    if (ToEFLAGS) {
+      BuildMI(MBB, MI, DL, get(Mov), AX).addReg(Reg, getKillRegState(KillSrc));
+      BuildMI(MBB, MI, DL, get(X86::ADD8ri), X86::AL)
+          .addReg(X86::AL)
+          .addImm(INT8_MAX);
+      BuildMI(MBB, MI, DL, get(X86::SAHF));
+    }
+    if (!AXDead)
+      BuildMI(MBB, MI, DL, get(Pop), AX);
+    return;
   }
 
   DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg)
@@ -7375,9 +7465,9 @@
   if (IsCmpZero || IsSwapped) {
     // We decode the condition code from opcode.
     if (Instr.isBranch())
-      OldCC = X86::getCondFromBranchOpc(Instr.getOpcode());
+      OldCC = getCondFromBranchOpc(Instr.getOpcode());
     else {
-      OldCC = X86::getCondFromSETOpc(Instr.getOpcode());
+      OldCC = getCondFromSETOpc(Instr.getOpcode());
       if (OldCC != X86::COND_INVALID)
        OpcIsSET = true;
       else
@@ -9323,9 +9413,8 @@
 isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
   // FIXME: Return false for x87 stack register classes for now. We can't
   // allow any loads of these registers before FpGet_ST0_80.
-  return !(RC == &X86::CCRRegClass || RC == &X86::DFCCRRegClass ||
-           RC == &X86::RFP32RegClass || RC == &X86::RFP64RegClass ||
-           RC == &X86::RFP80RegClass);
+  return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
+           RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
 }
 
 /// Return a virtual register initialized with the
@@ -10872,3 +10961,49 @@
 
   return It;
 }
+
+bool X86InstrInfo::analyzeLoop(MachineLoop &L, MachineInstr *&IndVarInst,
+                               MachineInstr *&CmpInst) const {
+  MachineBasicBlock *LoopEnd = L.getBottomBlock();
+  MachineBasicBlock::iterator I = LoopEnd->getFirstTerminator();
+  MachineBasicBlock::iterator E = LoopEnd->getFirstNonDebugInstr();
+  MachineInstr *JumpMI = nullptr;
+  MachineInstr *CompMI = nullptr;
+  MachineInstr *AddMI = nullptr;
+  for (; I != E; --I) {
+    if (!JumpMI && I->getOpcode() == X86::JL_1) {
+      JumpMI = &*I;
+    } else if (JumpMI && !CompMI && I->getOpcode() == X86::CMP64rr) {
+      CompMI = &*I;
+    } else if (CompMI && !AddMI &&
+               (I->getOpcode() == X86::INC64r ||
+                I->getOpcode() == X86::ADD64ri8) &&
+               CompMI->getOperand(0).getReg() == I->getOperand(0).getReg()) {
+      AddMI = &*I;
+    } else if (AddMI && I->isPHI()) {
+      if (I->getOperand(0).getReg() == AddMI->getOperand(1).getReg() &&
+          I->getOperand(3).getReg() == AddMI->getOperand(0).getReg()) {
+        IndVarInst = AddMI;
+        CmpInst = CompMI;
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+unsigned X86InstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
+                                       MachineInstr *IndVar, MachineInstr &Cmp,
+                                       SmallVectorImpl<MachineOperand> &Cond,
+                                       SmallVectorImpl<MachineInstr *> &PrevInsts,
+                                       unsigned Iter, unsigned MaxIter) const {
+  MachineInstr *CompMI = nullptr;
+  for (auto I = MBB.instr_rbegin(), E = MBB.instr_rend(); I != E; ++I) {
+    if (I->getOpcode() == X86::CMP64rr) {
+      CompMI = &*I;
+    }
+  }
+  unsigned LoopCount = CompMI->getOperand(0).getReg();
+  Cond.push_back(MachineOperand::CreateImm(X86::COND_L));
+  return LoopCount;
+}
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -62,7 +62,6 @@
 void initializeX86CmovConverterPassPass(PassRegistry &);
 void initializeX86ExecutionDepsFixPass(PassRegistry &);
 void initializeX86DomainReassignmentPass(PassRegistry &);
-void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
 
 } // end namespace llvm
 
@@ -81,7 +80,6 @@
   initializeX86CmovConverterPassPass(PR);
   initializeX86ExecutionDepsFixPass(PR);
   initializeX86DomainReassignmentPass(PR);
-  initializeX86FlagsCopyLoweringPassPass(PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -417,7 +415,9 @@
     addPass(createX86CallFrameOptimization());
   }
 
-  addPass(createX86FlagsCopyLoweringPass());
+  if (TM->getOptLevel() >= CodeGenOpt::Default)
+    addPass(&MachinePipelinerID);
+  addPass(createX86WinAllocaExpander());
 }
 
 void X86PassConfig::addMachineSSAOptimization() {
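
Note (not part of the patch): the non-DFA path above replaces the DFAPacketizer with a simple per-cycle resource table, one counter per processor-resource kind, capped by that kind's NumUnits and by the overall issue width. The standalone sketch below illustrates only that bookkeeping outside of LLVM; the ResourceKind/ResourceTable names and the 2-wide example machine are made up for illustration, and in the patch the capacities come from MCSchedModel and MCProcResourceDesc instead.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

// One entry per processor-resource kind, analogous to
// MCProcResourceDesc::NumUnits in the patch.
struct ResourceKind {
  unsigned NumUnits;
};

// Per-cycle usage table, analogous to the std::vector<unsigned> ResourceTable
// plus the issue-width check in canReserveResources()/reserveResources().
struct ResourceTable {
  unsigned IssueWidth;
  std::vector<ResourceKind> Kinds; // capacity of each kind
  std::vector<unsigned> Used;      // units of each kind taken this cycle

  ResourceTable(unsigned IW, std::vector<ResourceKind> K)
      : IssueWidth(IW), Kinds(std::move(K)), Used(Kinds.size(), 0) {}

  // Corresponds to clearResources(): forget everything placed in this cycle.
  void clear() { std::fill(Used.begin(), Used.end(), 0u); }

  // Corresponds to canReserveResources(): the cycle must still have a free
  // issue slot and the requested kind must still have a free unit.
  bool canReserve(std::size_t Idx) const {
    unsigned Issues = std::accumulate(Used.begin(), Used.end(), 0u);
    if (Issues >= IssueWidth)
      return false;
    return Used[Idx] < Kinds[Idx].NumUnits;
  }

  // Corresponds to reserveResources().
  void reserve(std::size_t Idx) { ++Used[Idx]; }
};

int main() {
  // Hypothetical 2-wide machine: kind 0 = ALU with 2 units,
  // kind 1 = load/store with 1 unit.
  ResourceTable RT(/*IssueWidth=*/2, {{2}, {1}});

  std::cout << RT.canReserve(1) << '\n'; // 1: a load fits in an empty cycle
  RT.reserve(1);
  std::cout << RT.canReserve(1) << '\n'; // 0: the single load/store unit is busy
  RT.reserve(0);                         // an ALU op still fits
  std::cout << RT.canReserve(0) << '\n'; // 0: both issue slots are now taken
}

With the sub-unit information proposed in https://reviews.llvm.org/D43023 and with ResourceCycles taken into account, a check like canReserve() could model ProcResGroups precisely, which is exactly the limitation recorded in the TODO inside canReserveResources().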