Index: lib/CodeGen/ExecutionDepsFix.cpp =================================================================== --- lib/CodeGen/ExecutionDepsFix.cpp +++ lib/CodeGen/ExecutionDepsFix.cpp @@ -142,8 +142,30 @@ std::vector> AliasMap; const unsigned NumRegs; LiveReg *LiveRegs; - typedef DenseMap LiveOutMap; - LiveOutMap LiveOuts; + struct MBBInfo { + // Keeps clearance and domain information for all registers. Note that this + // is different from the usual definition notion of liveness. The CPU + // doesn't care whether or not we consider a register killed. + LiveReg *OutRegs; + + // Whether we have gotten to this block in primary processing yet. + bool PrimaryCompleted; + + // The number of predecessors for which primary processing has completed + unsigned IncomingProcessed; + + // The value of `IncomingProcessed` at the start of primary processing + unsigned PrimaryIncoming; + + // The number of predecessors for which all processing steps are done. + unsigned IncomingCompleted; + + MBBInfo() + : OutRegs(nullptr), PrimaryCompleted(false), IncomingProcessed(0), + PrimaryIncoming(0), IncomingCompleted(0) {} + }; + typedef DenseMap MBBInfoMap; + MBBInfoMap MBBInfos; /// List of undefined register reads in this block in forward order. std::vector > UndefReads; @@ -154,11 +176,6 @@ /// Current instruction number. /// The first instruction in each basic block is 0. int CurInstr; - - /// True when the current block has a predecessor that hasn't been visited - /// yet. - bool SeenUnknownBackEdge; - public: ExeDepsFix(const TargetRegisterClass *rc) : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {} @@ -180,7 +197,6 @@ private: iterator_range::const_iterator> regIndices(unsigned Reg) const; - // DomainValue allocation. DomainValue *alloc(int domain = -1); DomainValue *retain(DomainValue *DV) { @@ -199,8 +215,11 @@ void enterBasicBlock(MachineBasicBlock*); void leaveBasicBlock(MachineBasicBlock*); - void visitInstr(MachineInstr*); - void processDefs(MachineInstr*, bool Kill); + bool isBlockDone(MachineBasicBlock *); + void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass); + void updateSuccessors(MachineBasicBlock *MBB, bool PrimaryPass); + bool visitInstr(MachineInstr *); + void processDefs(MachineInstr *, bool breakDependency, bool Kill); void visitSoftInstr(MachineInstr*, unsigned mask); void visitHardInstr(MachineInstr*, unsigned domain); void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx, @@ -360,9 +379,6 @@ /// Set up LiveRegs by merging predecessor live-out values. void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) { - // Detect back-edges from predecessors we haven't processed yet. - SeenUnknownBackEdge = false; - // Reset instruction counter in each basic block. CurInstr = 0; @@ -397,18 +413,19 @@ // Try to coalesce live-out registers from predecessors. for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(), pe = MBB->pred_end(); pi != pe; ++pi) { - LiveOutMap::const_iterator fi = LiveOuts.find(*pi); - if (fi == LiveOuts.end()) { - SeenUnknownBackEdge = true; + auto fi = MBBInfos.find(*pi); + assert(fi != MBBInfos.end() && + "Should have pre-allocated MBBInfos for all MBBs"); + LiveReg *Incoming = fi->second.OutRegs; + if (Incoming == nullptr) { continue; } - assert(fi->second && "Can't have NULL entries"); for (unsigned rx = 0; rx != NumRegs; ++rx) { // Use the most recent predecessor def for each register. - LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, fi->second[rx].Def); + LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def); - DomainValue *pdv = resolve(fi->second[rx].Value); + DomainValue *pdv = resolve(Incoming[rx].Value); if (!pdv) continue; if (!LiveRegs[rx].Value) { @@ -432,35 +449,34 @@ force(rx, pdv->getFirstDomain()); } } - DEBUG(dbgs() << "BB#" << MBB->getNumber() - << (SeenUnknownBackEdge ? ": incomplete\n" : ": all preds known\n")); + DEBUG( + dbgs() << "BB#" << MBB->getNumber() + << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n")); } void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) { assert(LiveRegs && "Must enter basic block first."); - // Save live registers at end of MBB - used by enterBasicBlock(). - // Also use LiveOuts as a visited set to detect back-edges. - bool First = LiveOuts.insert(std::make_pair(MBB, LiveRegs)).second; - - if (First) { - // LiveRegs was inserted in LiveOuts. Adjust all defs to be relative to - // the end of this block instead of the beginning. - for (unsigned i = 0, e = NumRegs; i != e; ++i) - LiveRegs[i].Def -= CurInstr; - } else { - // Insertion failed, this must be the second pass. + LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs; + // Save register clearances at end of MBB - used by enterBasicBlock(). + MBBInfos[MBB].OutRegs = LiveRegs; + + // While processing the basic block, we kept `Def` relative to the start + // of the basic block for convenience. However, future use of this information + // only cares about the clearance from the end of the block, so adjust + // everything to be relative to the end of the basic block. + for (unsigned i = 0, e = NumRegs; i != e; ++i) + LiveRegs[i].Def -= CurInstr; + if (OldOutRegs) { + // This must be the second pass. // Release all the DomainValues instead of keeping them. for (unsigned i = 0, e = NumRegs; i != e; ++i) - release(LiveRegs[i].Value); - delete[] LiveRegs; + release(OldOutRegs[i].Value); + delete[] OldOutRegs; } LiveRegs = nullptr; } -void ExeDepsFix::visitInstr(MachineInstr *MI) { - if (MI->isDebugValue()) - return; - +bool ExeDepsFix::visitInstr(MachineInstr *MI) { // Update instructions with explicit execution domains. std::pair DomP = TII->getExecutionDomain(*MI); if (DomP.first) { @@ -470,9 +486,7 @@ visitHardInstr(MI, DomP.first); } - // Process defs to track register ages, and kill values clobbered by generic - // instructions. - processDefs(MI, !DomP.first); + return !DomP.first; } /// \brief Helps avoid false dependencies on undef registers by updating the @@ -542,14 +556,7 @@ DEBUG(dbgs() << ": Break dependency.\n"); continue; } - // The current clearance seems OK, but we may be ignoring a def from a - // back-edge. - if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) { - DEBUG(dbgs() << ": OK .\n"); - return false; - } - // A def from an unprocessed back-edge may make us break this dependency. - DEBUG(dbgs() << ": Wait for back-edge to resolve.\n"); + DEBUG(dbgs() << ": OK .\n"); return false; } return true; @@ -559,16 +566,19 @@ // If Kill is set, also kill off DomainValues clobbered by the defs. // // Also break dependencies on partial defs and undef uses. -void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) { +void ExeDepsFix::processDefs(MachineInstr *MI, bool breakDependency, + bool Kill) { assert(!MI->isDebugValue() && "Won't process debug values"); // Break dependence on undef uses. Do this before updating LiveRegs below. unsigned OpNum; - unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); - if (Pref) { - pickBestRegisterForUndef(MI, OpNum, Pref); - if (shouldBreakDependence(MI, OpNum, Pref)) - UndefReads.push_back(std::make_pair(MI, OpNum)); + if (breakDependency) { + unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI); + if (Pref) { + pickBestRegisterForUndef(MI, OpNum, Pref); + if (shouldBreakDependence(MI, OpNum, Pref)) + UndefReads.push_back(std::make_pair(MI, OpNum)); + } } const MCInstrDesc &MCID = MI->getDesc(); for (unsigned i = 0, @@ -584,11 +594,13 @@ DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr << '\t' << *MI); - // Check clearance before partial register updates. - // Call breakDependence before setting LiveRegs[rx].Def. - unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); - if (Pref && shouldBreakDependence(MI, i, Pref)) - TII->breakPartialRegDependency(*MI, i, TRI); + if (breakDependency) { + // Check clearance before partial register updates. + // Call breakDependence before setting LiveRegs[rx].Def. + unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI); + if (Pref && shouldBreakDependence(MI, i, Pref)) + TII->breakPartialRegDependency(*MI, i, TRI); + } // How many instructions since rx was last written? LiveRegs[rx].Def = CurInstr; @@ -780,6 +792,52 @@ } } +void ExeDepsFix::processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) { + enterBasicBlock(MBB); + // If this block is not done, it makes little sense to make any decisions + // based on clearance information. We need to make a second pass anyway, + // and by then we'll have better information, so we can avoid doing the work + // to try and break dependencies now. + bool breakDependency = isBlockDone(MBB); + for (MachineInstr &MI : *MBB) { + if (!MI.isDebugValue()) { + bool Kill = false; + if (PrimaryPass) + Kill = visitInstr(&MI); + processDefs(&MI, breakDependency, Kill); + } + } + if (breakDependency) + processUndefReads(MBB); + leaveBasicBlock(MBB); +} + +bool ExeDepsFix::isBlockDone(MachineBasicBlock *MBB) { + return MBBInfos[MBB].PrimaryCompleted && + MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming && + MBBInfos[MBB].IncomingProcessed == MBB->pred_size(); +} + +void ExeDepsFix::updateSuccessors(MachineBasicBlock *MBB, bool Primary) { + bool Done = isBlockDone(MBB); + for (auto *Succ : MBB->successors()) { + if (!isBlockDone(Succ)) { + if (Primary) { + MBBInfos[Succ].IncomingProcessed++; + } + if (Done) { + MBBInfos[Succ].IncomingCompleted++; + } + if (isBlockDone(Succ)) { + // Perform secondary processing for this successor. See the big comment + // in runOnMachineFunction, for an explanation of the iteration order. + processBasicBlock(Succ, false); + updateSuccessors(Succ, false); + } + } + } +} + bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { if (skipFunction(*mf.getFunction())) return false; @@ -816,44 +874,80 @@ AliasMap[*AI].push_back(i); } + // Initialize the MMBInfos + for (auto &MBB : mf) { + MBBInfo InitialInfo; + MBBInfos.insert(std::make_pair(&MBB, InitialInfo)); + } + + /* + * We want to visit every instruction in every basic block in order to update + * it's execution domain or break any false dependencies. However, for the + * dependency breaking, we need to know clearances from all predecessors + * (including any backedges). One way to do so would be to do two complete + * passes over all basic blocks/instructions, the first for recording + * clearances, the second to break the dependencies. However, for functions + * without backedges, or functions with a lot of straight-line code, and + * a small loop, that would be a lot of unnecessary work (since only the + * BBs that are part of the loop require two passes). As an example, + * consider the following loop. + * + * + * PH -> A -> B (xmm -> xmm) -> C -> D -> EXIT + * ^ | + * +----------------------------------+ + * + * The iteration order is as follows: + * Naive: PH A B C D A' B' C' D' + * Optimized: PH A B C A' B' C' D + * + * Note that we avoid processing D twice, because we can entirely process + * the predecessors before getting to D. We call a block that is ready + * for its second round of processing `done` (isBlockDone). Once we finish + * processing some block, we update the counters in MBBInfos and re-process + * any successors that are now done. + */ + MachineBasicBlock *Entry = &*MF->begin(); ReversePostOrderTraversal RPOT(Entry); - SmallVector Loops; for (ReversePostOrderTraversal::rpo_iterator MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { MachineBasicBlock *MBB = *MBBI; - enterBasicBlock(MBB); - if (SeenUnknownBackEdge) - Loops.push_back(MBB); - for (MachineInstr &MI : *MBB) - visitInstr(&MI); - processUndefReads(MBB); - leaveBasicBlock(MBB); - } - - // Visit all the loop blocks again in order to merge DomainValues from - // back-edges. - for (MachineBasicBlock *MBB : Loops) { - enterBasicBlock(MBB); - for (MachineInstr &MI : *MBB) - if (!MI.isDebugValue()) - processDefs(&MI, false); - processUndefReads(MBB); - leaveBasicBlock(MBB); + // N.B: IncomingProcessed and IncomingCompleted were already updated while + // processing this block's predecessors. + MBBInfos[MBB].PrimaryCompleted = true; + MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed; + processBasicBlock(MBB, true); + updateSuccessors(MBB, true); + } + + // We need to go through again and finalize any blocks that are not done yet. + // This is possible if blocks have dead predecessors, so we didn't visit them + // above. + for (ReversePostOrderTraversal::rpo_iterator + MBBI = RPOT.begin(), + MBBE = RPOT.end(); + MBBI != MBBE; ++MBBI) { + MachineBasicBlock *MBB = *MBBI; + if (!isBlockDone(MBB)) { + processBasicBlock(MBB, false); + // Don't update successors here. We'll get to them anyway through this + // loop. + } } // Clear the LiveOuts vectors and collapse any remaining DomainValues. for (ReversePostOrderTraversal::rpo_iterator MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) { - LiveOutMap::const_iterator FI = LiveOuts.find(*MBBI); - if (FI == LiveOuts.end() || !FI->second) + auto FI = MBBInfos.find(*MBBI); + if (FI == MBBInfos.end() || !FI->second.OutRegs) continue; for (unsigned i = 0, e = NumRegs; i != e; ++i) - if (FI->second[i].Value) - release(FI->second[i].Value); - delete[] FI->second; + if (FI->second.OutRegs[i].Value) + release(FI->second.OutRegs[i].Value); + delete[] FI->second.OutRegs; } - LiveOuts.clear(); + MBBInfos.clear(); UndefReads.clear(); Avail.clear(); Allocator.DestroyAll(); Index: test/CodeGen/X86/break-false-dep.ll =================================================================== --- test/CodeGen/X86/break-false-dep.ll +++ test/CodeGen/X86/break-false-dep.ll @@ -277,3 +277,60 @@ ;AVX: vcvtsi2sdq {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}} ;AVX-NOT: [[XMM4_7]] } + +; Make sure we are making a smart choice regarding undef registers even for more +; complicated loop structures. This example is the inner loop from +; julia> a = falses(10000); a[1:4:end] = true +; julia> linspace(1.0,2.0,10000)[a] +define void @loopclearance2(double* nocapture %y, i64* %x, double %c1, double %c2, double %c3, double %c4, i64 %size) { +entry: + tail call void asm sideeffect "", "~{xmm7},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"() + tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"() + br label %loop + +loop: + %phi_i = phi i64 [ 1, %entry ], [ %nexti, %loop_end ] + %phi_j = phi i64 [ 1, %entry ], [ %nextj, %loop_end ] + %phi_k = phi i64 [ 0, %entry ], [ %nextk, %loop_end ] + br label %inner_loop + +inner_loop: + %phi = phi i64 [ %phi_k, %loop ], [ %nextk, %inner_loop ] + %idx = lshr i64 %phi, 6 + %inputptr = getelementptr i64, i64* %x, i64 %idx + %input = load i64, i64* %inputptr, align 8 + %masked = and i64 %phi, 63 + %shiftedmasked = shl i64 1, %masked + %maskedinput = and i64 %input, %shiftedmasked + %cmp = icmp eq i64 %maskedinput, 0 + %nextk = add i64 %phi, 1 + br i1 %cmp, label %inner_loop, label %loop_end + +loop_end: + %nexti = add i64 %phi_i, 1 + %nextj = add i64 %phi_j, 1 + ; Register use, plus us clobbering 7-15 above, basically forces xmm7 here as + ; the only reasonable choice. The primary thing we care about is that it's + ; not one of the registers used in the loop (e.g. not the output reg here) +;AVX-NOT: %xmm6 +;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}} +;AVX-NOT: %xmm6 + %nexti_f = sitofp i64 %nexti to double + %sub = fsub double %c1, %nexti_f + %mul = fmul double %sub, %c2 +;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}} +;AVX-NOT: %xmm6 + %phi_f = sitofp i64 %phi to double + %mul2 = fmul double %phi_f, %c3 + %add2 = fadd double %mul, %mul2 + %div = fdiv double %add2, %c4 + %prev_j = add i64 %phi_j, -1 + %outptr = getelementptr double, double* %y, i64 %prev_j + store double %div, double* %outptr, align 8 + %done = icmp slt i64 %size, %nexti + br i1 %done, label %loopdone, label %loop + +loopdone: + ret void +}