Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -880,24 +880,14 @@
   // Start with an assumption that there is no need to emit.
   unsigned int EmitWaitcnt = 0;
 
-  // No need to wait before phi. If a phi-move exists, then the wait should
-  // has been inserted before the move. If a phi-move does not exist, then
-  // wait should be inserted before the real use. The same is true for
-  // sc-merge. It is not a coincident that all these cases correspond to the
-  // instructions that are skipped in the assembling loop.
-  bool NeedLineMapping = false; // TODO: Check on this.
-
   // ForceEmitZeroWaitcnt: force a single s_waitcnt 0 due to hw bug
   bool ForceEmitZeroWaitcnt = false;
 
   setForceEmitWaitcnt();
   bool IsForceEmitWaitcnt = isForceEmitWaitcnt();
 
-  if (MI.isDebugInstr() &&
-      // TODO: any other opcode?
-      !NeedLineMapping) {
+  if (MI.isDebugInstr())
     return;
-  }
 
   // See if an s_waitcnt is forced at block entry, or is needed at
   // program end.
@@ -1141,7 +1131,6 @@
 
   if (EmitWaitcnt || IsForceEmitWaitcnt) {
     int CntVal[NUM_INST_CNTS];
-    bool UseDefaultWaitcntStrategy = true;
     if (ForceEmitZeroWaitcnt || ForceEmitZeroWaitcnts) {
       // Force all waitcnts to 0.
       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
@@ -1151,10 +1140,7 @@
       CntVal[VM_CNT] = 0;
       CntVal[EXP_CNT] = 0;
       CntVal[LGKM_CNT] = 0;
-      UseDefaultWaitcntStrategy = false;
-    }
-
-    if (UseDefaultWaitcntStrategy) {
+    } else {
       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
            T = (enum InstCounterType)(T + 1)) {
         if (EmitWaitcnt & CNT_MASK(T)) {
@@ -1178,95 +1164,89 @@
       }
     }
 
-    // If we are not waiting on any counter we can skip the wait altogether.
-    if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
-      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
-      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
-      if (!OldWaitcnt ||
-          (AMDGPU::decodeVmcnt(IV, Imm) !=
-           (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
-          (AMDGPU::decodeExpcnt(IV, Imm) !=
-           (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
-          (AMDGPU::decodeLgkmcnt(IV, Imm) !=
-           (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
-        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
-        if (ContainingLoop) {
-          MachineBasicBlock *TBB = ContainingLoop->getHeader();
-          BlockWaitcntBrackets *ScoreBracket =
-              BlockWaitcntBracketsMap[TBB].get();
-          if (!ScoreBracket) {
-            assert(!BlockVisitedSet.count(TBB));
-            BlockWaitcntBracketsMap[TBB] =
-                llvm::make_unique<BlockWaitcntBrackets>(ST);
-            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
-          }
-          ScoreBracket->setRevisitLoop(true);
-          LLVM_DEBUG(dbgs()
-                         << "set-revisit2: Block"
-                         << ContainingLoop->getHeader()->getNumber() << '\n';);
+    MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
+    int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
+    if (!OldWaitcnt ||
+        (AMDGPU::decodeVmcnt(IV, Imm) !=
+         (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
+        (AMDGPU::decodeExpcnt(IV, Imm) !=
+         (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
+        (AMDGPU::decodeLgkmcnt(IV, Imm) !=
+         (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
+      MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
+      if (ContainingLoop) {
+        MachineBasicBlock *TBB = ContainingLoop->getHeader();
+        BlockWaitcntBrackets *ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
+        if (!ScoreBracket) {
+          assert(!BlockVisitedSet.count(TBB));
+          BlockWaitcntBracketsMap[TBB] =
+              llvm::make_unique<BlockWaitcntBrackets>(ST);
+          ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
         }
+        ScoreBracket->setRevisitLoop(true);
+        LLVM_DEBUG(dbgs() << "set-revisit2: Block"
+                          << ContainingLoop->getHeader()->getNumber() << '\n';);
       }
+    }
 
-      // Update an existing waitcount, or make a new one.
-      unsigned Enc = AMDGPU::encodeWaitcnt(IV,
+    // Update an existing waitcount, or make a new one.
+    unsigned Enc = AMDGPU::encodeWaitcnt(IV,
                                      ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
                                      ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
                                      ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
-      // We don't remove waitcnts that existed prior to the waitcnt
-      // pass. Check if the waitcnt to-be-inserted can be avoided
-      // or if the prev waitcnt can be updated.
-      bool insertSWaitInst = true;
-      for (MachineBasicBlock::iterator I = MI.getIterator(),
-                                       B = MI.getParent()->begin();
-           insertSWaitInst && I != B; --I) {
-        if (I == MI.getIterator())
-          continue;
+    // We don't remove waitcnts that existed prior to the waitcnt
+    // pass. Check if the waitcnt to-be-inserted can be avoided
+    // or if the prev waitcnt can be updated.
+    bool insertSWaitInst = true;
+    for (MachineBasicBlock::iterator I = MI.getIterator(),
+                                     B = MI.getParent()->begin();
+         insertSWaitInst && I != B; --I) {
+      if (I == MI.getIterator())
+        continue;
 
-        switch (I->getOpcode()) {
-        case AMDGPU::S_WAITCNT:
-          if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
-            insertSWaitInst = false;
-          else if (!OldWaitcnt) {
-            OldWaitcnt = &*I;
-            Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
-          }
-          break;
-          // TODO: skip over instructions which never require wait.
+      switch (I->getOpcode()) {
+      case AMDGPU::S_WAITCNT:
+        if (isWaitcntStronger(I->getOperand(0).getImm(), Enc))
+          insertSWaitInst = false;
+        else if (!OldWaitcnt) {
+          OldWaitcnt = &*I;
+          Enc = combineWaitcnt(I->getOperand(0).getImm(), Enc);
         }
         break;
+      // TODO: skip over instructions which never require wait.
       }
-      if (insertSWaitInst) {
-        if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
-          if (ForceEmitZeroWaitcnts)
-            LLVM_DEBUG(
-                dbgs()
-                << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
-          if (IsForceEmitWaitcnt)
-            LLVM_DEBUG(dbgs()
-                       << "Force emit a s_waitcnt due to debug counter\n");
-
-          OldWaitcnt->getOperand(0).setImm(Enc);
-          if (!OldWaitcnt->getParent())
-            MI.getParent()->insert(MI, OldWaitcnt);
-
-          LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
-                            << "Old Instr: " << MI << '\n'
-                            << "New Instr: " << *OldWaitcnt << '\n');
-        } else {
-          auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
-                                   MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+      break;
+    }
+    if (insertSWaitInst) {
+      if (OldWaitcnt) {
+        assert(OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT);
+        if (ForceEmitZeroWaitcnts)
+          LLVM_DEBUG(dbgs()
+                     << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
+        if (IsForceEmitWaitcnt)
+          LLVM_DEBUG(dbgs() << "Force emit a s_waitcnt due to debug counter\n");
+
+        OldWaitcnt->getOperand(0).setImm(Enc);
+        if (!OldWaitcnt->getParent())
+          MI.getParent()->insert(MI, OldWaitcnt);
+
+        LLVM_DEBUG(dbgs() << "updateWaitcntInBlock\n"
                          << "Old Instr: " << MI << '\n'
                          << "New Instr: " << *OldWaitcnt << '\n');
+      } else {
+        auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
                                  MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
                              .addImm(Enc);
-          TrackedWaitcntSet.insert(SWaitInst);
+        TrackedWaitcntSet.insert(SWaitInst);
 
-          LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
-                            << "Old Instr: " << MI << '\n'
-                            << "New Instr: " << *SWaitInst << '\n');
-        }
+        LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
                          << "Old Instr: " << MI << '\n'
                          << "New Instr: " << *SWaitInst << '\n');
       }
+    }
 
-      if (CntVal[EXP_CNT] == 0) {
-        ScoreBrackets->setMixedExpTypes(false);
-      }
+    if (CntVal[EXP_CNT] == 0) {
+      ScoreBrackets->setMixedExpTypes(false);
     }
   }
 }
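The refactor above boils down to two moves: the UseDefaultWaitcntStrategy control-flow flag is replaced by a plain if/else, and a re-check of the emission condition inside an already-guarded block is dropped. The standalone sketch below mirrors that shape outside of the pass; it is illustrative only, and every name in it (computeCounts, Counter, forceZero, needWaitMask, the placeholder value 3) is hypothetical rather than taken from SIInsertWaitcnts.cpp.

// Minimal sketch, assuming a three-counter model like vmcnt/expcnt/lgkmcnt.
#include <array>
#include <cstdio>

enum Counter { VM_CNT, EXP_CNT, LGKM_CNT, NUM_INST_CNTS };

static std::array<int, NUM_INST_CNTS> computeCounts(bool forceZero,
                                                    unsigned needWaitMask) {
  std::array<int, NUM_INST_CNTS> Counts{};
  if (forceZero) {
    // Forced path: every counter waits to zero, no flag variable needed.
    for (int T = VM_CNT; T < NUM_INST_CNTS; ++T)
      Counts[T] = 0;
  } else {
    // Default path: only counters named in the mask get a concrete value;
    // -1 is a sentinel meaning "no wait required" (3 is just a placeholder).
    for (int T = VM_CNT; T < NUM_INST_CNTS; ++T)
      Counts[T] = (needWaitMask & (1u << T)) ? 3 : -1;
  }
  return Counts;
}

int main() {
  auto Forced = computeCounts(/*forceZero=*/true, /*needWaitMask=*/0);
  auto Normal = computeCounts(/*forceZero=*/false, /*needWaitMask=*/1u << VM_CNT);
  std::printf("forced vmcnt=%d, default vmcnt=%d lgkmcnt=%d\n",
              Forced[VM_CNT], Normal[VM_CNT], Normal[LGKM_CNT]);
  return 0;
}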