diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -318,38 +318,63 @@
   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
-  if (!UseLRQ.valueIn())
+  const VNInfo *Value = UseLRQ.valueIn();
+  if (!Value)
     return;
 
   // Note: this code assumes that lane masks on AMDGPU completely
   // cover registers.
+  const LaneBitmask UseLanes =
+      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
+             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
+                                : LaneBitmask::getNone());
+
+  // Perform a depth-first iteration of the LiveRange graph marking defs.
+  // Stop processing of a given branch when all use lanes have been defined.
+  // The first definition stops processing for a physical register.
+  struct PhiEntry {
+    const VNInfo *Phi;
+    unsigned PredIdx;
+    unsigned VisitIdx;
+    LaneBitmask DefinedLanes;
+
+    PhiEntry(const VNInfo *Phi, unsigned PredIdx, unsigned VisitIdx,
+             LaneBitmask DefinedLanes)
+        : Phi(Phi), PredIdx(PredIdx), VisitIdx(VisitIdx),
+          DefinedLanes(DefinedLanes) {}
+  };
+  SmallSetVector<const VNInfo *, 4> Visited;
+  SmallVector<PhiEntry, 2> PhiStack;
   LaneBitmask DefinedLanes;
-  LaneBitmask UseLanes;
-  if (SubReg) {
-    UseLanes = TRI->getSubRegIndexLaneMask(SubReg);
-  } else if (Reg.isVirtual()) {
-    UseLanes = MRI->getMaxLaneMaskForVReg(Reg);
-  }
-
-  SmallPtrSet<const VNInfo *, 4> Visited;
-  SmallVector<const VNInfo *, 4> ToProcess;
-  ToProcess.push_back(UseLRQ.valueIn());
+  unsigned NextPredIdx; // Only used for processing phi nodes
 
   do {
-    const VNInfo *Value = ToProcess.pop_back_val();
-    Visited.insert(Value);
+    const VNInfo *NextValue = nullptr;
+
+    if (!Visited.count(Value)) {
+      Visited.insert(Value);
+      // On first visit to a phi, start processing at the first predecessor
+      NextPredIdx = 0;
+    }
 
     if (Value->isPHIDef()) {
-      // Need to mark all defs used in the PHI node
+      // Each predecessor node in the phi must be processed as a subgraph
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");
-      for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
-                                                  PE = MBB->pred_end();
-           PI != PE; ++PI) {
+
+      // Find the next predecessor to process
+      unsigned Idx = NextPredIdx;
+      auto PI = MBB->pred_begin() + Idx;
+      auto PE = MBB->pred_end();
+      for (; PI != PE && !NextValue; ++PI, ++Idx) {
         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
           if (!Visited.count(VN))
-            ToProcess.push_back(VN);
+            NextValue = VN;
         }
       }
+
+      // If there are more predecessors to process, add the phi to the stack
+      if (PI != PE)
+        PhiStack.emplace_back(Value, Idx, Visited.size(), DefinedLanes);
     } else {
       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
       assert(MI && "Def has no defining instruction");
@@ -370,17 +395,20 @@
           // Record if this instruction defined any of use
           HasDef |= Overlap.any();
 
-          // Check if all lanes of use have been defined
+          // Mark any lanes defined
           DefinedLanes |= OpLanes;
-          if ((DefinedLanes & UseLanes) != UseLanes) {
-            // Definition not complete; need to process input value
-            LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
-            if (const VNInfo *VN = LRQ.valueIn()) {
-              if (!Visited.count(VN))
-                ToProcess.push_back(VN);
-            }
+        }
+
+        // Check if all lanes of use have been defined
+        if ((DefinedLanes & UseLanes) != UseLanes) {
+          // Definition not complete; need to process input value
+          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
+          if (const VNInfo *VN = LRQ.valueIn()) {
+            if (!Visited.count(VN))
+              NextValue = VN;
           }
         }
+
         // Only mark the instruction if it defines some part of the use
         if (HasDef)
           markInstruction(*MI, Flag, Worklist);
@@ -389,9 +417,21 @@
         markInstruction(*MI, Flag, Worklist);
       }
     }
-  } while (!ToProcess.empty());
+
+    if (!NextValue && !PhiStack.empty()) {
+      // Reached end of chain; revert to processing the last phi
+      PhiEntry &Entry = PhiStack.back();
+      NextValue = Entry.Phi;
+      NextPredIdx = Entry.PredIdx;
+      DefinedLanes = Entry.DefinedLanes;
+      // Rewind visited set to correct state
+      while (Visited.size() > Entry.VisitIdx)
+        Visited.pop_back();
+      PhiStack.pop_back();
+    }
 
-  assert(!Reg.isVirtual() || ((DefinedLanes & UseLanes) == UseLanes));
+    Value = NextValue;
+  } while (Value);
 }
 
 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -859,6 +859,10 @@
 ; CHECK-NEXT: ; %entry
 ; CHECK-NEXT: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
 ; CHECK: s_wqm_b64 exec, exec
+; CHECK: v_mov
+; CHECK: v_mov
+; CHECK: v_mov
+; CHECK: v_mov
 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 ; CHECK: image_store
 ; CHECK: s_wqm_b64 exec, exec
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -259,3 +259,42 @@
     $vgpr1 = STRICT_WWM %3.sub1:vreg_64, implicit $exec
     SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
 ...
+
+---
+# Check that WQM marking occurs correctly through phi nodes in the live range graph.
+# If not, the initial V_MOV will not be in WQM.
+#
+#CHECK-LABEL: name: test_wqm_lr_phi
+#CHECK: COPY $exec
+#CHECK-NEXT: S_WQM
+#CHECK-NEXT: V_MOV_B32_e32 -10
+#CHECK-NEXT: V_MOV_B32_e32 0
+name: test_wqm_lr_phi
+tracksRegLiveness: true
+body: |
+  bb.0:
+    undef %0.sub0:vreg_64 = V_MOV_B32_e32 -10, implicit $exec
+    %0.sub1:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+    %1:sreg_64 = S_GETPC_B64
+    %2:sgpr_256 = S_LOAD_DWORDX8_IMM %1:sreg_64, 32, 0, 0
+
+  bb.1:
+    $vcc = V_CMP_LT_U32_e64 4, 4, implicit $exec
+    S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+    S_BRANCH %bb.2
+
+  bb.2:
+    %0.sub0:vreg_64 = V_ADD_U32_e32 1, %0.sub1, implicit $exec
+    S_BRANCH %bb.3
+
+  bb.3:
+    %0.sub1:vreg_64 = V_ADD_U32_e32 1, %0.sub1, implicit $exec
+    S_BRANCH %bb.4
+
+  bb.4:
+    %3:sgpr_128 = IMPLICIT_DEF
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %2:sgpr_256, %3:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource")
+    $vgpr0 = COPY %4.sub0:vreg_128
+    $vgpr1 = COPY %4.sub1:vreg_128
+    SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
+...
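
Note for reviewers (illustration only; not part of the patch): markDefs now walks the LiveRange value graph depth-first instead of draining an unordered worklist, so that lanes accumulated along one phi predecessor are not wrongly credited to another. Each PhiEntry snapshots the next predecessor index, the visited-set depth, and the lanes defined so far; when a def chain ends, the walk backtracks to the most recent phi, rewinds that state, and descends into the next predecessor. The standalone C++ sketch below mirrors that pattern on a toy value graph; the Node type, integer node ids, and uint32_t lane masks are invented for the example, and nothing here uses LLVM.

// toy_markdefs.cpp - compile with: c++ -std=c++14 toy_markdefs.cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Toy stand-in for a LiveRange value: either a phi joining several
// incoming values, or a def that provides some lanes and may chain to
// an earlier input value (a partial subregister def).
struct Node {
  bool IsPhi = false;
  uint32_t Lanes = 0;     // lanes this (non-phi) def provides
  std::vector<int> Preds; // incoming values (phi only)
  int Input = -1;         // earlier input value (non-phi only), -1 if none
};

// Mark every def reachable from Start until UseLanes is fully covered,
// using the same explicit-stack DFS shape as the patched markDefs.
void markDefs(const std::vector<Node> &Graph, int Start, uint32_t UseLanes) {
  struct PhiEntry {
    int Phi;
    size_t PredIdx;        // next predecessor to try on backtrack
    size_t VisitIdx;       // visited-stack depth to rewind to
    uint32_t DefinedLanes; // lanes accumulated before entering the phi
  };
  std::vector<int> Visited; // insertion-ordered, like SmallSetVector
  std::vector<PhiEntry> PhiStack;
  uint32_t DefinedLanes = 0;
  size_t NextPredIdx = 0;

  auto Seen = [&Visited](int N) {
    for (int V : Visited)
      if (V == N)
        return true;
    return false;
  };

  int Value = Start;
  do {
    int NextValue = -1;
    if (!Seen(Value)) {
      Visited.push_back(Value);
      NextPredIdx = 0; // first visit to a phi starts at predecessor 0
    }

    const Node &N = Graph[Value];
    if (N.IsPhi) {
      // Descend into the next unvisited predecessor subgraph.
      size_t Idx = NextPredIdx;
      for (; Idx < N.Preds.size() && NextValue < 0; ++Idx)
        if (!Seen(N.Preds[Idx]))
          NextValue = N.Preds[Idx];
      // More predecessors remain: remember where to resume.
      if (Idx < N.Preds.size())
        PhiStack.push_back({Value, Idx, Visited.size(), DefinedLanes});
    } else {
      std::cout << "mark def %" << Value << "\n";
      DefinedLanes |= N.Lanes;
      // Use not fully covered yet: follow the chained input value.
      if ((DefinedLanes & UseLanes) != UseLanes && N.Input >= 0 &&
          !Seen(N.Input))
        NextValue = N.Input;
    }

    if (NextValue < 0 && !PhiStack.empty()) {
      // End of a chain: backtrack to the last phi and rewind state.
      const PhiEntry E = PhiStack.back();
      PhiStack.pop_back();
      NextValue = E.Phi;
      NextPredIdx = E.PredIdx;
      DefinedLanes = E.DefinedLanes;
      Visited.resize(E.VisitIdx); // drop everything visited past the phi
    }
    Value = NextValue;
  } while (Value >= 0);
}

int main() {
  // %2 = phi(%0, %1): each incoming def covers only half the use lanes,
  // so both must be found and marked through the phi.
  std::vector<Node> G(3);
  G[0] = {false, 0b01, {}, -1};
  G[1] = {false, 0b10, {}, -1};
  G[2] = {true, 0, {0, 1}, -1};
  markDefs(G, 2, 0b11); // prints: mark def %0, then mark def %1
}

Resetting DefinedLanes on each backtrack matters because lanes defined along one incoming path of a phi say nothing about the other paths. Keeping Visited insertion-ordered is what makes the rewind a simple truncation, which is presumably why the patch swaps SmallPtrSet for SmallSetVector.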