diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp --- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -165,6 +165,8 @@ std::vector &Worklist); void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg, unsigned SubReg, char Flag, std::vector &Worklist); + void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag, + std::vector &Worklist); void markInstructionUses(const MachineInstr &MI, char Flag, std::vector &Worklist); char scanInstructions(MachineFunction &MF, std::vector &Worklist); @@ -272,8 +274,6 @@ assert(!(Flag & StateExact) && Flag != 0); - LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI); - // Remove any disabled states from the flag. The user that required it gets // an undefined value in the helper lanes. For example, this can happen if // the result of an atomic is used by instruction that requires WQM, where @@ -285,6 +285,7 @@ if ((II.Needs & Flag) == Flag) return; + LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI); II.Needs |= Flag; Worklist.push_back(&MI); } @@ -299,6 +300,16 @@ if (!UseLRQ.valueIn()) return; + // Note: this code assumes that lane masks on AMDGPU completely + // cover registers. + LaneBitmask DefinedLanes; + LaneBitmask UseLanes; + if (SubReg) { + UseLanes = TRI->getSubRegIndexLaneMask(SubReg); + } else if (Reg.isVirtual()) { + UseLanes = MRI->getMaxLaneMaskForVReg(Reg); + } + SmallPtrSet Visited; SmallVector ToProcess; ToProcess.push_back(UseLRQ.valueIn()); @@ -321,64 +332,93 @@ } else { MachineInstr *MI = LIS->getInstructionFromIndex(Value->def); assert(MI && "Def has no defining instruction"); - markInstruction(*MI, Flag, Worklist); - // Iterate over all operands to find relevant definitions - for (const MachineOperand &Op : MI->operands()) { - if (!(Op.isReg() && Op.getReg() == Reg)) - continue; - - // Does this def cover whole register? 
- bool DefinesFullReg = - Op.isUndef() || !Op.getSubReg() || Op.getSubReg() == SubReg; - if (!DefinesFullReg) { - // Partial definition; need to follow and mark input value - LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI)); - if (const VNInfo *VN = LRQ.valueIn()) { - if (!Visited.count(VN)) - ToProcess.push_back(VN); + if (Reg.isVirtual()) { + // Iterate over all operands to find relevant definitions + bool HasDef = false; + for (const MachineOperand &Op : MI->operands()) { + if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg)) + continue; + + // Compute lanes defined and overlap with the use + LaneBitmask OpLanes = + Op.isUndef() ? LaneBitmask::getAll() + : TRI->getSubRegIndexLaneMask(Op.getSubReg()); + LaneBitmask Overlap = (UseLanes & OpLanes); + + // Record if this instruction defines any part of the use + HasDef |= Overlap.any(); + + // Check if all lanes of the use have been defined + DefinedLanes |= OpLanes; + if ((DefinedLanes & UseLanes) != UseLanes) { + // Definition not complete; need to process input value + LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI)); + if (const VNInfo *VN = LRQ.valueIn()) { + if (!Visited.count(VN)) + ToProcess.push_back(VN); + } } } + // Only mark the instruction if it defines some part of the use + if (HasDef) + markInstruction(*MI, Flag, Worklist); + } else { + // For physical registers simply mark the defining instruction + markInstruction(*MI, Flag, Worklist); } } } while (!ToProcess.empty()); -} -/// Mark all instructions defining the uses in \p MI with \p Flag. 
-void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, - std::vector &Worklist) { - - LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": " - << MI); + assert(!Reg.isVirtual() || ((DefinedLanes & UseLanes) == UseLanes)); +} - for (const MachineOperand &Use : MI.uses()) { - if (!Use.isReg() || !Use.isUse()) - continue; +void SIWholeQuadMode::markOperand(const MachineInstr &MI, + const MachineOperand &Op, char Flag, + std::vector &Worklist) { + assert(Op.isReg()); + Register Reg = Op.getReg(); - Register Reg = Use.getReg(); + // Ignore some hardware registers + switch (Reg) { + case AMDGPU::EXEC: + case AMDGPU::EXEC_LO: + return; + default: + break; + } + LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op + << " for " << MI); + if (Reg.isVirtual()) { + LiveRange &LR = LIS->getInterval(Reg); + markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist); + } else { // Handle physical registers that we need to track; this is mostly relevant // for VCC, which can appear as the (implicit) input of a uniform branch, // e.g. when a loop counter is stored in a VGPR. - if (!Reg.isVirtual()) { - if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO) + for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid(); + ++RegUnit) { + LiveRange &LR = LIS->getRegUnit(*RegUnit); + const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); + if (!Value) continue; - for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid(); - ++RegUnit) { - LiveRange &LR = LIS->getRegUnit(*RegUnit); - const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn(); - if (!Value) - continue; + markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist); + } + } +} - markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist); - } +/// Mark all instructions defining the uses in \p MI with \p Flag. 
+void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag, + std::vector &Worklist) { + LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": " + << MI); + for (const MachineOperand &Use : MI.uses()) { + if (!Use.isReg() || !Use.isUse()) continue; - } - - LiveRange &LR = LIS->getInterval(Reg); - markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist); + markOperand(MI, Use, Flag, Worklist); } } @@ -441,11 +481,7 @@ if (Inactive.isUndef()) { LowerToCopyInstrs.push_back(&MI); } else { - Register Reg = Inactive.getReg(); - if (Reg.isVirtual()) { - for (MachineInstr &DefMI : MRI->def_instructions(Reg)) - markInstruction(DefMI, StateWWM, Worklist); - } + markOperand(MI, Inactive, StateWWM, Worklist); } } SetInactiveInstrs.push_back(&MI); diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir --- a/llvm/test/CodeGen/AMDGPU/wqm.mir +++ b/llvm/test/CodeGen/AMDGPU/wqm.mir @@ -233,3 +233,29 @@ $vgpr3 = COPY %9.sub1:vreg_128 SI_RETURN_TO_EPILOG $vgpr0, $vgpr1, $vgpr2, $vgpr3 ... 
+ +--- +# Check that unnecessary instructions do not get marked for WWM +# +#CHECK-NOT: ENTER_WWM +#CHECK: BUFFER_LOAD_DWORDX2 +#CHECK-NOT: ENTER_WWM +#CHECK: V_SET_INACTIVE_B32 +#CHECK: V_SET_INACTIVE_B32 +#CHECK: ENTER_WWM +#CHECK: V_MAX +name: test_wwm_set_inactive_propagation +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0 + %0:sgpr_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 + %1:vgpr_32 = COPY $vgpr0 + %2:vreg_64 = BUFFER_LOAD_DWORDX2_OFFEN %1:vgpr_32, %0:sgpr_128, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %2.sub0:vreg_64 = V_SET_INACTIVE_B32 %2.sub0:vreg_64, 0, implicit $exec, implicit-def $scc + %2.sub1:vreg_64 = V_SET_INACTIVE_B32 %2.sub1:vreg_64, 0, implicit $exec, implicit-def $scc + %3:vreg_64 = nnan nsz arcp contract reassoc nofpexcept V_MAX_F64_e64 0, %2:vreg_64, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec + $vgpr0 = WWM %3.sub0:vreg_64, implicit $exec + $vgpr1 = WWM %3.sub1:vreg_64, implicit $exec + SI_RETURN_TO_EPILOG $vgpr0, $vgpr1 +...