diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
--- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -372,19 +372,19 @@
 
     unsigned ShiftedBank = Bank;
     if (Bank != -1 && R == Reg && Op.getSubReg()) {
-      unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger();
-      if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) {
+      unsigned Offset = TRI->getChannelFromSubReg(Op.getSubReg());
+      LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
+      if (Offset && Bank < NUM_VGPR_BANKS) {
         // If a register spans all banks we cannot shift it to avoid conflict.
-        if (countPopulation(LM) >= NUM_VGPR_BANKS)
+        if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
           continue;
-        ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS;
-      } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) {
+        ShiftedBank = (Bank + Offset) % NUM_VGPR_BANKS;
+      } else if (Offset > 1 && Bank >= SGPR_BANK_OFFSET) {
         // If a register spans all banks we cannot shift it to avoid conflict.
-        if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS)
+        if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
           continue;
-        ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET +
-                                          (countTrailingZeros(LM) >> 1)) %
-                                             NUM_SGPR_BANKS;
+        ShiftedBank = SGPR_BANK_OFFSET +
+                      (Bank - SGPR_BANK_OFFSET + (Offset >> 1)) % NUM_SGPR_BANKS;
       }
     }
@@ -496,16 +496,16 @@
 
   unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
 
-  unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger();
-  if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) {
-    unsigned Shift = countTrailingZeros(LM);
+  unsigned Offset = TRI->getChannelFromSubReg(SubReg);
+  if (Offset && (Mask & VGPR_BANK_MASK)) {
+    unsigned Shift = Offset;
     if (Shift >= NUM_VGPR_BANKS)
       return 0;
     unsigned VB = FreeBanks & VGPR_BANK_MASK;
     FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
                   VGPR_BANK_MASK;
-  } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) {
-    unsigned Shift = countTrailingZeros(LM) >> 1;
+  } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) {
+    unsigned Shift = Offset >> 1;
     if (Shift >= NUM_SGPR_BANKS)
       return 0;
     unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -129,7 +129,7 @@
   assert(PrevMask < NewMask);
 
   Value[Kind == SGPR_TUPLE ? SGPR32 : Kind == AGPR_TUPLE ? AGPR32 : VGPR32] +=
-    Sign * (~PrevMask & NewMask).getNumLanes();
+    Sign * SIRegisterInfo::getNumCoveredRegs(~PrevMask & NewMask);
 
   if (PrevMask.none()) {
     assert(NewMask.any());
@@ -221,7 +221,7 @@
     return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
 
   auto MaxMask = MRI.getMaxLaneMaskForVReg(MO.getReg());
-  if (MaxMask == LaneBitmask::getLane(0)) // cannot have subregs
+  if (SIRegisterInfo::getNumCoveredRegs(MaxMask) == 1) // cannot have subregs
     return MaxMask;
 
   // For a tentative schedule LIS isn't updated yet but livemask should remain
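
Note on the GCNRegBankReassign arithmetic above: the pass models the GFX10 bank layout, assumed here (from the pass's own enums) to be four VGPR banks plus eight SGPR banks that are each 64 bits wide. A sub-register whose data starts Offset dwords into a tuple therefore reads a bank rotated by Offset for VGPRs, or by Offset/2 for SGPRs. A minimal standalone sketch of that rotation, with the bank counts assumed and the helper names invented for this example:

#include <cassert>

// Bank geometry assumed from the pass's enums: 4 VGPR banks, 8 SGPR banks,
// one SGPR bank spanning two consecutive 32-bit registers.
constexpr unsigned NUM_VGPR_BANKS = 4;
constexpr unsigned NUM_SGPR_BANKS = 8;
constexpr unsigned SGPR_BANK_OFFSET = NUM_VGPR_BANKS;

// Mirrors the ShiftedBank expressions in the patch: a sub-register starting
// ChannelOffset dwords into a tuple reads a bank rotated by that many slots.
unsigned shiftedVGPRBank(unsigned Bank, unsigned ChannelOffset) {
  return (Bank + ChannelOffset) % NUM_VGPR_BANKS;
}

// SGPR banks are 64 bits wide, so two dwords advance the bank by one.
unsigned shiftedSGPRBank(unsigned Bank, unsigned ChannelOffset) {
  return SGPR_BANK_OFFSET +
         (Bank - SGPR_BANK_OFFSET + (ChannelOffset >> 1)) % NUM_SGPR_BANKS;
}

int main() {
  // A tuple whose first dword sits in VGPR bank 2: its channel-3
  // sub-register actually reads bank (2 + 3) % 4 == 1.
  assert(shiftedVGPRBank(2, 3) == 1);
  // SGPR bank 5 (one past the VGPR banks): a sub-register two dwords in
  // lands one 64-bit bank later, i.e. bank 6.
  assert(shiftedSGPRBank(5, 2) == 6);
  return 0;
}

The (Offset >> 1) in the patch is the same halving: two consecutive 32-bit SGPRs share a bank, so only even dword offsets change the SGPR bank.
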
diff --git a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
--- a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -111,10 +111,6 @@
       unsigned ActiveLanes =
           TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
 
-      // Subreg indices are counted from 1
-      // When D16 then we want next whole VGPR after write data.
-      static_assert(AMDGPU::sub0 == 1 && AMDGPU::sub4 == 5, "Subreg indices different from expected");
-
       bool Packed = !ST.hasUnpackedD16VMem();
 
       unsigned InitIdx =
@@ -137,7 +133,7 @@
       // all the result registers to 0, otherwise just the error indication
       // register (VGPRn+1)
       unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1;
-      unsigned CurrIdx = ST.usePRTStrictNull() ? 1 : InitIdx;
+      unsigned CurrIdx = ST.usePRTStrictNull() ? 0 : (InitIdx - 1);
 
       if (DstSize == 1) {
         // In this case we can just initialize the result directly
@@ -158,7 +154,7 @@
           BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
               .addReg(PrevDst)
              .addReg(SubReg)
-              .addImm(CurrIdx);
+              .addImm(AMDGPURegisterInfo::getSubRegFromChannel(CurrIdx));
 
           PrevDst = NewDst;
         }
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -3303,7 +3303,7 @@
   if (Offset >= NumElts || Offset < 0)
     return std::make_pair(AMDGPU::sub0, Offset);
 
-  return std::make_pair(AMDGPU::sub0 + Offset, 0);
+  return std::make_pair(AMDGPURegisterInfo::getSubRegFromChannel(Offset), 0);
 }
 
 // Return true if the index is an SGPR and was set.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -831,7 +831,7 @@
                MI.getParent()->getParent()->getRegInfo().
                  getRegClass(MO.getReg()), SubReg)) >= 32 &&
              "Sub-dword subregs are not supported");
-      return RI.getSubRegIndexLaneMask(SubReg).getNumLanes() * 4;
+      return RI.getNumChannelsFromSubReg(SubReg) * 4;
     }
   }
   return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -798,7 +798,7 @@
     int64_t IdxValue = Idx == 0 ? Value : 0;
 
     MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
-      get(Opcode), RI.getSubReg(DestReg, Idx));
+      get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
     Builder.addImm(IdxValue);
   }
 }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -289,6 +289,21 @@
   const uint32_t *getAllVGPRRegMask() const;
   const uint32_t *getAllAllocatableSRegMask() const;
 
+  // \returns number of 32 bit registers covered by a \p LM
+  static unsigned getNumCoveredRegs(LaneBitmask LM) {
+    return LM.getNumLanes();
+  }
+
+  // \returns a DWORD offset of a \p SubReg
+  unsigned getChannelFromSubReg(unsigned SubReg) const {
+    return SubReg ? alignTo(getSubRegIdxOffset(SubReg), 32) / 32 : 0;
+  }
+
+  // \returns a DWORD size of a \p SubReg
+  unsigned getNumChannelsFromSubReg(unsigned SubReg) const {
+    return getNumCoveredRegs(getSubRegIndexLaneMask(SubReg));
+  }
+
 private:
   void buildSpillLoadStore(MachineBasicBlock::iterator MI,
                            unsigned LoadStoreOp,
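
Note on the new SIRegisterInfo helpers above: their value is that the lane-mask encoding stops being load-bearing at call sites. Today every lane is one 32-bit register, so the countPopulation/countTrailingZeros idioms being deleted agree with the new methods; once 16-bit sub-registers make lanes finer than a dword, only the helpers stay valid. A standalone sketch of the current equivalence, using plain uint64_t in place of llvm::LaneBitmask and names local to this example:

#include <cassert>
#include <cstdint>

// Today's invariant: one lane per 32-bit register. Under it, the number of
// covered registers is the popcount of the lane mask, and the starting
// channel is the index of the lowest set lane, exactly the idioms the patch
// replaces with named helpers.
unsigned getNumCoveredRegs(uint64_t LaneMask) {
  return __builtin_popcountll(LaneMask);
}

unsigned getChannelFromLaneMask(uint64_t LaneMask) {
  return LaneMask ? __builtin_ctzll(LaneMask) : 0;
}

int main() {
  const uint64_t Sub2Sub3 = 0xC;                 // lanes 2 and 3 of a tuple
  assert(getNumCoveredRegs(Sub2Sub3) == 2);      // covers two dwords
  assert(getChannelFromLaneMask(Sub2Sub3) == 2); // starts at channel 2
  return 0;
}

With callers funneled through the helpers, only the three inline methods above need to change when the lane encoding does.
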
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1391,7 +1391,7 @@
     return RC;
 
   // We can assume that each lane corresponds to one 32-bit register.
-  unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes();
+  unsigned Count = getNumChannelsFromSubReg(SubIdx);
   if (isSGPRClass(RC)) {
     switch (Count) {
     case 1:
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -426,8 +426,7 @@
   if (Register::isPhysicalRegister(Reg)) {
     Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
   } else {
-    LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
-    Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger()));
+    Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub));
   }
   return TargetInstrInfo::RegSubRegPair(Reg, Sub);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -960,7 +960,7 @@
 
 bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
   const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
-  const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
+  const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
   return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
     Reg == AMDGPU::SCC;
 }
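
The SIShrinkInstructions and AMDGPUBaseInfo changes close out the same theme: a sub-register index should never be produced by integer arithmetic (AMDGPU::sub0 + Offset, or the literal 1 standing in for sub0), since nothing guarantees the generated enum is contiguous or starts at 1. Advancing a sub-register by I dwords becomes a round trip through channel space instead, mirroring the new line Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub)). A toy sketch with deliberately scrambled enum values, all names local to this example:

#include <cassert>

// A deliberately non-contiguous sub-register "enum": the channel round trip
// still advances a sub-register by a dword count, which sub0 + Offset style
// arithmetic cannot guarantee.
enum SubRegIdx { NoSubRegister = 0, sub0 = 7, sub1 = 3, sub2 = 9, sub3 = 5 };

SubRegIdx getSubRegFromChannel(unsigned Channel) {
  static const SubRegIdx Table[] = {sub0, sub1, sub2, sub3};
  return Table[Channel];
}

unsigned getChannelFromSubReg(SubRegIdx S) {
  switch (S) {
  case sub0: return 0;
  case sub1: return 1;
  case sub2: return 2;
  case sub3: return 3;
  default:   return 0;
  }
}

int main() {
  // One dword past sub1 is sub2 regardless of the enum's numeric values.
  assert(getSubRegFromChannel(getChannelFromSubReg(sub1) + 1) == sub2);
  return 0;
}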