diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp --- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp +++ b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp @@ -83,9 +83,10 @@ class Candidate { public: - Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks, - unsigned weight) - : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {} + Candidate(MachineInstr *mi, unsigned reg, unsigned subreg, + unsigned freebanks, unsigned weight) + : MI(mi), Reg(reg), SubReg(subreg), FreeBanks(freebanks), + Weight(weight) {} bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; } @@ -100,6 +101,7 @@ MachineInstr *MI; unsigned Reg; + unsigned SubReg; unsigned FreeBanks; unsigned Weight; }; @@ -162,7 +164,7 @@ const MCPhysReg *CSRegs; // Returns bank for a phys reg. - unsigned getPhysRegBank(unsigned Reg) const; + unsigned getPhysRegBank(unsigned Reg, unsigned SubReg) const; // Return a bit set for each register bank used. 4 banks for VGPRs and // 8 banks for SGPRs. @@ -176,7 +178,7 @@ // a register chosen from Bank. std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI, unsigned Reg = AMDGPU::NoRegister, - int Bank = -1); + unsigned SubReg = 0, int Bank = -1); // Return true if register is regular VGPR or SGPR or their tuples. // Returns false for special registers like m0, vcc etc. @@ -216,11 +218,12 @@ // candidates are collected and added to work list. unsigned computeStallCycles(unsigned SrcReg, unsigned Reg = AMDGPU::NoRegister, - int Bank = -1, bool Collect = false); + unsigned SubReg = 0, int Bank = -1, + bool Collect = false); // Search for a register in Bank unused within LI. // Returns phys reg or NoRegister. - unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const; + unsigned scavengeReg(LiveInterval &LI, unsigned Bank, unsigned SubReg) const; // Try to reassign candidate. Returns number or stall cycles saved.
unsigned tryReassign(Candidate &C); @@ -277,15 +280,24 @@ char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID; -unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const { +unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg, + unsigned SubReg) const { assert(Register::isPhysicalRegister(Reg)); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg); unsigned Size = TRI->getRegSizeInBits(*RC); if (Size == 16) Reg = TRI->get32BitRegister(Reg); - else if (Size > 32) - Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + else if (Size > 32) { + if (SubReg) { + const TargetRegisterClass *SubRC = TRI->getSubRegClass(RC, SubReg); + Reg = TRI->getSubReg(Reg, SubReg); + if (TRI->getRegSizeInBits(*SubRC) > 32) + Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + } else { + Reg = TRI->getSubReg(Reg, AMDGPU::sub0); + } + } if (TRI->hasVGPRs(RC)) { Reg -= AMDGPU::VGPR0; @@ -360,7 +372,7 @@ std::pair<unsigned, unsigned> GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg, - int Bank) { + unsigned SubReg, int Bank) { unsigned StallCycles = 0; unsigned UsedBanks = 0; @@ -382,19 +394,23 @@ unsigned ShiftedBank = Bank; if (Bank != -1 && R == Reg && Op.getSubReg()) { + unsigned RegOffset = + TRI->getChannelFromSubReg(SubReg ? SubReg : (unsigned)AMDGPU::sub0); unsigned Offset = TRI->getChannelFromSubReg(Op.getSubReg()); LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()); - if (Offset && Bank < NUM_VGPR_BANKS) { + if (Bank < NUM_VGPR_BANKS) { // If a register spans all banks we cannot shift it to avoid conflict. if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS) continue; - ShiftedBank = (Bank + Offset) % NUM_VGPR_BANKS; - } else if (Offset > 1 && Bank >= SGPR_BANK_OFFSET) { + unsigned Shift = ((NUM_VGPR_BANKS + Offset) - RegOffset); + ShiftedBank = (Bank + Shift) % NUM_VGPR_BANKS; + } else if (Bank >= SGPR_BANK_OFFSET) { // If a register spans all banks we cannot shift it to avoid conflict.
if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS) continue; + unsigned Shift = (NUM_SGPR_BANKS + (Offset >> 1)) - (RegOffset >> 1); ShiftedBank = SGPR_BANK_OFFSET + - (Bank - SGPR_BANK_OFFSET + (Offset >> 1)) % NUM_SGPR_BANKS; + (Bank - SGPR_BANK_OFFSET + Shift) % NUM_SGPR_BANKS; } } @@ -576,17 +592,17 @@ unsigned FreeBanks1 = getFreeBanks(Reg1, SubReg1, Mask1, UsedBanks); unsigned FreeBanks2 = getFreeBanks(Reg2, SubReg2, Mask2, UsedBanks); if (FreeBanks1) - Candidates.push(Candidate(&MI, Reg1, FreeBanks1, Weight - + ((Size2 > Size1) ? 1 : 0))); + Candidates.push(Candidate(&MI, Reg1, SubReg1, FreeBanks1, + Weight + ((Size2 > Size1) ? 1 : 0))); if (FreeBanks2) - Candidates.push(Candidate(&MI, Reg2, FreeBanks2, Weight - + ((Size1 > Size2) ? 1 : 0))); + Candidates.push(Candidate(&MI, Reg2, SubReg2, FreeBanks2, + Weight + ((Size1 > Size2) ? 1 : 0))); } } } -unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, - unsigned Reg, int Bank, +unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg, unsigned Reg, + unsigned SubReg, int Bank, bool Collect) { unsigned TotalStallCycles = 0; SmallSet<const MachineInstr *, 16> Visited; @@ -598,7 +614,7 @@ continue; unsigned StallCycles; unsigned UsedBanks; - std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, Bank); + std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, SubReg, Bank); TotalStallCycles += StallCycles; if (Collect) collectCandidates(MI, UsedBanks, StallCycles); @@ -607,8 +623,8 @@ return TotalStallCycles; } -unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI, - unsigned Bank) const { +unsigned GCNRegBankReassign::scavengeReg(LiveInterval &LI, unsigned Bank, + unsigned SubReg) const { const TargetRegisterClass *RC = MRI->getRegClass(LI.reg); unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ?
MaxNumVGPRs : MaxNumSGPRs; @@ -620,7 +636,7 @@ if (TRI->isSubRegisterEq(Reg, MaxReg)) break; - if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank) + if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg, SubReg) != Bank) continue; for (unsigned I = 0; CSRegs[I]; ++I) @@ -669,7 +685,7 @@ for (int Bank = 0; Bank < NUM_BANKS; ++Bank) { if (C.FreeBanks & (1 << Bank)) { LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n'); - unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank); + unsigned Stalls = computeStallCycles(C.Reg, C.Reg, C.SubReg, Bank); if (Stalls < OrigStalls) { LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> " << Stalls << '\n'); @@ -683,7 +699,7 @@ LRM->unassign(LI); while (!BankStalls.empty()) { BankStall BS = BankStalls.pop_back_val(); - unsigned Reg = scavengeReg(LI, BS.Bank); + unsigned Reg = scavengeReg(LI, BS.Bank, C.SubReg); if (Reg == AMDGPU::NoRegister) { LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank) << '\n'); @@ -801,7 +817,7 @@ Candidates.pop_back(); if (LocalCyclesSaved) { removeCandidates(C.Reg); - computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true); + computeStallCycles(C.Reg, AMDGPU::NoRegister, 0, -1, true); Candidates.sort(); LLVM_DEBUG(dbgs() << "\nCandidates:\n\n"; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -1492,7 +1492,7 @@ ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v18 ; MOVREL-NEXT: v_mov_b32_e32 v19, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 -; MOVREL-NEXT: v_mov_b32_e32 v20, v1 +; MOVREL-NEXT: v_mov_b32_e32 v23, v1 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 3, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v18 @@ -1501,7 +1501,7 @@ ; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v16, s3 ; 
MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 -; MOVREL-NEXT: v_cndmask_b32_e32 v1, v20, v17, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v17, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v17, s4 @@ -2123,7 +2123,7 @@ ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: v_add_nc_u32_e32 v18, 1, v18 ; MOVREL-NEXT: v_mov_b32_e32 v19, v0 -; MOVREL-NEXT: v_mov_b32_e32 v20, v1 +; MOVREL-NEXT: v_mov_b32_e32 v23, v1 ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v18 @@ -2137,7 +2137,7 @@ ; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, v16, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, v16, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v16, s4 -; MOVREL-NEXT: v_cndmask_b32_e32 v1, v20, v17, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v1, v23, v17, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, v17, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v17, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v11, v11, v17, s4 @@ -4111,7 +4111,7 @@ ; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 4, v16 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 5, v16 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 6, v16 -; MOVREL-NEXT: v_mov_b32_e32 v17, v2 +; MOVREL-NEXT: v_mov_b32_e32 v19, v2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v16 ; MOVREL-NEXT: v_mov_b32_e32 v18, v3 ; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v14, s2 @@ -4119,7 +4119,7 @@ ; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v14, s4 ; MOVREL-NEXT: v_cndmask_b32_e64 v12, v12, v14, s5 ; MOVREL-NEXT: v_cndmask_b32_e64 v7, v7, v15, s2 -; MOVREL-NEXT: v_cndmask_b32_e64 v2, v17, v14, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v2, v19, v14, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v15, s3 ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v18, v15, s0 ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v14, vcc_lo @@ -4251,42 +4251,42 @@ ; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s8, s10 ; MOVREL-NEXT: s_mov_b32 s9, s11 -; MOVREL-NEXT: 
v_mov_b32_e32 v18, s15 -; MOVREL-NEXT: v_mov_b32_e32 v17, s14 -; MOVREL-NEXT: v_mov_b32_e32 v16, s13 -; MOVREL-NEXT: v_mov_b32_e32 v15, s12 -; MOVREL-NEXT: v_mov_b32_e32 v14, s11 -; MOVREL-NEXT: v_mov_b32_e32 v13, s10 -; MOVREL-NEXT: v_mov_b32_e32 v12, s9 -; MOVREL-NEXT: v_mov_b32_e32 v11, s8 -; MOVREL-NEXT: v_mov_b32_e32 v10, s7 -; MOVREL-NEXT: v_mov_b32_e32 v9, s6 -; MOVREL-NEXT: v_mov_b32_e32 v8, s5 -; MOVREL-NEXT: v_mov_b32_e32 v7, s4 -; MOVREL-NEXT: v_mov_b32_e32 v6, s3 -; MOVREL-NEXT: v_mov_b32_e32 v5, s2 -; MOVREL-NEXT: v_mov_b32_e32 v4, s1 -; MOVREL-NEXT: v_mov_b32_e32 v3, s0 +; MOVREL-NEXT: v_mov_b32_e32 v20, s15 +; MOVREL-NEXT: v_mov_b32_e32 v19, s14 +; MOVREL-NEXT: v_mov_b32_e32 v18, s13 +; MOVREL-NEXT: v_mov_b32_e32 v17, s12 +; MOVREL-NEXT: v_mov_b32_e32 v16, s11 +; MOVREL-NEXT: v_mov_b32_e32 v15, s10 +; MOVREL-NEXT: v_mov_b32_e32 v14, s9 +; MOVREL-NEXT: v_mov_b32_e32 v13, s8 +; MOVREL-NEXT: v_mov_b32_e32 v12, s7 +; MOVREL-NEXT: v_mov_b32_e32 v11, s6 +; MOVREL-NEXT: v_mov_b32_e32 v10, s5 +; MOVREL-NEXT: v_mov_b32_e32 v9, s4 +; MOVREL-NEXT: v_mov_b32_e32 v8, s3 +; MOVREL-NEXT: v_mov_b32_e32 v7, s2 +; MOVREL-NEXT: v_mov_b32_e32 v6, s1 +; MOVREL-NEXT: v_mov_b32_e32 v5, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; MOVREL-NEXT: ; implicit-def: $vcc_hi -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v3, v0, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v3, v4, v1, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v4, v5, v0, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v5, v6, v1, s0 +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v5, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v6, v1, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v4, v7, v0, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2 +; MOVREL-NEXT: v_cndmask_b32_e64 v5, v8, v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 3 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v4 -; MOVREL-NEXT: v_readfirstlane_b32 s3, v5 -; MOVREL-NEXT: v_cndmask_b32_e32 v6, v7, 
v0, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v7, v8, v1, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e64 v8, v9, v0, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v9, v10, v1, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v0, v11, v0, s1 -; MOVREL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s1 +; MOVREL-NEXT: v_cndmask_b32_e32 v6, v9, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v7, v10, v1, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e64 v8, v11, v0, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v9, v12, v1, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v0, v13, v0, s1 +; MOVREL-NEXT: v_cndmask_b32_e64 v1, v14, v1, s1 ; MOVREL-NEXT: v_readfirstlane_b32 s0, v2 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v3 +; MOVREL-NEXT: v_readfirstlane_b32 s3, v5 ; MOVREL-NEXT: v_readfirstlane_b32 s4, v6 ; MOVREL-NEXT: v_readfirstlane_b32 s5, v7 ; MOVREL-NEXT: v_readfirstlane_b32 s6, v8 @@ -4448,7 +4448,7 @@ ; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_s: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 -; MOVREL-NEXT: v_mov_b32_e32 v13, v2 +; MOVREL-NEXT: v_mov_b32_e32 v15, v2 ; MOVREL-NEXT: v_mov_b32_e32 v14, v3 ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo @@ -4457,7 +4457,7 @@ ; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v1 ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 2 ; MOVREL-NEXT: v_readfirstlane_b32 s3, v3 ; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc_lo @@ -4514,7 +4514,7 @@ ; MOVREL-LABEL: dyn_insertelement_v5f64_v_v_v: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 -; MOVREL-NEXT: v_mov_b32_e32 v13, v2 +; MOVREL-NEXT: v_mov_b32_e32 v15, v2 ; MOVREL-NEXT: v_mov_b32_e32 v14, v3 ; MOVREL-NEXT: ; implicit-def: $vcc_hi ; MOVREL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo @@ -4522,7 +4522,7 @@ ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v12 
; MOVREL-NEXT: v_readfirstlane_b32 s0, v0 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v1 -; MOVREL-NEXT: v_cndmask_b32_e32 v2, v13, v10, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v2, v15, v10, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v14, v11, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v12 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir --- a/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir +++ b/llvm/test/CodeGen/AMDGPU/regbank-reassign.mir @@ -494,3 +494,19 @@ %2 = V_AND_B32_e32 %1, %0, implicit $exec S_ENDPGM 0 ... + +# GCN-LABEL: s0_vs_s15_16_17_sub1{{$}} +# GCN: S_AND_B32 renamable $sgpr13, $sgpr0, +--- +name: s0_vs_s15_16_17_sub1 +tracksRegLiveness: true +registers: + - { id: 0, class: sgpr_96, preferred-register: '$sgpr15_sgpr16_sgpr17' } + - { id: 1, class: sgpr_32 } +body: | + bb.0: + %0 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + %1 = S_AND_B32 %0.sub1, $sgpr0, implicit-def $scc + S_ENDPGM 0 +...