Index: lib/CodeGen/RegisterCoalescer.h =================================================================== --- lib/CodeGen/RegisterCoalescer.h +++ lib/CodeGen/RegisterCoalescer.h @@ -55,6 +55,9 @@ /// SrcReg and DstReg. const TargetRegisterClass *NewRC = nullptr; + /// Whether subranges need to be recalculated after coalescing. + bool NeedRecalcSubRanges = false; + public: CoalescerPair(const TargetRegisterInfo &tri) : TRI(tri) {} @@ -106,6 +109,13 @@ /// Return the register class of the coalesced register. const TargetRegisterClass *getNewRC() const { return NewRC; } + + /// Get whether subranges need to be recalculated after coalescing. + bool getNeedRecalcSubRanges() const { return NeedRecalcSubRanges; } + + /// Flag that subranges will need to be recalculated after coalescing. + void setNeedRecalcSubRanges() { NeedRecalcSubRanges = true; } + }; } // end namespace llvm Index: lib/CodeGen/RegisterCoalescer.cpp =================================================================== --- lib/CodeGen/RegisterCoalescer.cpp +++ lib/CodeGen/RegisterCoalescer.cpp @@ -1785,6 +1785,13 @@ // Update regalloc hint. TRI->updateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF); + if (CP.getNeedRecalcSubRanges()) { + // CoalescerPair decided that subranges need to be recalculated. + // Recalculate the whole live interval. + LIS->removeInterval(CP.getDstReg()); + LIS->createAndComputeVirtRegInterval(CP.getDstReg()); + } + DEBUG({ dbgs() << "\tSuccess: " << printReg(CP.getSrcReg(), TRI, CP.getSrcIdx()) << " -> " << printReg(CP.getDstReg(), TRI, CP.getDstIdx()) << '\n'; @@ -2197,7 +2204,8 @@ /// Removes subranges starting at copies that get removed. This sometimes /// happens when undefined subranges are copied around. These ranges contain /// no useful information and can be removed. - void pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask); + /// Returns whether subrange recalculation is needed. 
+ bool pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask); /// Pruning values in subranges can lead to removing segments in these /// subranges started by IMPLICIT_DEFs. The corresponding segments in @@ -2779,7 +2787,9 @@ } } -void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) { +// Returns whether subrange recalculation is needed. +bool JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) { + bool NeedRecalcSubRanges = false; // Look for values being erased. bool DidPrune = false; for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) { @@ -2797,8 +2807,10 @@ for (LiveInterval::SubRange &S : LI.subranges()) { LiveQueryResult Q = S.Query(Def); - // If a subrange starts at the copy then an undefined value has been - // copied and we must remove that subrange value as well. + // If a subrange starts at the copy then an undefined value has + // possibly, but not definitely, been copied. We need to remove the + // subrange value, but we must also ask for subrange recalculation in + // case the removed value was not actually dead. VNInfo *ValueOut = Q.valueOutOrDead(); if (ValueOut != nullptr && Q.valueIn() == nullptr) { DEBUG(dbgs() << "\t\tPrune sublane " << PrintLaneMask(S.LaneMask) @@ -2807,6 +2819,7 @@ DidPrune = true; // Mark value number as unused. ValueOut->markUnused(); + NeedRecalcSubRanges = true; continue; } // If a subrange ends at the copy, then a value was copied but only @@ -2820,6 +2833,7 @@ } if (DidPrune) LI.removeEmptySubRanges(); + return NeedRecalcSubRanges; } /// Check if any of the subranges of @p LI contain a definition at @p Def. @@ -3098,8 +3112,10 @@ // having stale segments.
LHSVals.pruneMainSegments(LHS, ShrinkMainRange); - LHSVals.pruneSubRegValues(LHS, ShrinkMask); - RHSVals.pruneSubRegValues(LHS, ShrinkMask); + if (LHSVals.pruneSubRegValues(LHS, ShrinkMask)) + CP.setNeedRecalcSubRanges(); + if (RHSVals.pruneSubRegValues(LHS, ShrinkMask)) + CP.setNeedRecalcSubRanges(); } // The merging algorithm in LiveInterval::join() can't handle conflicting Index: test/CodeGen/AMDGPU/coalescing-with-subregs-in-loop-bug.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/coalescing-with-subregs-in-loop-bug.mir @@ -0,0 +1,220 @@ +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 -run-pass=simple-register-coalescing,rename-independent-subregs %s -o - | FileCheck -check-prefix=GCN %s + +# This test is for a bug where the following happens: +# +# Inside the loop, %29.sub2 is used in a V_LSHLREV whose result is then used +# in an LDS read. %29 is a 128 bit value that is linked by copies to +# %45 (from phi elimination), %28 (the value in the loop pre-header), +# %31 (defined and subreg-modified in the loop, and used after the loop) +# and %30: +# +# %45:vreg_128 = COPY killed %28 +# bb.39: +# %29:vreg_128 = COPY killed %45 +# %39:vgpr_32 = V_LSHLREV_B32_e32 2, %29.sub2, implicit $exec +# %31:vreg_128 = COPY killed %29 +# %31.sub1:vreg_128 = COPY %34 +# %30:vreg_128 = COPY %31 +# %45:vreg_128 = COPY killed %30 +# S_CBRANCH_EXECNZ %bb.39, implicit $exec +# S_BRANCH %bb.40 +# bb.40: +# undef %32.sub0:vreg_128 = COPY killed %31.sub0 +# +# So this coalesces together into a single 128 bit value whose sub1 is modified +# in the loop, but the sub2 used in the V_LSHLREV is not modified in the loop. +# +# The bug is that the coalesced value has a L00000004 subrange (for sub2) that +# says that it is not live up to the end of the loop block. 
The symptom is that +# Rename Independent Subregs separates sub2 into its own register, and it is +# not live round the loop, so that pass adds an IMPLICIT_DEF for it just before +# the loop backedge. + +# GCN: bb.1 (%ir-block.6): +# GCN: V_LSHLREV_B32_e32 2, [[val:%[0-9][0-9]*]].sub2 +# GCN-NOT: [[val]]:vreg_128 = IMPLICIT_DEF + +--- | + ; ModuleID = '../test/CodeGen/AMDGPU/coalescing-with-subregs-in-loop-bug.mir' + source_filename = "../test/CodeGen/AMDGPU/coalescing-with-subregs-in-loop-bug.mir" + target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + target triple = "amdgcn--amdpal" + + define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg, i32 inreg, i32 inreg, <3 x i32> inreg, i32 inreg, <3 x i32>) local_unnamed_addr #0 { + .entry: + br label %6 + + ;