Index: lib/CodeGen/RegisterCoalescer.cpp =================================================================== --- lib/CodeGen/RegisterCoalescer.cpp +++ lib/CodeGen/RegisterCoalescer.cpp @@ -1396,11 +1396,29 @@ } else if (SrcLI.liveAt(Idx)) return false; - LLVM_DEBUG(dbgs() << "\tEliminating copy of value\n"); - - // Remove any DstReg segments starting at the instruction. + // If the undef copy defines a live-out value (i.e. an input to a PHI def), + // then replace it with an IMPLICIT_DEF. LiveInterval &DstLI = LIS->getInterval(DstReg); SlotIndex RegIndex = Idx.getRegSlot(); + LiveRange::Segment *Seg = DstLI.getSegmentContaining(RegIndex); + assert(Seg != nullptr && "No segment for defining instruction"); + if (VNInfo *V = DstLI.getVNInfoAt(Seg->end)) { + if (V->isPHIDef()) { + CopyMI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF)); + for (unsigned i = CopyMI->getNumOperands(); i != 0; --i) { + MachineOperand &MO = CopyMI->getOperand(i-1); + if (MO.isReg() && MO.isUse()) + CopyMI->RemoveOperand(i-1); + } + LLVM_DEBUG(dbgs() << "\tReplaced copy of value with an " + "implicit def\n"); + return false; + } + } + + // Remove any DstReg segments starting at the instruction. + LLVM_DEBUG(dbgs() << "\tEliminating copy of value\n"); + // Remove value or merge with previous one in case of a subregister def. if (VNInfo *PrevVNI = DstLI.getVNInfoAt(Idx)) { VNInfo *VNI = DstLI.getVNInfoAt(RegIndex); @@ -2078,6 +2096,13 @@ /// True once Pruned above has been computed. bool PrunedComputed = false; + /// True if this value is determined to be identical to OtherVNI + /// (in valuesIdentical). This is used with CR_Erase where the erased + /// copy is redundant, i.e. the source value is already the same as + /// the destination. In such cases the subranges need to be updated + /// properly. See comment at pruneSubRegValues for more info. + bool Identical = false; + Val() = default; bool isAnalyzed() const { return WriteLanes.any(); } @@ -2256,7 +2281,7 @@ const VNInfo *Orig0; unsigned Reg0; std::tie(Orig0, Reg0) = followCopyChain(Value0); - if (Orig0 == Value1) + if (Orig0 == Value1 || Reg0 == Other.Reg) return true; const VNInfo *Orig1; @@ -2438,8 +2463,10 @@ // %this = COPY %ext <-- Erase this copy // if (DefMI->isFullCopy() && !CP.isPartial() - && valuesIdentical(VNI, V.OtherVNI, Other)) + && valuesIdentical(VNI, V.OtherVNI, Other)) { + V.Identical = true; return CR_Erase; + } // If the lanes written by this instruction were all undef in OtherVNI, it is // still safe to join the live ranges. This can't be done with a simple value @@ -2743,19 +2770,62 @@ } } +/// Consider the following situation when coalescing the copy between +/// %31 and %45 at 800. (The vertical lines represent live range segments.) +/// +/// Main range Subrange 0004 (sub2) +/// %31 %45 %31 %45 +/// 544 %45 = COPY %28 + + +/// | v1 | v1 +/// 560B bb.1: + + +/// 624 = %45.sub2 | v2 | v2 +/// 800 %31 = COPY %45 + + + + +/// | v0 | v0 +/// 816 %31.sub1 = ... + | +/// 880 %30 = COPY %31 | v1 + +/// 928 %45 = COPY %30 | + + +/// | | v0 | v0 <--+ +/// 992B ; backedge -> bb.1 | + + | +/// 1040 = %31.sub0 + | +/// This value must remain +/// live-out! +/// +/// Assuming that %31 is coalesced into %45, the copy at 928 becomes +/// redundant, since it copies the value from %45 back into it. The +/// conflict resolution for the main range determines that %45.v0 is +/// to be erased, which is ok since %31.v1 is identical to it. +/// The problem happens with the subrange for sub2: it has to be live +/// on exit from the block, but since 928 was actually a point of +/// definition of %45.sub2, %45.sub2 was not live immediately prior +/// to that definition. As a result, when 928 was erased, the value v0 +/// for %45.sub2 was pruned in pruneSubRegValues. Consequently, an +/// IMPLICIT_DEF was inserted as a "backedge" definition for %45.sub2, +/// providing an incorrect value to the use at 624. +/// +/// Since the main-range values %31.v1 and %45.v0 were proved to be +/// identical, the corresponding values in subranges must also be the +/// same. A redundant copy is removed because it's not needed, and not +/// because it copied an undefined value, so any liveness that originated +/// from that copy cannot disappear. When pruning a value that started +/// at the removed copy, the corresponding identical value must be +/// extended to replace it. void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask) { // Look for values being erased. bool DidPrune = false; for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) { + Val &V = Vals[i]; // We should trigger in all cases in which eraseInstrs() does something. // match what eraseInstrs() is doing, print a message so - if (Vals[i].Resolution != CR_Erase && - (Vals[i].Resolution != CR_Keep || !Vals[i].ErasableImplicitDef || - !Vals[i].Pruned)) + if (V.Resolution != CR_Erase && + (V.Resolution != CR_Keep || !V.ErasableImplicitDef || !V.Pruned)) continue; // Check subranges at the point where the copy will be removed. SlotIndex Def = LR.getValNumInfo(i)->def; + SlotIndex OtherDef; + if (V.Identical) + OtherDef = V.OtherVNI->def; + // Print message so mismatches with eraseInstrs() can be diagnosed. LLVM_DEBUG(dbgs() << "\t\tExpecting instruction removal at " << Def << '\n'); @@ -2772,6 +2842,12 @@ DidPrune = true; // Mark value number as unused. ValueOut->markUnused(); + + if (V.Identical) { + LiveRange::iterator Id = S.find(OtherDef); + assert(Id != S.end()); + LIS->extendToIndices(S, {Id->end, Q.endPoint()}); + } continue; } // If a subrange ends at the copy, then a value was copied but only Index: test/CodeGen/AMDGPU/coalescing-with-subregs-in-loop-bug.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/coalescing-with-subregs-in-loop-bug.mir @@ -0,0 +1,197 @@ +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 -run-pass=simple-register-coalescing,rename-independent-subregs %s -o - | FileCheck -check-prefix=GCN %s + +# This test is for a bug where the following happens: +# +# Inside the loop, %29.sub2 is used in a V_LSHLREV whose result is then used +# in an LDS read. %29 is a 128 bit value that is linked by copies to +# %45 (from phi elimination), %28 (the value in the loop pre-header), +# %31 (defined and subreg-modified in the loop, and used after the loop) +# and %30: +# +# %45:vreg_128 = COPY killed %28 +# bb.39: +# %29:vreg_128 = COPY killed %45 +# %39:vgpr_32 = V_LSHLREV_B32_e32 2, %29.sub2, implicit $exec +# %31:vreg_128 = COPY killed %29 +# %31.sub1:vreg_128 = COPY %34 +# %30:vreg_128 = COPY %31 +# %45:vreg_128 = COPY killed %30 +# S_CBRANCH_EXECNZ %bb.39, implicit $exec +# S_BRANCH %bb.40 +# bb.40: +# undef %8615.sub0:vreg_128 = COPY killed %31.sub0 +# +# So this coalesces together into a single 128 bit value whose sub1 is modified +# in the loop, but the sub2 used in the V_LSHLREV is not modified in the loop. +# +# The bug is that the coalesced value has a L00000004 subrange (for sub2) that +# says that it is not live up to the end of the loop block. The symptom is that +# Rename Independent Subregs separates sub2 into its own register, and it is +# not live round the loop, so that pass adds an IMPLICIT_DEF for it just before +# the loop backedge. + +# GCN: bb.1 (%ir-block.6): +# GCN: V_LSHLREV_B32_e32 2, [[val:%[0-9][0-9]*]].sub2 +# GCN-NOT: [[val]]:vreg_128 = IMPLICIT_DEF + +--- | + target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + target triple = "amdgcn--amdpal" + + define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg, i32 inreg, i32 inreg, <3 x i32> inreg, i32 inreg, <3 x i32>) local_unnamed_addr #0 { + .entry: + br label %6 + + ;