Index: lib/CodeGen/RegisterCoalescer.cpp =================================================================== --- lib/CodeGen/RegisterCoalescer.cpp +++ lib/CodeGen/RegisterCoalescer.cpp @@ -2452,7 +2452,10 @@ // %other = COPY %ext // %this = COPY %ext <-- Erase this copy // - if (DefMI->isFullCopy() && !CP.isPartial() + // Avoid this case when there are subregs, as it can result in an incorrect + // subreg live range. + // + if (!TrackSubRegLiveness && DefMI->isFullCopy() && !CP.isPartial() && valuesIdentical(VNI, V.OtherVNI, Other)) return CR_Erase; Index: test/CodeGen/AMDGPU/coalescing-with-subregs-in-loop-bug.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/coalescing-with-subregs-in-loop-bug.mir @@ -0,0 +1,220 @@ +# RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx803 -run-pass=simple-register-coalescing,rename-independent-subregs %s -o - | FileCheck -check-prefix=GCN %s + +# This test is for a bug where the following happens: +# +# Inside the loop, %8611.sub2 is used in a V_LSHLREV whose result is then used +# in an LDS read. %8611 is a 128 bit value that is linked by copies to +# %11954 (from phi elimination), %8610 (the value in the loop pre-header), +# %8613 (defined and subreg-modified in the loop, and used after the loop) +# and %8612: +# +# %11954:vreg_128 = COPY killed %8610 +# bb.39: +# %8611:vreg_128 = COPY killed %11954 +# %10280:vgpr_32 = V_LSHLREV_B32_e32 2, %8611.sub2, implicit $exec +# %8613:vreg_128 = COPY killed %8611 +# %8613.sub1:vreg_128 = COPY %9719 +# %8612:vreg_128 = COPY %8613 +# %11954:vreg_128 = COPY killed %8612 +# S_CBRANCH_EXECNZ %bb.39, implicit $exec +# S_BRANCH %bb.40 +# bb.40: +# undef %8615.sub0:vreg_128 = COPY killed %8613.sub0 +# +# So this coalesces together into a single 128 bit value whose sub1 is modified +# in the loop, but the sub2 used in the V_LSHLREV is not modified in the loop. 
+# +# The bug is that the coalesced value has an L00000004 subrange (for sub2) that +# says that it is not live up to the end of the loop block. The symptom is that +# Rename Independent Subregs separates sub2 into its own register, and it is +# not live around the loop, so that pass adds an IMPLICIT_DEF for it just before +# the loop backedge. + +# GCN: bb.1 (%ir-block.6): +# GCN: V_LSHLREV_B32_e32 2, [[val:%[0-9][0-9]*]].sub2 +# GCN-NOT: [[val]]:vreg_128 = IMPLICIT_DEF + +--- | + target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + target triple = "amdgcn--amdpal" + + ; Function Attrs: nounwind + define dllexport amdgpu_cs void @_amdgpu_cs_main(i32 inreg, i32 inreg, i32 inreg, <3 x i32> inreg, i32 inreg, <3 x i32>) local_unnamed_addr { + .entry: + br label %6 + + ;