Index: llvm/lib/CodeGen/RegisterCoalescer.cpp =================================================================== --- llvm/lib/CodeGen/RegisterCoalescer.cpp +++ llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -3114,6 +3114,13 @@ } } +// Check if the segment consists of a copied live-through value (i.e. the copy +// in the block only extended the liveness, of an undef value which we may need +// to handle). +static bool mayBeUndefSubrangeLiveThrough(const LiveQueryResult Q) { + return Q.valueIn() && Q.valueIn()->isPHIDef() && Q.valueIn() == Q.valueOut(); +} + /// Consider the following situation when coalescing the copy between /// %31 and %45 at 800. (The vertical lines represent live range segments.) /// @@ -3196,11 +3203,21 @@ // with V.OtherVNI. LIS->extendToIndices(S, EndPoints); } + + // We may need to eliminate the subrange if the copy introduced a live + // out undef value. + if (ValueOut->isPHIDef()) + ShrinkMask |= S.LaneMask; continue; } + // If a subrange ends at the copy, then a value was copied but only // partially used later. Shrink the subregister range appropriately. - if (Q.valueIn() != nullptr && Q.valueOut() == nullptr) { + // + // Ultimately this calls shrinkToUses, so assuming ShrinkMask is + // conservatively correct. + if ((Q.valueIn() != nullptr && Q.valueOut() == nullptr) || + (V.Resolution == CR_Erase && mayBeUndefSubrangeLiveThrough(Q))) { LLVM_DEBUG(dbgs() << "\t\tDead uses at sublane " << PrintLaneMask(S.LaneMask) << " at " << Def << "\n"); Index: llvm/test/CodeGen/AMDGPU/coalesce-identity-copies-undef-subregs.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/coalesce-identity-copies-undef-subregs.mir +++ llvm/test/CodeGen/AMDGPU/coalesce-identity-copies-undef-subregs.mir @@ -250,57 +250,150 @@ ... -# FIXME: This testcase is broken # The coalescing of the %0 = %2 COPY in %bb.2 needs to prune the dead # phi range across %bb.1 after it is erased. -# --- -# name: prune_subrange_phi_value_0 -# tracksRegLiveness: true -# body: | -# bb.0: -# liveins: $vgpr0 +--- +name: prune_subrange_phi_value_0 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: prune_subrange_phi_value_0 + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0 + ; CHECK: undef %2.sub1:vreg_64 = COPY $vgpr0 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.2: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: S_BRANCH %bb.1 + bb.0: + liveins: $vgpr0 -# undef %0.sub1:vreg_64 = COPY killed $vgpr0 + undef %0.sub1:vreg_64 = COPY killed $vgpr0 -# bb.1: -# successors: %bb.2, %bb.1 + bb.1: + successors: %bb.2, %bb.1 -# %1:vreg_64 = COPY killed %0 -# %0:vreg_64 = COPY %1 -# S_CBRANCH_EXECNZ %bb.1, implicit $exec -# S_BRANCH %bb.2 + %1:vreg_64 = COPY killed %0 + %0:vreg_64 = COPY %1 + S_CBRANCH_EXECNZ %bb.1, implicit $exec + S_BRANCH %bb.2 -# bb.2: -# undef %2.sub1:vreg_64 = COPY %0.sub1 -# %0:vreg_64 = COPY killed %2 -# S_BRANCH %bb.1 + bb.2: + undef %2.sub1:vreg_64 = COPY %0.sub1 + %0:vreg_64 = COPY killed %2 + S_BRANCH %bb.1 -# ... +... + +--- +name: prune_subrange_phi_value_0_0 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: prune_subrange_phi_value_0_0 + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0 + ; CHECK: undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.2: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: S_BRANCH %bb.1 + bb.0: + liveins: $vgpr0 + + undef %0.sub1:vreg_64 = COPY killed $vgpr0 + + bb.1: + successors: %bb.2, %bb.1 + + %1:vreg_64 = COPY killed %0 + %0:vreg_64 = COPY %1 + S_CBRANCH_EXECNZ %bb.1, implicit $exec + S_BRANCH %bb.2 + + bb.2: + undef %0.sub1:vreg_64 = COPY %0.sub1 + S_BRANCH %bb.1 + +... + +--- +name: prune_subrange_phi_value_0_1 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: prune_subrange_phi_value_0_1 + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0 + ; CHECK: undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.2: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: S_BRANCH %bb.1 + bb.0: + liveins: $vgpr0 + + undef %0.sub1:vreg_64 = COPY killed $vgpr0 + + bb.1: + successors: %bb.2, %bb.1 + + %1:vreg_64 = COPY killed %0 + %0:vreg_64 = COPY %1 + S_CBRANCH_EXECNZ %bb.1, implicit $exec + S_BRANCH %bb.2 + + bb.2: + S_BRANCH %bb.1 + +... # Variant of testcase that asserts since there wasn't already an # incoming segment at the erased copy, and no valid end point. -# --- -# name: prune_subrange_phi_value_1 -# tracksRegLiveness: true -# body: | -# bb.0: -# liveins: $vgpr0 +--- +name: prune_subrange_phi_value_1 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: prune_subrange_phi_value_1 + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0 + ; CHECK: undef %0.sub1:vreg_64 = COPY $vgpr0 + ; CHECK: bb.1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: S_BRANCH %bb.2 + ; CHECK: bb.2: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: undef %0.sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 %0.sub1, implicit $mode, implicit $exec + ; CHECK: S_BRANCH %bb.1 + bb.0: + liveins: $vgpr0 -# undef %0.sub1:vreg_64 = COPY killed $vgpr0 + undef %0.sub1:vreg_64 = COPY killed $vgpr0 -# bb.1: -# successors: %bb.2, %bb.1 + bb.1: + successors: %bb.2, %bb.1 -# %0:vreg_64 = COPY killed %0 -# S_CBRANCH_EXECNZ %bb.1, implicit $exec -# S_BRANCH %bb.2 + %0:vreg_64 = COPY killed %0 + S_CBRANCH_EXECNZ %bb.1, implicit $exec + S_BRANCH %bb.2 -# bb.2: -# undef %1.sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 killed %0.sub1, implicit $mode, implicit $exec -# %0:vreg_64 = COPY killed %1 -# S_BRANCH %bb.1 + bb.2: + undef %1.sub1:vreg_64 = nofpexcept V_CEIL_F32_e32 killed %0.sub1, implicit $mode, implicit $exec + %0:vreg_64 = COPY killed %1 + S_BRANCH %bb.1 -# ... +... --- name: prune_subrange_phi_value_2 Index: llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/loop-live-out-copy-undef-subrange.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s + +; This example used to produce a verifier error resulting from the +; register coalescer leaving behind a false live interval when a live +; out copy introduced new liveness for a subregister. + +define <3 x float> @liveout_undef_subrange(<3 x float> %arg) { +; CHECK-LABEL: liveout_undef_subrange: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: v_add_f32_e32 v3, v2, v2 +; CHECK-NEXT: v_add_f32_e32 v1, v1, v1 +; CHECK-NEXT: v_add_f32_e32 v0, v0, v0 +; CHECK-NEXT: BB0_1: ; %bb1 +; CHECK-NEXT: ; =>This Loop Header: Depth=1 +; CHECK-NEXT: ; Child Loop BB0_2 Depth 2 +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: BB0_2: ; %bb1 +; CHECK-NEXT: ; Parent Loop BB0_1 Depth=1 +; CHECK-NEXT: ; => This Inner Loop Header: Depth=2 +; CHECK-NEXT: v_cmp_neq_f32_e32 vcc, 0, v2 +; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5] +; CHECK-NEXT: s_cbranch_execnz BB0_2 +; CHECK-NEXT: ; %bb.3: ; %bb2 +; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 +; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] +; CHECK-NEXT: v_mul_f32_e32 v2, v3, v2 +; CHECK-NEXT: s_branch BB0_1 +bb: + br label %bb1 + +bb1: ; preds = %bb3, %bb + %i = phi <3 x float> [ %arg, %bb ], [ %i11, %bb3 ] + %i2 = extractelement <3 x float> %i, i64 2 + %i3 = fmul float %i2, 1.000000e+00 + %i4 = fmul nsz <3 x float> %arg, + %i5 = insertelement <3 x float> undef, float %i3, i32 0 + %i6 = shufflevector <3 x float> %i5, <3 x float> undef, <3 x i32> zeroinitializer + %i7 = fmul <3 x float> %i4, %i6 + %i8 = fcmp oeq float %i3, 0.000000e+00 + br i1 %i8, label %bb3, label %bb2 + +bb2: ; preds = %bb1 + br label %bb3 + +bb3: ; preds = %bb2, %bb1 + %i11 = phi <3 x float> [ %i7, %bb2 ], [ %i, %bb1 ] + br label %bb1 +}