Index: llvm/lib/CodeGen/RegisterCoalescer.cpp =================================================================== --- llvm/lib/CodeGen/RegisterCoalescer.cpp +++ llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -2955,7 +2955,7 @@ // its lanes. if (OtherV.ErasableImplicitDef && TrackSubRegLiveness && - (OtherV.WriteLanes & ~V.ValidLanes).any()) { + (OtherV.ValidLanes & ~V.ValidLanes).any()) { LLVM_DEBUG(dbgs() << "Cannot erase implicit_def with missing values\n"); OtherV.ErasableImplicitDef = false; Index: llvm/test/CodeGen/AMDGPU/blender-coalescer-verifier-error-empty-subrange.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/blender-coalescer-verifier-error-empty-subrange.mir @@ -0,0 +1,32 @@ +# RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=register-coalescer -verify-coalescing -o /dev/null %s 2>&1 | FileCheck %s + +# Testcase variants from +# liveout-implicit-def-subreg-redef-blender-verifier-error.mir which +# hit other verifier errors after coalescing. + +# CHECK: *** Bad machine code: Subrange must not be empty *** + + +# Same as previous, except the initial value isn't an implicit_def +--- +name: liveout_defined_register_redefine_sub0_implicit_def +tracksRegLiveness: true +body: | + bb.0: + S_CBRANCH_SCC0 %bb.2, implicit undef $scc + + bb.1: + S_NOP 0, implicit-def %0:sgpr_128 + %1:sgpr_32 = S_MOV_B32 0 + S_BRANCH %bb.3 + + bb.2: + undef %0.sub0:sgpr_128 = IMPLICIT_DEF + %1:sgpr_32 = COPY %0.sub0 + + bb.3: + S_NOP 0, implicit %0 + S_NOP 0, implicit %1 + S_ENDPGM 0 + +... Index: llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-coalescing < %s | FileCheck %s + +define amdgpu_kernel void @blender_no_live_segment_at_def_error(<4 x float> %extractVec358.i.i, i32 %cmp5.i.i.arg, float %i1.i, i32 %cmp221.i.i.arg, i32 %cmp262.i.i.arg, ptr addrspace(1) %arg) { +; CHECK-LABEL: blender_no_live_segment_at_def_error: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_add_u32 s12, s12, s17 +; CHECK-NEXT: s_mov_b32 s32, 0 +; CHECK-NEXT: s_addc_u32 s13, s13, 0 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 +; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 +; CHECK-NEXT: s_load_dwordx8 s[36:43], s[8:9], 0x0 +; CHECK-NEXT: s_add_u32 s0, s0, s17 +; CHECK-NEXT: s_addc_u32 s1, s1, 0 +; CHECK-NEXT: s_mov_b64 s[34:35], s[8:9] +; CHECK-NEXT: s_mov_b32 s8, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_cmp_lg_u32 s40, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_8 +; CHECK-NEXT: ; %bb.1: ; %if.end13.i.i +; CHECK-NEXT: s_cmp_eq_u32 s42, 0 +; CHECK-NEXT: s_cbranch_scc1 .LBB0_4 +; CHECK-NEXT: ; %bb.2: ; %if.else251.i.i +; CHECK-NEXT: s_cmp_lg_u32 s43, 0 +; CHECK-NEXT: s_mov_b32 s12, 0 +; CHECK-NEXT: s_cselect_b32 s8, -1, 0 +; CHECK-NEXT: s_and_b32 vcc_lo, exec_lo, s8 +; CHECK-NEXT: s_cbranch_vccz .LBB0_5 +; CHECK-NEXT: ; %bb.3: +; CHECK-NEXT: s_mov_b32 s36, 0 +; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 +; CHECK-NEXT: s_cbranch_vccz .LBB0_6 +; CHECK-NEXT: s_branch .LBB0_7 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: s_mov_b32 s10, s8 +; CHECK-NEXT: s_mov_b32 s11, s8 +; CHECK-NEXT: s_mov_b32 s9, s8 +; CHECK-NEXT: s_mov_b64 s[38:39], s[10:11] +; CHECK-NEXT: s_mov_b64 s[36:37], s[8:9] +; CHECK-NEXT: s_branch .LBB0_7 +; CHECK-NEXT: .LBB0_5: ; %if.then263.i.i +; CHECK-NEXT: v_cmp_lt_f32_e64 s8, s41, 0 +; CHECK-NEXT: s_mov_b32 s36, 1.0 +; CHECK-NEXT: s_mov_b32 s12, 0x7fc00000 +; CHECK-NEXT: s_mov_b32 s37, s36 +; CHECK-NEXT: s_mov_b32 s38, s36 +; CHECK-NEXT: s_mov_b32 s39, s36 +; CHECK-NEXT: s_andn2_b32 vcc_lo, exec_lo, s8 +; CHECK-NEXT: s_cbranch_vccnz .LBB0_7 +; CHECK-NEXT: .LBB0_6: ; %if.end273.i.i +; CHECK-NEXT: s_add_u32 s8, s34, 40 +; CHECK-NEXT: s_addc_u32 s9, s35, 0 +; CHECK-NEXT: s_getpc_b64 s[18:19] +; CHECK-NEXT: s_add_u32 s18, s18, _Z3dotDv3_fS_@gotpcrel32@lo+4 +; CHECK-NEXT: s_addc_u32 s19, s19, _Z3dotDv3_fS_@gotpcrel32@hi+12 +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; CHECK-NEXT: s_load_dwordx2 s[18:19], s[18:19], 0x0 +; CHECK-NEXT: v_lshlrev_b32_e32 v3, 10, v1 +; CHECK-NEXT: v_add_f32_e64 v1, s12, s36 +; CHECK-NEXT: s_mov_b32 s12, s14 +; CHECK-NEXT: s_mov_b32 s13, s15 +; CHECK-NEXT: s_mov_b32 s14, s16 +; CHECK-NEXT: v_or3_b32 v31, v0, v3, v2 +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: s_mov_b32 s36, 0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_swappc_b64 s[30:31], s[18:19] +; CHECK-NEXT: s_mov_b32 s37, s36 +; CHECK-NEXT: s_mov_b32 s38, s36 +; CHECK-NEXT: s_mov_b32 s39, s36 +; CHECK-NEXT: .LBB0_7: ; %if.end294.i.i +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:12 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; CHECK-NEXT: .LBB0_8: ; %kernel_direct_lighting.exit +; CHECK-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x20 +; CHECK-NEXT: v_mov_b32_e32 v0, s36 +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, s37 +; CHECK-NEXT: v_mov_b32_e32 v2, s38 +; CHECK-NEXT: v_mov_b32_e32 v3, s39 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: global_store_dwordx4 v4, v[0:3], s[4:5] +; CHECK-NEXT: s_endpgm +entry: + %cmp5.i.i = icmp eq i32 %cmp5.i.i.arg, 0 + br i1 %cmp5.i.i, label %if.end13.i.i, label %kernel_direct_lighting.exit + +if.end13.i.i: ; preds = %entry + %cmp221.i.i = icmp eq i32 %cmp221.i.i.arg, 0 + br i1 %cmp221.i.i, label %if.end294.i.i, label %if.else251.i.i + +if.else251.i.i: ; preds = %if.end13.i.i + %cmp262.i.i = icmp eq i32 %cmp262.i.i.arg, 0 + br i1 %cmp262.i.i, label %if.then263.i.i, label %if.end273.i.i + +if.then263.i.i: ; preds = %if.else251.i.i + %cmp.i604.i.i = fcmp olt float %i1.i, 0.000000e+00 + br i1 %cmp.i604.i.i, label %if.end273.i.i, label %if.end294.i.i + +if.end273.i.i: ; preds = %if.then263.i.i, %if.else251.i.i + %i = phi float [ 1.000000e+00, %if.then263.i.i ], [ 0.000000e+00, %if.else251.i.i ] + %i1 = phi float [ 0x7FF8000000000000, %if.then263.i.i ], [ 0.000000e+00, %if.else251.i.i ] + %extractVec278.i.i = insertelement <3 x float> zeroinitializer, float %i1, i64 0 + %extractVec3.i.i.i = insertelement <3 x float> zeroinitializer, float %i, i64 0 + %i3.i = fadd <3 x float> %extractVec278.i.i, %extractVec3.i.i.i + %call32.i.i.i = tail call float @_Z3dotDv3_fS_(<3 x float> %i3.i) + br label %if.end294.i.i + +if.end294.i.i: ; preds = %if.end273.i.i, %if.then263.i.i, %if.end13.i.i + %ls111.sroa.0.2.i = phi <4 x float> [ zeroinitializer, %if.end13.i.i ], [ zeroinitializer, %if.end273.i.i ], [ , %if.then263.i.i ] + store <4 x float> zeroinitializer, ptr addrspace(5) null, align 4 + br label %kernel_direct_lighting.exit + +kernel_direct_lighting.exit: ; preds = %if.end294.i.i, %entry + %ls111.sroa.0.3.i = phi <4 x float> [ %ls111.sroa.0.2.i, %if.end294.i.i ], [ %extractVec358.i.i, %entry ] + store <4 x float> %ls111.sroa.0.3.i, ptr addrspace(1) %arg, align 4 + ret void +} + +declare float @_Z3dotDv3_fS_(<3 x float>) Index: llvm/test/CodeGen/AMDGPU/liveout-implicit-def-subreg-redef-blender-verifier-error.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/liveout-implicit-def-subreg-redef-blender-verifier-error.mir @@ -0,0 +1,148 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -run-pass=register-coalescer -verify-coalescing -o - %s | FileCheck %s + +# Make sure coalescing doesn't produce "no live segment at def" when +# there is a live out implicit_def with subranges. + +# %1 will be coalesced into %0. %0 is a cross block implicit_def that +# cannot be deleted. The def of %0 in %bb.2 is a live out subregister +# def of the same register. We need to ensure that the resulting +# subrange for %0.sub0 includes the def in %bb.1 + +--- +name: liveout_implicit_def_super_reg_redefine_sub0_implicit_def +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: liveout_implicit_def_super_reg_redefine_sub0_implicit_def + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit undef $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %0.sub0:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %0.sub0:sgpr_128 = IMPLICIT_DEF + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: S_NOP 0, implicit %0 + ; CHECK-NEXT: S_NOP 0, implicit %0.sub0 + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + S_CBRANCH_SCC0 %bb.2, implicit undef $scc + + bb.1: + %0:sgpr_128 = IMPLICIT_DEF + %1:sgpr_32 = S_MOV_B32 0 + S_BRANCH %bb.3 + + bb.2: + undef %0.sub0:sgpr_128 = IMPLICIT_DEF + %1:sgpr_32 = COPY %0.sub0 + + bb.3: + S_NOP 0, implicit %0 + S_NOP 0, implicit %1 + S_ENDPGM 0 + +... + + +# Redef of sub0 is a meaningful value. +--- +name: liveout_implicit_def_redefine_sub0_undef_other +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: liveout_implicit_def_redefine_sub0_undef_other + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit undef $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %0.sub0:sgpr_128 = S_MOV_B32 0 + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %0.sub0:sgpr_128 = S_MOV_B32 9 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: S_NOP 0, implicit %0 + ; CHECK-NEXT: S_NOP 0, implicit %0.sub0 + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + S_CBRANCH_SCC0 %bb.2, implicit undef $scc + + bb.1: + %0:sgpr_128 = IMPLICIT_DEF + %1:sgpr_32 = S_MOV_B32 0 + S_BRANCH %bb.3 + + bb.2: + undef %0.sub0:sgpr_128 = S_MOV_B32 9 + %1:sgpr_32 = COPY %0.sub0 + + bb.3: + S_NOP 0, implicit %0 + S_NOP 0, implicit %1 + S_ENDPGM 0 + +... + +# The initial def of the register doesn't doesn't cover the redefined +# lanes. This had no error but was useful to compare against the +# failing cases. +--- +name: only_redefine_undefined_lanes +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: only_redefine_undefined_lanes + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit undef $scc + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_NOP 0, implicit-def undef %0.sub1_sub2_sub3 + ; CHECK-NEXT: %0.sub0:vreg_128 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: undef %0.sub0:vreg_128 = V_MOV_B32_e32 9, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: S_NOP 0, implicit %0 + ; CHECK-NEXT: S_NOP 0, implicit %0.sub0 + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + S_CBRANCH_SCC0 %bb.2, implicit undef $scc + + bb.1: + S_NOP 0, implicit-def undef %0.sub1_sub2_sub3:vreg_128 + %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec + S_BRANCH %bb.3 + + bb.2: + undef %0.sub0:vreg_128 = V_MOV_B32_e32 9, implicit $exec + %1:vgpr_32 = COPY %0.sub0 + + bb.3: + S_NOP 0, implicit %0 + S_NOP 0, implicit %1 + S_ENDPGM 0 + +...