diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1783,6 +1783,11 @@
         continue;
 
       foldInstOperand(MI, OpToFold);
+
+      // If we managed to fold all uses of this copy then we might as well
+      // delete it now.
+      if (MRI->use_nodbg_empty(MI.getOperand(0).getReg()))
+        MI.eraseFromParentAndMarkDBGValuesForRemoval();
     }
   }
   return true;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -1220,9 +1220,8 @@
 ; SI-NEXT:    v_ffbh_u32_e32 v3, 0
 ; SI-NEXT:    v_cmp_eq_u32_e64 vcc, 0, 0
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; SI-NEXT:    v_mov_b32_e32 v3, 0xbe
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
-; SI-NEXT:    v_sub_i32_e32 v4, vcc, v3, v2
+; SI-NEXT:    v_sub_i32_e32 v4, vcc, 0xbe, v2
 ; SI-NEXT:    v_lshl_b64 v[2:3], v[0:1], v2
 ; SI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; SI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v3
@@ -1251,9 +1250,8 @@
 ; VI-NEXT:    v_ffbh_u32_e32 v3, 0
 ; VI-NEXT:    v_cmp_eq_u32_e64 vcc, 0, 0
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; VI-NEXT:    v_mov_b32_e32 v3, 0xbe
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
-; VI-NEXT:    v_sub_u32_e32 v4, vcc, v3, v2
+; VI-NEXT:    v_sub_u32_e32 v4, vcc, 0xbe, v2
 ; VI-NEXT:    v_lshlrev_b64 v[2:3], v2, v[0:1]
 ; VI-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; VI-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v3
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-fold-fi.mir
@@ -8,7 +8,6 @@
 body:             |
   bb.0.entry:
     ; GCN-LABEL: name: test_fold_fi_scratch_load_vgpr
-    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GCN: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 4, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; GCN: S_ENDPGM 0
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
@@ -24,7 +23,6 @@
 body:             |
   bb.0.entry:
     ; GCN-LABEL: name: test_fold_fi_scratch_load_sgpr
-    ; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 %stack.0
     ; GCN: [[SCRATCH_LOAD_DWORD_SADDR:%[0-9]+]]:vgpr_32 = SCRATCH_LOAD_DWORD_SADDR %stack.0, 4, 0, implicit $exec, implicit $flat_scr :: (load 4 from %stack.0, addrspace 5)
     ; GCN: S_ENDPGM 0
     %0:sgpr_32 = S_MOV_B32 %stack.0
@@ -40,7 +38,6 @@
 body:             |
   bb.0.entry:
     ; GCN-LABEL: name: test_fold_fi_scratch_store_vgpr
-    ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; GCN: SCRATCH_STORE_DWORD_SADDR [[DEF]], %stack.0, 4, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
     ; GCN: S_ENDPGM 0
@@ -76,7 +73,6 @@
 body:             |
   bb.0.entry:
     ; GCN-LABEL: name: test_fold_fi_scratch_store_sgpr
-    ; GCN: [[S_MOV_B32_:%[0-9]+]]:sgpr_32 = S_MOV_B32 %stack.0
     ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
     ; GCN: SCRATCH_STORE_DWORD_SADDR [[DEF]], %stack.0, 4, 0, implicit $exec, implicit $flat_scr :: (store 4 into %stack.0, addrspace 5)
     ; GCN: S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/fold-cndmask-wave32.mir b/llvm/test/CodeGen/AMDGPU/fold-cndmask-wave32.mir
--- a/llvm/test/CodeGen/AMDGPU/fold-cndmask-wave32.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-cndmask-wave32.mir
@@ -11,7 +11,6 @@
     ; CHECK: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
     ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; CHECK: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %0:sreg_32_xm0_xexec = IMPLICIT_DEF
     %1:sreg_32 = S_MOV_B32 0
     %2:vgpr_32 = COPY %1:sreg_32
diff --git a/llvm/test/CodeGen/AMDGPU/fold-cndmask.mir b/llvm/test/CodeGen/AMDGPU/fold-cndmask.mir
--- a/llvm/test/CodeGen/AMDGPU/fold-cndmask.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-cndmask.mir
@@ -3,7 +3,6 @@
 # CHECK: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
 # CHECK: %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
 # CHECK: %4:vgpr_32 = COPY %3
-# CHECK: %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
 # CHECK: %6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
 # CHECK: %7:vgpr_32 = COPY %3
 
diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
@@ -375,7 +375,6 @@
 ...
 ---
 # CHECK-LABEL: name: add_i32_1_multi_f16_use
-# CHECK: %13:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
 # CHECK: %14:vgpr_32 = V_ADD_F16_e32 1, killed %11, implicit $mode, implicit $exec
 # CHECK: %15:vgpr_32 = V_ADD_F16_e32 1, killed %12, implicit $mode, implicit $exec
 
@@ -440,7 +439,6 @@
 ---
 # CHECK-LABEL: name: add_i32_m2_one_f32_use_multi_f16_use
-# CHECK: %14:vgpr_32 = V_MOV_B32_e32 -2, implicit $exec
 # CHECK: %15:vgpr_32 = V_ADD_F16_e32 -2, %11, implicit $mode, implicit $exec
 # CHECK: %16:vgpr_32 = V_ADD_F16_e32 -2, %12, implicit $mode, implicit $exec
 # CHECK: %17:vgpr_32 = V_ADD_F32_e32 -2, killed %13, implicit $mode, implicit $exec
 
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-order.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-order.mir
--- a/llvm/test/CodeGen/AMDGPU/fold-operands-order.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-operands-order.mir
@@ -6,10 +6,7 @@
 # aren't made in users before the def is seen.
 
 # GCN-LABEL: name: mov_in_use_list_2x{{$}}
-# GCN: %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-# GCN-NEXT: %3:vgpr_32 = COPY undef %0
-
-# GCN: %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+# GCN: %3:vgpr_32 = COPY undef %0
 
 
 name: mov_in_use_list_2x
diff --git a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
--- a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
@@ -12,10 +12,22 @@
     S_NOP 0, implicit %1
 ...
 
+# GCN-LABEL: name: fold-imm-readfirstlane-dbgvalue{{$}}
+# GCN: %1:sreg_32_xm0 = S_MOV_B32 123
+# GCN: DBG_VALUE $noreg, 0, 0
+---
+name: fold-imm-readfirstlane-dbgvalue
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    %0:vgpr_32 = V_MOV_B32_e32 123, implicit $exec
+    %1:sreg_32_xm0 = V_READFIRSTLANE_B32 %0, implicit $exec
+    DBG_VALUE %0, 0, 0
+    S_NOP 0, implicit %1
+...
+
 # GCN-LABEL: name: fold-imm-readfirstlane-readfirstlane{{$}}
-# GCN: %0:vgpr_32 = V_MOV_B32_e32 123, implicit $exec
 # GCN: %1:sreg_32_xm0 = S_MOV_B32 123
-# GCN: %2:vgpr_32 = V_MOV_B32_e32 123, implicit $exec
 # GCN: %3:sreg_32_xm0 = COPY %1
 
 ---
@@ -33,7 +45,6 @@
 
 # GCN-LABEL: name: fold-copy-readfirstlane{{$}}
 # GCN: %0:sreg_32_xm0 = COPY $sgpr10
-# GCN: %1:vgpr_32 = COPY %0
 # GCN: %2:sreg_32_xm0 = COPY %0
 ---
 name: fold-copy-readfirstlane
diff --git a/llvm/test/CodeGen/AMDGPU/huge-number-operand-folds.mir b/llvm/test/CodeGen/AMDGPU/huge-number-operand-folds.mir
--- a/llvm/test/CodeGen/AMDGPU/huge-number-operand-folds.mir
+++ b/llvm/test/CodeGen/AMDGPU/huge-number-operand-folds.mir
@@ -13,7 +13,6 @@
     liveins: $sgpr12_sgpr13_sgpr14_sgpr15
 
     ; GCN-LABEL: name: op_idx_overflows_uchar
-    ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GCN: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]]
     %0:sreg_32 = S_MOV_B32 0
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -392,8 +392,8 @@
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -1205,11 +1205,10 @@
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 17, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, v2
 ; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
-; GCN-NEXT:    s_mov_b32 s4, 0x8000
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, v2
 ; GCN-NEXT:    v_sub_i32_e32 v5, vcc, 0, v3
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0x8000, v4
 ; GCN-NEXT:    v_subb_u32_e64 v5, s[4:5], v5, v1, vcc
 ; GCN-NEXT:    v_sub_i32_e64 v6, s[4:5], v4, v0
 ; GCN-NEXT:    v_subbrev_u32_e64 v5, s[4:5], 0, v5, s[4:5]
@@ -1246,8 +1245,7 @@
 ; GCN-IR-NEXT:    v_ffbh_u32_e32 v3, v1
 ; GCN-IR-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v8, v3, v2, vcc
-; GCN-IR-NEXT:    s_movk_i32 s6, 0xffd0
-; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, s6, v8
+; GCN-IR-NEXT:    v_add_i32_e32 v4, vcc, 0xffffffd0, v8
 ; GCN-IR-NEXT:    v_addc_u32_e64 v5, s[6:7], 0, -1, vcc
 ; GCN-IR-NEXT:    v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
 ; GCN-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -1226,10 +1226,9 @@
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, v2
 ; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v2, v0, v2
-; GCN-NEXT:    s_mov_b32 s4, 0x8000
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
-; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
+; GCN-NEXT:    v_sub_i32_e32 v2, vcc, 0x8000, v2
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[4:5], v4, v1, vcc
 ; GCN-NEXT:    v_sub_i32_e64 v5, s[4:5], v2, v0
 ; GCN-NEXT:    v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5]