Index: llvm/lib/CodeGen/ExpandPostRAPseudos.cpp =================================================================== --- llvm/lib/CodeGen/ExpandPostRAPseudos.cpp +++ llvm/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -68,9 +68,16 @@ MachineBasicBlock::iterator CopyMI = MI; --CopyMI; - for (const MachineOperand &MO : MI->implicit_operands()) - if (MO.isReg()) - CopyMI->addOperand(MO); + Register DstReg = MI->getOperand(0).getReg(); + for (const MachineOperand &MO : MI->implicit_operands()) { + CopyMI->addOperand(MO); + + // Be conservative about preserving kills when subregister defs are + // involved. If there was an implicit kill of a super-register overlapping + // the copy result, we would kill the subregisters previous copies defined. + if (MO.isKill() && TRI->regsOverlap(DstReg, MO.getReg())) + CopyMI->getOperand(CopyMI->getNumOperands() - 1).setIsKill(false); + } } bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) { Index: llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/copy-phys-reg-implicit-operand-kills-subregs.mir @@ -0,0 +1,24 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs -run-pass=postrapseudos -o - %s | FileCheck %s + +# The copy has an implicit kill of a superregister which overlaps the +# register it defines. We cannot preserve the kill of the tuple +# when copying implicit operands to the last inserted v_mov_b32, since +# it would kill the subregister defined earlier in the expansion.
+ +--- +name: copy_has_implicit_kill_superreg +tracksRegLiveness: true +body: | + bb.0: + + ; CHECK-LABEL: name: copy_has_implicit_kill_superreg + ; CHECK: renamable $vgpr7_vgpr8_vgpr9_vgpr10 = IMPLICIT_DEF + ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit-def $vgpr7_vgpr8, implicit $vgpr10_vgpr11 + ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr10_vgpr11, implicit $vgpr7_vgpr8_vgpr9_vgpr10 + ; CHECK-NEXT: S_ENDPGM 0, implicit $vgpr7 + renamable $vgpr7_vgpr8_vgpr9_vgpr10 = IMPLICIT_DEF + renamable $vgpr7_vgpr8 = COPY killed renamable $vgpr10_vgpr11, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10 + S_ENDPGM 0, implicit $vgpr7 + +... Index: llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/overlapping-tuple-copy-implicit-op-failure.ll @@ -0,0 +1,102 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s + +; Testcase which happened to trigger a liveness verifier error +define amdgpu_kernel void @test_long_add4(<4 x i64> %arg) #0 { +; CHECK-LABEL: test_long_add4: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b64 s[4:5], 0 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: global_load_dwordx4 v[7:10], v[0:1], off +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:16 +; CHECK-NEXT: ; kill: def $vgpr7_vgpr8_vgpr9_vgpr10 killed $vgpr7_vgpr8_vgpr9_vgpr10 def $vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14 killed $exec +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v14, v3 +; CHECK-NEXT: v_mov_b32_e32 v13, v2 +; CHECK-NEXT: v_mov_b32_e32 v12, v1 +; CHECK-NEXT: v_mov_b32_e32 v11, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, 
s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: global_load_dwordx4 v[18:21], v[0:1], off +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:16 +; CHECK-NEXT: ; kill: def $vgpr18_vgpr19_vgpr20_vgpr21 killed $vgpr18_vgpr19_vgpr20_vgpr21 def $vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25 killed $exec +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: v_mov_b32_e32 v25, v3 +; CHECK-NEXT: v_mov_b32_e32 v24, v2 +; CHECK-NEXT: v_mov_b32_e32 v23, v1 +; CHECK-NEXT: v_mov_b32_e32 v22, v0 +; CHECK-NEXT: v_mov_b32_e32 v2, v7 +; CHECK-NEXT: v_mov_b32_e32 v3, v8 +; CHECK-NEXT: v_mov_b32_e32 v0, v9 +; CHECK-NEXT: v_mov_b32_e32 v1, v10 +; CHECK-NEXT: v_mov_b32_e32 v5, v11 +; CHECK-NEXT: v_mov_b32_e32 v6, v12 +; CHECK-NEXT: v_mov_b32_e32 v12, v13 +; CHECK-NEXT: v_mov_b32_e32 v13, v14 +; CHECK-NEXT: v_mov_b32_e32 v8, v18 +; CHECK-NEXT: v_mov_b32_e32 v9, v19 +; CHECK-NEXT: v_mov_b32_e32 v16, v20 +; CHECK-NEXT: v_mov_b32_e32 v17, v21 +; CHECK-NEXT: v_mov_b32_e32 v14, v22 +; CHECK-NEXT: v_mov_b32_e32 v15, v23 +; CHECK-NEXT: v_mov_b32_e32 v10, v24 +; CHECK-NEXT: v_mov_b32_e32 v11, v25 +; CHECK-NEXT: v_mov_b32_e32 v4, v2 +; CHECK-NEXT: v_mov_b32_e32 v2, v3 +; CHECK-NEXT: v_mov_b32_e32 v7, v8 +; CHECK-NEXT: v_mov_b32_e32 v3, v9 +; CHECK-NEXT: v_add_co_u32 v7, s6, v4, v7 +; CHECK-NEXT: v_add_co_ci_u32_e64 v2, s6, v2, v3, s6 +; CHECK-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v8, v2 +; CHECK-NEXT: v_mov_b32_e32 v2, v0 +; CHECK-NEXT: v_mov_b32_e32 v0, v1 +; CHECK-NEXT: v_mov_b32_e32 v3, v16 +; CHECK-NEXT: v_mov_b32_e32 v1, v17 +; CHECK-NEXT: v_add_co_u32 v3, s6, v2, v3 +; CHECK-NEXT: v_add_co_ci_u32_e64 v0, s6, v0, v1, s6 +; CHECK-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v4, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, v5 +; CHECK-NEXT: v_mov_b32_e32 v0, v6 +; CHECK-NEXT: v_mov_b32_e32 v5, v14 
+; CHECK-NEXT: v_mov_b32_e32 v2, v15 +; CHECK-NEXT: v_add_co_u32 v1, s6, v1, v5 +; CHECK-NEXT: v_add_co_ci_u32_e64 v0, s6, v0, v2, s6 +; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v2, v0 +; CHECK-NEXT: v_mov_b32_e32 v5, v12 +; CHECK-NEXT: v_mov_b32_e32 v0, v13 +; CHECK-NEXT: v_mov_b32_e32 v9, v10 +; CHECK-NEXT: v_mov_b32_e32 v6, v11 +; CHECK-NEXT: v_add_co_u32 v5, s6, v5, v9 +; CHECK-NEXT: v_add_co_ci_u32_e64 v0, s6, v0, v6, s6 +; CHECK-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v6, v0 +; CHECK-NEXT: ; kill: def $vgpr7_vgpr8 killed $vgpr7_vgpr8 def $vgpr7_vgpr8_vgpr9_vgpr10 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v10, v4 +; CHECK-NEXT: v_mov_b32_e32 v9, v3 +; CHECK-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v3, v5 +; CHECK-NEXT: v_mov_b32_e32 v4, v6 +; CHECK-NEXT: v_mov_b32_e32 v6, s5 +; CHECK-NEXT: v_mov_b32_e32 v5, s4 +; CHECK-NEXT: global_store_dwordx4 v[5:6], v[7:10], off +; CHECK-NEXT: s_mov_b64 s[4:5], 16 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: global_store_dwordx4 v0, v[1:4], s[4:5] +; CHECK-NEXT: s_endpgm +entry: + %load0 = load <4 x i64>, <4 x i64> addrspace(1)* null, align 32 + %load1 = load <4 x i64>, <4 x i64> addrspace(1)* null, align 32 + %add = add <4 x i64> %load0, %load1 + store <4 x i64> %add, <4 x i64> addrspace(1)* null, align 32 + ret void +} + +attributes #0 = { noinline optnone } Index: llvm/test/CodeGen/X86/pr28560.ll =================================================================== --- llvm/test/CodeGen/X86/pr28560.ll +++ llvm/test/CodeGen/X86/pr28560.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=i686-pc-linux -print-after=postrapseudos < %s 2>&1 | FileCheck %s -; CHECK: MOV8rr ${{[a-d]}}l, implicit killed $e[[R:[a-d]]]x, implicit-def $e[[R]]x +; CHECK: MOV8rr ${{[a-d]}}l, implicit $e[[R:[a-d]]]x, implicit-def $e[[R]]x define i32 @foo(i32 %i, 
i32 %k, i8* %p) { %f = icmp ne i32 %i, %k %s = zext i1 %f to i8