diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1632,13 +1632,15 @@ // TODO: Could insert earlier and schedule more liberally with operations // that only use caller preserved registers. MachineBasicBlock &EntryBB = MF.front(); + MachineBasicBlock::iterator I = EntryBB.begin(); + for (MachineBasicBlock::iterator E = EntryBB.end(); + I != E && (I->isPHI() || I->isMetaInstruction()); ++I) + ; + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0); if (ST->hasVscnt()) - BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), - TII->get(AMDGPU::S_WAITCNT_VSCNT)) - .addReg(AMDGPU::SGPR_NULL, RegState::Undef) - .addImm(0); - BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); + BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT)) + .addReg(AMDGPU::SGPR_NULL, RegState::Undef) + .addImm(0); Modified = true; } diff --git a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll --- a/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll +++ b/llvm/test/CodeGen/AMDGPU/split-arg-dbg-value.ll @@ -7,12 +7,12 @@ ; GCN-NEXT: .file 0 ; GCN-NEXT: .loc 0 3 0 ; /tmp/dbg.cl:3:0 ; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: .Ltmp0: ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 96 32] $vgpr3 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 64 32] $vgpr2 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: .Ltmp0: ; GCN-NEXT: .loc 0 4 5 prologue_end ; /tmp/dbg.cl:4:5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; GCN-NEXT: .Ltmp1: @@ -25,14 +25,14 @@ ; GCN: .Lfunc_begin1: ; GCN-NEXT: .loc 0 7 0 ; /tmp/dbg.cl:7:0 ; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: .Ltmp2: ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg1 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr5 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg1 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr4 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg0 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 96 32] $vgpr3 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg0 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 64 32] $vgpr2 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg0 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f32_multi_arg:arg0 <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: .Ltmp2: ; GCN-NEXT: .loc 0 8 17 prologue_end ; /tmp/dbg.cl:8:17 ; GCN-NEXT: v_add_f32_e32 v0, v4, v0 ; GCN-NEXT: .Ltmp3: @@ -57,10 +57,10 @@ ; GCN: .Lfunc_begin2: ; GCN-NEXT: .loc 0 11 0 is_stmt 1 ; /tmp/dbg.cl:11:0 ; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: .Ltmp8: ; GCN-NEXT: ;DEBUG_VALUE: split_v4f16_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_v4f16_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: .Ltmp8: ; GCN-NEXT: .loc 0 12 5 prologue_end ; /tmp/dbg.cl:12:5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; GCN-NEXT: .Ltmp9: @@ -73,10 +73,10 @@ ; GCN: .Lfunc_begin3: ; GCN-NEXT: .loc 0 15 0 ; /tmp/dbg.cl:15:0 ; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: .Ltmp10: ; GCN-NEXT: ;DEBUG_VALUE: split_f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: .Ltmp10: ; GCN-NEXT: .loc 0 16 5 prologue_end ; /tmp/dbg.cl:16:5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; GCN-NEXT: .Ltmp11: @@ -89,12 +89,12 @@ ; GCN: .Lfunc_begin4: ; GCN-NEXT: .loc 0 19 0 ; /tmp/dbg.cl:19:0 ; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: .Ltmp12: ; GCN-NEXT: ;DEBUG_VALUE: split_v2f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 96 32] $vgpr3 ; GCN-NEXT: ;DEBUG_VALUE: split_v2f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 64 32] $vgpr2 ; GCN-NEXT: ;DEBUG_VALUE: split_v2f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_v2f64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: .Ltmp12: ; GCN-NEXT: .loc 0 20 5 prologue_end ; /tmp/dbg.cl:20:5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; GCN-NEXT: .Ltmp13: @@ -107,10 +107,10 @@ ; GCN: .Lfunc_begin5: ; GCN-NEXT: .loc 0 23 0 ; /tmp/dbg.cl:23:0 ; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: .Ltmp14: ; GCN-NEXT: ;DEBUG_VALUE: split_i64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_i64_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: .Ltmp14: ; GCN-NEXT: .loc 0 24 5 prologue_end ; /tmp/dbg.cl:24:5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; GCN-NEXT: .Ltmp15: @@ -123,10 +123,10 @@ ; GCN: .Lfunc_begin6: ; GCN-NEXT: .loc 0 27 0 ; /tmp/dbg.cl:27:0 ; GCN-NEXT: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: .Ltmp16: ; GCN-NEXT: ;DEBUG_VALUE: split_ptr_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 32 32] $vgpr1 ; GCN-NEXT: ;DEBUG_VALUE: split_ptr_arg:arg <- [DW_OP_constu 1, DW_OP_swap, DW_OP_xderef, DW_OP_LLVM_fragment 0 32] $vgpr0 +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: .Ltmp16: ; GCN-NEXT: .loc 0 28 5 prologue_end ; /tmp/dbg.cl:28:5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; GCN-NEXT: .Ltmp17: diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -march=amdgcn -mcpu=gfx1010 | FileCheck %s --check-prefix=GCN -define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) { +define void @vgpr_descriptor_waterfall_loop_idom_update(<4 x i32>* %arg) #0 { ; GCN-LABEL: vgpr_descriptor_waterfall_loop_idom_update: ; GCN: ; %bb.0: ; %entry +; GCN-NEXT: ; implicit-def: $vcc_hi ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_waitcnt_vscnt null, 0x0 -; GCN-NEXT: ; implicit-def: $vcc_hi ; GCN-NEXT: BB0_1: ; %bb0 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB0_2 Depth 2 diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-skip-meta.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-skip-meta.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-skip-meta.mir @@ -0,0 +1,96 @@ +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass si-insert-waitcnts %s -o - | FileCheck %s + +# Ensure we insert waitcnts after any meta instructions at the start of +# non-kernel functions. Without this, the inserted waitcnts can affect e.g. the +# PC ranges covered by CFI and debug values. + +--- +# CHECK-LABEL: name: skip_implicit_def{{$}} +# CHECK: IMPLICIT_DEF +# CHECK: S_WAITCNT +name: skip_implicit_def +machineFunctionInfo: +body: | + bb.0: + $sgpr0 = IMPLICIT_DEF +... +--- +# CHECK-LABEL: name: skip_kill{{$}} +# CHECK: KILL +# CHECK: S_WAITCNT +name: skip_kill +machineFunctionInfo: +body: | + bb.0: + KILL $sgpr0 +... +--- +# CHECK-LABEL: name: skip_cfi{{$}} +# CHECK: CFI_INSTRUCTION +# CHECK: S_WAITCNT +name: skip_cfi +machineFunctionInfo: +body: | + bb.0: + CFI_INSTRUCTION undefined $sgpr0 +... +--- +# CHECK-LABEL: name: skip_eh_label{{$}} +# CHECK: EH_LABEL +# CHECK: S_WAITCNT +name: skip_eh_label +machineFunctionInfo: +body: | + bb.0: + EH_LABEL 0 +... +--- +# CHECK-LABEL: name: skip_gc_label{{$}} +# CHECK: GC_LABEL +# CHECK: S_WAITCNT +name: skip_gc_label +machineFunctionInfo: +body: | + bb.0: + GC_LABEL 0 +... +--- +# CHECK-LABEL: name: skip_dbg_value{{$}} +# CHECK: DBG_VALUE +# CHECK: S_WAITCNT +name: skip_dbg_value +machineFunctionInfo: +body: | + bb.0: + DBG_VALUE 0 +... +--- +# CHECK-LABEL: name: skip_dbg_label{{$}} +# CHECK: DBG_LABEL +# CHECK: S_WAITCNT +name: skip_dbg_label +machineFunctionInfo: +body: | + bb.0: + DBG_LABEL 0 +... +--- +# CHECK-LABEL: name: skip_lifetime_start{{$}} +# CHECK: LIFETIME_START +# CHECK: S_WAITCNT +name: skip_lifetime_start +machineFunctionInfo: +body: | + bb.0: + LIFETIME_START 0 +... +--- +# CHECK-LABEL: name: skip_lifetime_end{{$}} +# CHECK: LIFETIME_END +# CHECK: S_WAITCNT +name: skip_lifetime_end +machineFunctionInfo: +body: | + bb.0: + LIFETIME_END 0 +...