diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1177,6 +1177,13 @@ MachineOperand &Op = MI.getOperand(I); if (!Op.isReg()) continue; + + // D16 buffer instructions do not actually read the tied source + // operand, so we can skip the source operand. + if (Op.isTied() && Op.isUse() && + (MI.getDesc().TSFlags & SIInstrFlags::D16Buf)) + continue; + RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I); diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -22,7 +22,6 @@ ; FLATSCR-NEXT: s_mov_b32 s0, 2 ; FLATSCR-NEXT: scratch_load_ushort v0, off, s0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -45,7 +44,6 @@ ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0 ; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0 -; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, off, s0 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] @@ -57,7 +55,6 @@ ; GFX11-NEXT: s_mov_b32 s0, 2 ; GFX11-NEXT: scratch_load_u16 v0, off, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_load_d16_hi_b16 v0, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -86,7 +83,7 @@ ; FLATSCR: ; %bb.0: ; %bb ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: scratch_load_ushort v0, v0, off -; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_nop 0 ; FLATSCR-NEXT: scratch_load_short_d16_hi v0, v1, off ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; 
FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -106,7 +103,6 @@ ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, v0, off -; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, v1, off ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] @@ -116,7 +112,6 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_u16 v0, v0, off -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_load_d16_hi_b16 v0, v1, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -279,7 +274,6 @@ ; GCN-NEXT: global_load_ushort v0, v[0:1], off ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_short_d16_hi v0, v[1:2], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -293,7 +287,6 @@ ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -307,7 +300,6 @@ ; GFX11-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[1:2], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -327,7 +319,7 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_ushort v0, v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -337,7 +329,6 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 
0x0 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -347,7 +338,6 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -2549,7 +2549,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_ushort v2, v[0:1], off glc -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:2 glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v0, v2 @@ -2581,7 +2581,7 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: global_load_ushort v2, v[0:1], off glc -; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_nop 0 ; GFX900-FLATSCR-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:2 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v2