diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -810,6 +810,7 @@ >; let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { +let TiedSourceNotRead = 1 in { defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads < "buffer_load_format_d16_x", i32 >; @@ -822,6 +823,7 @@ defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Loads < "buffer_load_format_d16_xyzw", v4i32 >; +} defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Stores < "buffer_store_format_d16_x", i32 >; @@ -837,6 +839,7 @@ } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { +let TiedSourceNotRead = 1 in { defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads < "buffer_load_format_d16_x", f16 >; @@ -849,6 +852,7 @@ defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads < "buffer_load_format_d16_xyzw", v4f16 >; +} defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores < "buffer_store_format_d16_x", f16 >; @@ -1067,6 +1071,7 @@ } let SubtargetPredicate = HasD16LoadStore in { +let TiedSourceNotRead = 1 in { defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads < "buffer_load_ubyte_d16", i32, 1 @@ -1092,6 +1097,11 @@ "buffer_load_short_d16_hi", i32, 1 >; +defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads < + "buffer_load_format_d16_hi_x", i32 +>; +} // End TiedSourceNotRead + defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores < "buffer_store_byte_d16_hi", i32 >; @@ -1100,9 +1110,6 @@ "buffer_store_short_d16_hi", i32 >; -defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads < - "buffer_load_format_d16_hi_x", i32 ->; defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores < "buffer_store_format_d16_hi_x", i32 >; @@ -1146,10 +1153,12 @@ defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128, 4>; let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { +let TiedSourceNotRead = 1 in { defm 
TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64, 2>; defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96, 3>; defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128, 4>; +} defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64, 2>; defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96, 3>; @@ -1157,10 +1166,12 @@ } // End HasUnpackedD16VMem. let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { +let TiedSourceNotRead = 1 in { defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32, 2>; defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64, 3>; defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64, 4>; +} defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32, 2>; defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64, 3>; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -649,7 +649,7 @@ defm DS_READ2ST64_B64: DS_1A_Off8_RET_mc<"ds_read2st64_b64", VReg_128>; let has_m0_read = 0 in { -let SubtargetPredicate = HasD16LoadStore in { +let SubtargetPredicate = HasD16LoadStore, TiedSourceNotRead = 1 in { 
def DS_READ_U8_D16 : DS_1A_RET_Tied<"ds_read_u8_d16">; def DS_READ_U8_D16_HI : DS_1A_RET_Tied<"ds_read_u8_d16_hi">; def DS_READ_I8_D16 : DS_1A_RET_Tied<"ds_read_i8_d16">; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -595,12 +595,14 @@ def FLAT_STORE_DWORDX3 : FLAT_Store_Pseudo <"flat_store_dwordx3", VReg_96>; let SubtargetPredicate = HasD16LoadStore in { +let TiedSourceNotRead = 1 in { def FLAT_LOAD_UBYTE_D16 : FLAT_Load_Pseudo <"flat_load_ubyte_d16", VGPR_32, 1>; def FLAT_LOAD_UBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_ubyte_d16_hi", VGPR_32, 1>; def FLAT_LOAD_SBYTE_D16 : FLAT_Load_Pseudo <"flat_load_sbyte_d16", VGPR_32, 1>; def FLAT_LOAD_SBYTE_D16_HI : FLAT_Load_Pseudo <"flat_load_sbyte_d16_hi", VGPR_32, 1>; def FLAT_LOAD_SHORT_D16 : FLAT_Load_Pseudo <"flat_load_short_d16", VGPR_32, 1>; def FLAT_LOAD_SHORT_D16_HI : FLAT_Load_Pseudo <"flat_load_short_d16_hi", VGPR_32, 1>; +} def FLAT_STORE_BYTE_D16_HI : FLAT_Store_Pseudo <"flat_store_byte_d16_hi", VGPR_32>; def FLAT_STORE_SHORT_D16_HI : FLAT_Store_Pseudo <"flat_store_short_d16_hi", VGPR_32>; @@ -741,12 +743,15 @@ defm GLOBAL_LOAD_DWORDX3 : FLAT_Global_Load_Pseudo <"global_load_dwordx3", VReg_96>; defm GLOBAL_LOAD_DWORDX4 : FLAT_Global_Load_Pseudo <"global_load_dwordx4", VReg_128>; +let TiedSourceNotRead = 1 in { defm GLOBAL_LOAD_UBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16", VGPR_32, 1>; defm GLOBAL_LOAD_UBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_ubyte_d16_hi", VGPR_32, 1>; defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16", VGPR_32, 1>; defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>; defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32, 1>; defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", 
VGPR_32, 1>; +} + let OtherPredicates = [HasGFX10_BEncoding] in defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>; @@ -865,12 +870,14 @@ defm SCRATCH_LOAD_DWORDX3 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx3", VReg_96>; defm SCRATCH_LOAD_DWORDX4 : FLAT_Scratch_Load_Pseudo <"scratch_load_dwordx4", VReg_128>; +let TiedSourceNotRead = 1 in { defm SCRATCH_LOAD_UBYTE_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16", VGPR_32, 1>; defm SCRATCH_LOAD_UBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_ubyte_d16_hi", VGPR_32, 1>; defm SCRATCH_LOAD_SBYTE_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16", VGPR_32, 1>; defm SCRATCH_LOAD_SBYTE_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_sbyte_d16_hi", VGPR_32, 1>; defm SCRATCH_LOAD_SHORT_D16 : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16", VGPR_32, 1>; defm SCRATCH_LOAD_SHORT_D16_HI : FLAT_Scratch_Load_Pseudo <"scratch_load_short_d16_hi", VGPR_32, 1>; +} defm SCRATCH_STORE_BYTE : FLAT_Scratch_Store_Pseudo <"scratch_store_byte", VGPR_32>; defm SCRATCH_STORE_SHORT : FLAT_Scratch_Store_Pseudo <"scratch_store_short", VGPR_32>; diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -130,6 +130,9 @@ // Is a WMMA instruction. IsWMMA = UINT64_C(1) << 59, + + // Set when the instruction does not read its tied source operand. + TiedSourceNotRead = UINT64_C(1) << 60, }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -1177,6 +1177,11 @@ MachineOperand &Op = MI.getOperand(I); if (!Op.isReg()) continue; + + // If the instruction does not read its tied source, skip the operand.
+ if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI)) + continue; + RegInterval Interval = ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I); diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -150,6 +150,9 @@ // This bit indicates that this is one of WMMA instructions. field bit IsWMMA = 0; + // This bit indicates that tied source will not be read. + field bit TiedSourceNotRead = 0; + // These need to be kept in sync with the enum in SIInstrFlags. let TSFlags{0} = SALU; let TSFlags{1} = VALU; @@ -229,6 +232,8 @@ let TSFlags{59} = IsWMMA; + let TSFlags{60} = TiedSourceNotRead; + let SchedRW = [Write32Bit]; let AsmVariantName = AMDGPUAsmVariants.Default; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -777,6 +777,14 @@ return get(Opcode).TSFlags & SIInstrFlags::FPAtomic; } + static bool doesNotReadTiedSource(const MachineInstr &MI) { + return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead; + } + + bool doesNotReadTiedSource(uint16_t Opcode) const { + return get(Opcode).TSFlags & SIInstrFlags::TiedSourceNotRead; + } + bool isVGPRCopy(const MachineInstr &MI) const { assert(MI.isCopy()); Register Dest = MI.getOperand(0).getReg(); diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -11,7 +11,7 @@ ; GFX900: ; %bb.0: ; %bb ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -22,7 
+22,6 @@ ; FLATSCR-NEXT: s_mov_b32 s0, 2 ; FLATSCR-NEXT: scratch_load_ushort v0, off, s0 ; FLATSCR-NEXT: s_mov_b32 s0, 0 -; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: scratch_load_short_d16_hi v0, off, s0 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -31,8 +30,8 @@ ; GFX10_DEFAULT: ; %bb.0: ; %bb ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10_DEFAULT-NEXT: s_clause 0x1 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:2 -; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], 0 ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) ; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31] @@ -45,7 +44,6 @@ ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, off, s0 ; FLATSCR_GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; FLATSCR_GFX10-NEXT: s_mov_b32 s0, 0 -; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, off, s0 ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] @@ -57,7 +55,6 @@ ; GFX11-NEXT: s_mov_b32 s0, 2 ; GFX11-NEXT: scratch_load_u16 v0, off, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_load_d16_hi_b16 v0, off, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -77,7 +74,7 @@ ; GFX900: ; %bb.0: ; %bb ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -86,7 +83,7 @@ ; FLATSCR: ; %bb.0: ; %bb ; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR-NEXT: scratch_load_ushort v0, v0, off -; FLATSCR-NEXT: s_waitcnt vmcnt(0) +; FLATSCR-NEXT: s_nop 0 ; FLATSCR-NEXT: scratch_load_short_d16_hi v0, v1, off ; FLATSCR-NEXT: s_waitcnt 
vmcnt(0) ; FLATSCR-NEXT: s_setpc_b64 s[30:31] @@ -95,8 +92,8 @@ ; GFX10_DEFAULT: ; %bb.0: ; %bb ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10_DEFAULT-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10_DEFAULT-NEXT: s_clause 0x1 ; GFX10_DEFAULT-NEXT: buffer_load_ushort v0, v0, s[0:3], 0 offen -; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) ; GFX10_DEFAULT-NEXT: buffer_load_short_d16_hi v0, v1, s[0:3], 0 offen ; GFX10_DEFAULT-NEXT: s_waitcnt vmcnt(0) ; GFX10_DEFAULT-NEXT: s_setpc_b64 s[30:31] @@ -106,7 +103,6 @@ ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; FLATSCR_GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; FLATSCR_GFX10-NEXT: scratch_load_ushort v0, v0, off -; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: scratch_load_short_d16_hi v0, v1, off ; FLATSCR_GFX10-NEXT: s_waitcnt vmcnt(0) ; FLATSCR_GFX10-NEXT: s_setpc_b64 s[30:31] @@ -116,7 +112,6 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_u16 v0, v0, off -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_load_d16_hi_b16 v0, v1, off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -279,7 +274,6 @@ ; GCN-NEXT: global_load_ushort v0, v[0:1], off ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_short_d16_hi v0, v[1:2], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -293,7 +287,6 @@ ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -307,7 +300,6 @@ ; GFX11-NEXT: global_load_u16 v0, v[0:1], off ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[1:2], off ; GFX11-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -327,7 +319,7 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: global_load_ushort v0, v[0:1], off -; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_short_d16_hi v0, v[2:3], off ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -337,7 +329,6 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -347,7 +338,6 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/load-hi16.ll b/llvm/test/CodeGen/AMDGPU/load-hi16.ll --- a/llvm/test/CodeGen/AMDGPU/load-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-hi16.ll @@ -2549,7 +2549,7 @@ ; GFX900: ; %bb.0: ; %entry ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-NEXT: global_load_ushort v2, v[0:1], off glc -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_nop 0 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:2 glc ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v0, v2 @@ -2581,7 +2581,7 @@ ; GFX900-FLATSCR: ; %bb.0: ; %entry ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX900-FLATSCR-NEXT: global_load_ushort v2, v[0:1], off glc -; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) +; GFX900-FLATSCR-NEXT: s_nop 0 ; GFX900-FLATSCR-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:2 glc ; GFX900-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX900-FLATSCR-NEXT: v_mov_b32_e32 v0, v2 diff --git 
a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -2537,7 +2537,7 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_short_d16_hi v0, v[2:3], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dword v[4:5], v0, off @@ -2549,7 +2549,6 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_short_d16_hi v0, v[2:3], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dword v[4:5], v0, off @@ -2561,7 +2560,6 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off -; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_load_d16_hi_b16 v0, v[2:3], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v[4:5], v0, off