# Patch summary (descriptive preamble only; the patch text follows unchanged).
#
# Files touched:
#   llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
#   llvm/test/CodeGen/AMDGPU/waitcnt.mir
#
# SIInsertWaitcnts.cpp (hunk at line 505):
#   Size is the unsigned result of TRI->getRegSizeInBits(*RC) for the operand's
#   register class. Result.second was Result.first + (Size / 32) and becomes
#   Result.first + ((Size + 16) / 32). With integer division a Size of 16 now
#   contributes 1 instead of 0, while sizes that are multiples of 32 contribute
#   the same value as before.
#   NOTE(review): Result.first/Result.second look like the bounds of a register
#   range, but the rest of the function is not visible in this hunk - confirm.
#
# waitcnt.mir:
#   Adds an empty IR stub @subregs16bit and a MIR function of the same name.
#   The body issues two FLAT_LOAD_USHORT instructions that define $vgpr0 and
#   $vgpr1, followed by a V_NOP_e32 that implicitly uses $vgpr0_lo16 and
#   $vgpr1_lo16. The CHECK lines require "S_WAITCNT 112" to be immediately
#   followed by V_NOP_e32.
#
# NOTE(review): the patch text below is on a single line; restore the original
# line breaks before feeding it to a patch tool.
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp --- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -505,7 +505,7 @@ const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo); unsigned Size = TRI->getRegSizeInBits(*RC); - Result.second = Result.first + (Size / 32); + Result.second = Result.first + ((Size + 16) / 32); return Result; } diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt.mir b/llvm/test/CodeGen/AMDGPU/waitcnt.mir --- a/llvm/test/CodeGen/AMDGPU/waitcnt.mir +++ b/llvm/test/CodeGen/AMDGPU/waitcnt.mir @@ -41,6 +41,9 @@ ret void } + define amdgpu_kernel void @subregs16bit() { + ret void + } ... --- @@ -284,3 +287,19 @@ FLAT_STORE_DWORD $vgpr1_vgpr2, $vgpr0, 0, 0, 0, 0, implicit $exec, implicit $flat_scr } ... + +--- +# CHECK-LABEL: name: subregs16bit +# CHECK: S_WAITCNT 112 +# CHECK-NEXT: V_NOP_e32 + +name: subregs16bit +machineFunctionInfo: + isEntryFunction: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $vgpr4 + $vgpr0 = FLAT_LOAD_USHORT killed $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + $vgpr1 = FLAT_LOAD_USHORT killed $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr + V_NOP_e32 implicit $exec, implicit $vgpr0_lo16, implicit $vgpr1_lo16 +...