diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-flat.mir @@ -83,8 +83,8 @@ ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX7-NEXT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4 @@ -111,8 +111,8 @@ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX10-NEXT: %12:vgpr_32, dead %14:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gep4 @@ -213,8 +213,8 @@ ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX7-NEXT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX7-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4 @@ -241,8 +241,8 @@ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX10-NEXT: %12:vgpr_32, dead %14:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64)) ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s64_flat_gep4 @@ -289,8 +289,8 @@ ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX7-NEXT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX7-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4 @@ -308,8 +308,8 @@ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX9-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX9-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX9-NEXT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX9-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX9-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX9-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX9-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX10-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4 @@ -327,8 +327,8 @@ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX10-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX10-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX10-NEXT: %12:vgpr_32, dead %14:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX10-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX10-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX10-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX10-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX11-LABEL: name: amdgpu_atomic_cmpxchg_s32_flat_gepm4 @@ -346,8 +346,8 @@ ; GFX11-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX11-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX11-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX11-NEXT: %12:vgpr_32, dead %14:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX11-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX11-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX11-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32)) ; GFX11-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] %0:vgpr(p0) = COPY $vgpr0_vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-atomic-cmpxchg-global.mir @@ -145,8 +145,8 @@ ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX7-FLAT-NEXT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gep4 @@ -164,8 +164,8 @@ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX8-NEXT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gep4 @@ -335,8 +335,8 @@ ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX7-FLAT-NEXT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_gep4 @@ -354,8 +354,8 @@ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX8-NEXT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_X2_RTN:%[0-9]+]]:vreg_64 = FLAT_ATOMIC_CMPSWAP_X2_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s64), addrspace 1) ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[FLAT_ATOMIC_CMPSWAP_X2_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s64_global_gep4 @@ -411,8 +411,8 @@ ; GFX6-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX6-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX6-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX6-NEXT: %19:vgpr_32, dead %21:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %19, %subreg.sub1 + ; GFX6-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX6-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX6-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX6-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 ; GFX6-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 @@ -436,8 +436,8 @@ ; GFX7-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX7-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX7-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX7-NEXT: %19:vgpr_32, dead %21:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %19, %subreg.sub1 + ; GFX7-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GFX7-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 61440 ; GFX7-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_1]], %subreg.sub1 @@ -461,8 +461,8 @@ ; GFX7-FLAT-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX7-FLAT-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX7-FLAT-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX7-FLAT-NEXT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX7-FLAT-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX7-FLAT-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX7-FLAT-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX7-FLAT-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX8-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gepm4 @@ -480,8 +480,8 @@ ; GFX8-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY]].sub1 ; GFX8-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 ; GFX8-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64 [[COPY3]], [[COPY4]], 0, implicit $exec - ; GFX8-NEXT: %12:vgpr_32, dead %14:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, %12, %subreg.sub1 + ; GFX8-NEXT: [[V_ADDC_U32_e64_:%[0-9]+]]:vgpr_32, dead [[V_ADDC_U32_e64_1:%[0-9]+]]:sreg_64_xexec = V_ADDC_U32_e64 [[COPY5]], [[COPY6]], killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; GFX8-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[V_ADD_CO_U32_e64_]], %subreg.sub0, [[V_ADDC_U32_e64_]], %subreg.sub1 ; GFX8-NEXT: [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 1, implicit $exec, implicit $flat_scr :: (load store seq_cst (s32), addrspace 1) ; GFX8-NEXT: $vgpr0 = COPY [[FLAT_ATOMIC_CMPSWAP_RTN]] ; GFX9-LABEL: name: amdgpu_atomic_cmpxchg_s32_global_gepm4 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -4586,19 +4586,84 @@ ret void } -; GCN-LABEL: {{^}}atomic_inc_i64_offset: -; GCN: flat_atomic_inc_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { +; GCN1-LABEL: atomic_inc_i64_offset: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_inc_i64_offset: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in seq_cst ret void } -; GCN-LABEL: {{^}}atomic_inc_i64_ret_offset: -; GCN: flat_atomic_inc_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GCN1-LABEL: atomic_inc_i64_ret_offset: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_inc_i64_ret_offset: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in seq_cst @@ -4606,9 +4671,46 @@ ret void } -; GCN-LABEL: {{^}}atomic_inc_i64_incr64_offset: -; GCN: flat_atomic_inc_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) { +; GCN1-LABEL: atomic_inc_i64_incr64_offset: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_inc_i64_incr64_offset: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -4616,10 +4718,50 @@ ret void } -; GCN-LABEL: {{^}}atomic_inc_i64_ret_incr64_offset: -; GCN: flat_atomic_inc_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GCN1-LABEL: atomic_inc_i64_ret_incr64_offset: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_inc_i64_ret_incr64_offset: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -4628,37 +4770,163 @@ ret void } -; GCN-LABEL: {{^}}atomic_inc_i64: -; GCN: flat_atomic_inc_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { +; GCN1-LABEL: atomic_inc_i64: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_inc_i64: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in seq_cst ret void } -; GCN-LABEL: {{^}}atomic_inc_i64_ret: -; GCN: flat_atomic_inc_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GCN1-LABEL: atomic_inc_i64_ret: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_inc_i64_ret: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in seq_cst store i64 %tmp0, ptr %out2 ret void } -; GCN-LABEL: {{^}}atomic_inc_i64_incr64: -; GCN: flat_atomic_inc_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) { +; GCN1-LABEL: atomic_inc_i64_incr64: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_inc_i64_incr64: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in seq_cst ret void } -; GCN-LABEL: {{^}}atomic_inc_i64_ret_incr64: -; GCN: flat_atomic_inc_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GCN1-LABEL: atomic_inc_i64_ret_incr64: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_inc_i64_ret_incr64: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in seq_cst @@ -4666,19 +4934,84 @@ ret void } -; GCN-LABEL: {{^}}atomic_dec_i64_offset: -; GCN: flat_atomic_dec_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { +; GCN1-LABEL: atomic_dec_i64_offset: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v0, s2 +; GCN1-NEXT: v_mov_b32_e32 v1, s3 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_dec_i64_offset: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v0, s2 +; GCN2-NEXT: v_mov_b32_e32 v1, s3 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in seq_cst ret void } -; GCN-LABEL: {{^}}atomic_dec_i64_ret_offset: -; GCN: flat_atomic_dec_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GCN1-LABEL: atomic_dec_i64_ret_offset: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_dec_i64_ret_offset: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in seq_cst @@ -4686,9 +5019,46 @@ ret void } -; GCN-LABEL: {{^}}atomic_dec_i64_decr64_offset: -; GCN: flat_atomic_dec_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) { +; GCN1-LABEL: atomic_dec_i64_decr64_offset: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_dec_i64_decr64_offset: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -4696,10 +5066,50 @@ ret void } -; GCN-LABEL: {{^}}atomic_dec_i64_ret_decr64_offset: -; GCN: flat_atomic_dec_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GCN1-LABEL: atomic_dec_i64_ret_decr64_offset: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_dec_i64_ret_decr64_offset: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %gep = getelementptr i64, ptr %ptr, i64 4 @@ -4708,37 +5118,163 @@ ret void } -; GCN-LABEL: {{^}}atomic_dec_i64: -; GCN: flat_atomic_dec_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { +; GCN1-LABEL: atomic_dec_i64: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_dec_i64: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in seq_cst ret void } -; GCN-LABEL: {{^}}atomic_dec_i64_ret: -; GCN: flat_atomic_dec_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GCN1-LABEL: atomic_dec_i64_ret: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_mov_b32_e32 v2, s6 +; GCN1-NEXT: v_mov_b32_e32 v3, s7 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_dec_i64_ret: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_mov_b32_e32 v2, s6 +; GCN2-NEXT: v_mov_b32_e32 v3, s7 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in seq_cst store i64 %tmp0, ptr %out2 ret void } -; GCN-LABEL: {{^}}atomic_dec_i64_decr64: -; GCN: flat_atomic_dec_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}} define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) { +; GCN1-LABEL: atomic_dec_i64_decr64: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s6 +; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN1-NEXT: s_add_u32 s0, s4, s0 +; GCN1-NEXT: s_addc_u32 s1, s5, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v1, s7 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_dec_i64_decr64: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s6 +; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GCN2-NEXT: s_add_u32 s0, s4, s0 +; GCN2-NEXT: s_addc_u32 s1, s5, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v1, s7 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in seq_cst ret void } -; GCN-LABEL: {{^}}atomic_dec_i64_ret_decr64: -; GCN: flat_atomic_dec_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}} -; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]] define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GCN1-LABEL: atomic_dec_i64_ret_decr64: +; GCN1: ; %bb.0: ; %entry +; GCN1-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 +; GCN1-NEXT: s_waitcnt lgkmcnt(0) +; GCN1-NEXT: v_mov_b32_e32 v0, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s5 +; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN1-NEXT: s_add_u32 s0, s0, s4 +; GCN1-NEXT: s_addc_u32 s1, s1, s5 +; GCN1-NEXT: v_mov_b32_e32 v3, s1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN1-NEXT: buffer_wbinvl1_vol +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 +; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN1-NEXT: s_endpgm +; +; GCN2-LABEL: atomic_dec_i64_ret_decr64: +; GCN2: ; %bb.0: ; %entry +; GCN2-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 +; GCN2-NEXT: s_waitcnt lgkmcnt(0) +; GCN2-NEXT: v_mov_b32_e32 v0, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s5 +; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GCN2-NEXT: s_add_u32 s0, s0, s4 +; GCN2-NEXT: s_addc_u32 s1, s1, s5 +; GCN2-NEXT: v_mov_b32_e32 v3, s1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN2-NEXT: buffer_wbinvl1_vol +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 +; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN2-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64.ll @@ -5970,23 +5970,108 @@ ret void } -; GCN-LABEL: {{^}}atomic_inc_i64_offset: -; CIVI: buffer_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} - -; GFX9: global_atomic_inc_x2 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32{{$}} define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) { +; CI-LABEL: atomic_inc_i64_offset: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:32 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_inc_i64_offset: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_inc_i64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst ret void } -; GCN-LABEL: {{^}}atomic_inc_i64_ret_offset: -; CIVI: buffer_atomic_inc_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} -; CIVI: buffer_store_dwordx2 [[RET]] - -; GFX9: global_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { +; CI-LABEL: atomic_inc_i64_ret_offset: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s0, s6 +; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s6, s2 +; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_inc_i64_ret_offset: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_inc_i64_ret_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -5994,11 +6079,61 @@ ret void } -; GCN-LABEL: {{^}}atomic_inc_i64_incr64_offset: -; CI: buffer_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} -; VI: flat_atomic_inc_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -; GFX9: global_atomic_inc_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { +; CI-LABEL: atomic_inc_i64_incr64_offset: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s6 +; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, s7 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_inc_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_inc_i64_incr64_offset: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; VI-NEXT: s_add_u32 s0, s4, s0 +; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_inc_i64_incr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4 @@ -6006,23 +6141,108 @@ ret void } -; GCN-LABEL: {{^}}atomic_dec_i64_offset: -; CIVI: buffer_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}} - -; GFX9: global_atomic_dec_x2 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32{{$}} define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) { +; CI-LABEL: atomic_dec_i64_offset: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:32 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_dec_i64_offset: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:32 +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_dec_i64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst ret void } -; GCN-LABEL: {{^}}atomic_dec_i64_ret_offset: -; CIVI: buffer_atomic_dec_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}} -; CIVI: buffer_store_dwordx2 [[RET]] - -; GFX9: global_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:32 glc{{$}} define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) { +; CI-LABEL: atomic_dec_i64_ret_offset: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; CI-NEXT: s_mov_b32 s3, 0xf000 +; CI-NEXT: s_mov_b32 s2, -1 +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: s_mov_b32 s0, s6 +; CI-NEXT: s_mov_b32 s1, s7 +; CI-NEXT: v_mov_b32_e32 v0, s8 +; CI-NEXT: v_mov_b32_e32 v1, s9 +; CI-NEXT: s_mov_b32 s6, s2 +; CI-NEXT: s_mov_b32 s7, s3 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_dec_i64_ret_offset: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 +; VI-NEXT: s_mov_b32 s3, 0xf000 +; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_mov_b32 s0, s6 +; VI-NEXT: s_mov_b32 s1, s7 +; VI-NEXT: v_mov_b32_e32 v0, s8 +; VI-NEXT: v_mov_b32_e32 v1, s9 +; VI-NEXT: s_mov_b32 s6, s2 +; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_dec_i64_ret_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7] +; GFX9-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(1) %out, i64 4 %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in seq_cst @@ -6030,11 +6250,61 @@ ret void } -; GCN-LABEL: {{^}}atomic_dec_i64_decr64_offset: -; CI: buffer_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}} -; VI: flat_atomic_dec_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}} -; GFX9: global_atomic_dec_x2 v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s{{\[[0-9]+:[0-9]+\]}} offset:32{{$}} define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) { +; CI-LABEL: atomic_dec_i64_decr64_offset: +; CI: ; %bb.0: ; %entry +; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v0, s6 +; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v1, s7 +; CI-NEXT: s_mov_b32 s7, 0xf000 +; CI-NEXT: s_mov_b32 s6, 0 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; CI-NEXT: buffer_atomic_dec_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32 +; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: buffer_wbinvl1_vol +; CI-NEXT: s_endpgm +; +; VI-LABEL: atomic_dec_i64_decr64_offset: +; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; VI-NEXT: s_add_u32 s0, s4, s0 +; VI-NEXT: s_addc_u32 s1, s5, s1 +; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: buffer_wbinvl1_vol +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_dec_i64_decr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4