diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -207,6 +207,8 @@ case AtomicRMWInst::UMin: case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: + case AtomicRMWInst::FMax: + case AtomicRMWInst::FMin: break; } @@ -315,6 +317,12 @@ case Intrinsic::amdgcn_global_atomic_fadd: Op = AtomicRMWInst::FAdd; break; + case Intrinsic::amdgcn_global_atomic_fmax: + Op = AtomicRMWInst::FMax; + break; + case Intrinsic::amdgcn_global_atomic_fmin: + Op = AtomicRMWInst::FMin; + break; } // Only 32-bit floating point atomic ops are supported. @@ -325,7 +333,8 @@ unsigned ValIdx = 0; // TODO: Operand order is not consistent for atomic fadd intrinsics - if (Op == AtomicRMWInst::FAdd) { + if (Op == AtomicRMWInst::FAdd || Op == AtomicRMWInst::FMax || + Op == AtomicRMWInst::FMin) { ValIdx = 1; } @@ -392,6 +401,10 @@ case AtomicRMWInst::UMin: Pred = CmpInst::ICMP_ULT; break; + case AtomicRMWInst::FMax: + return B.CreateMaxNum(LHS, RHS); + case AtomicRMWInst::FMin: + return B.CreateMinNum(LHS, RHS); } Value *Cond = B.CreateICmp(Pred, LHS, RHS); return B.CreateSelect(Cond, LHS, RHS); @@ -712,6 +725,10 @@ return APFloat::getZero(Semantics, false); case AtomicRMWInst::FSub: return APFloat::getZero(Semantics, true); + case AtomicRMWInst::FMin: + return APFloat::getInf(Semantics, false); + case AtomicRMWInst::FMax: + return APFloat::getInf(Semantics, true); } } @@ -906,6 +923,8 @@ case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: + case AtomicRMWInst::FMin: + case AtomicRMWInst::FMax: // These operations with a uniform value are idempotent: doing the atomic // operation multiple times has the same effect as doing it once. 
NewV = V; diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll +++ /dev/null @@ -1,285 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-ITERATIVE %s -; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-DPP %s -declare i32 @llvm.amdgcn.workitem.id.x() -define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_value( -; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> -; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) -; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) -; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) -; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float -; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] -; IR-ITERATIVE: 12: -; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 -; IR-ITERATIVE-NEXT: br label [[TMP14]] -; IR-ITERATIVE: 14: -; IR-ITERATIVE-NEXT: ret void -; -; 
IR-DPP-LABEL: @global_atomic_fadd_uni_value( -; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> -; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) -; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) -; IR-DPP-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) -; IR-DPP-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 -; IR-DPP-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float -; IR-DPP-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] -; IR-DPP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 -; IR-DPP-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] -; IR-DPP: 12: -; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 -; IR-DPP-NEXT: br label [[TMP14]] -; IR-DPP: 14: -; IR-DPP-NEXT: ret void -; - %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst - ret void -} - - -define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_value( -; IR-ITERATIVE-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; IR-ITERATIVE-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float -; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> -; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) -; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) -; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 
@llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] -; IR-ITERATIVE: 8: -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4 -; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]] -; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: ret void -; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) -; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 -; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float -; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]] -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 -; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] -; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]] -; -; IR-DPP-LABEL: @global_atomic_fadd_div_value( -; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; IR-DPP-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float -; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> -; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> 
[[TMP2]], i32 1 -; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) -; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) -; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32 -; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 0) -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float -; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast float [[TMP9]] to i32 -; IR-DPP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP11]], i32 273, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast i32 [[TMP12]] to float -; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP11]] to float -; IR-DPP-NEXT: [[TMP15:%.*]] = fadd float [[TMP14]], [[TMP13]] -; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast float [[TMP15]] to i32 -; IR-DPP-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP16]], i32 274, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float -; IR-DPP-NEXT: [[TMP19:%.*]] = bitcast i32 [[TMP16]] to float -; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP19]], [[TMP18]] -; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast float [[TMP20]] to i32 -; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP21]], i32 276, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast i32 [[TMP22]] to float -; IR-DPP-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP21]] to float -; IR-DPP-NEXT: [[TMP25:%.*]] = fadd float [[TMP24]], [[TMP23]] -; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP25]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP26]], i32 280, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float -; IR-DPP-NEXT: [[TMP29:%.*]] = bitcast i32 [[TMP26]] to float -; IR-DPP-NEXT: [[TMP30:%.*]] = fadd float [[TMP29]], [[TMP28]] -; IR-DPP-NEXT: 
[[TMP31:%.*]] = bitcast float [[TMP30]] to i32 -; IR-DPP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP31]], i32 322, i32 10, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float -; IR-DPP-NEXT: [[TMP34:%.*]] = bitcast i32 [[TMP31]] to float -; IR-DPP-NEXT: [[TMP35:%.*]] = fadd float [[TMP34]], [[TMP33]] -; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast float [[TMP35]] to i32 -; IR-DPP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP36]], i32 323, i32 12, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast i32 [[TMP37]] to float -; IR-DPP-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP36]] to float -; IR-DPP-NEXT: [[TMP40:%.*]] = fadd float [[TMP39]], [[TMP38]] -; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast float [[TMP40]] to i32 -; IR-DPP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 63) -; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast i32 [[TMP42]] to float -; IR-DPP-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP43]]) -; IR-DPP-NEXT: [[TMP45:%.*]] = icmp eq i32 [[TMP6]], 0 -; IR-DPP-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP48:%.*]] -; IR-DPP: 46: -; IR-DPP-NEXT: [[TMP47:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP44]] seq_cst, align 4 -; IR-DPP-NEXT: br label [[TMP48]] -; IR-DPP: 48: -; IR-DPP-NEXT: ret void -; - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue seq_cst - ret void -} - -define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_value( -; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> -; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 
-; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) -; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) -; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) -; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float -; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] -; IR-ITERATIVE: 12: -; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 -; IR-ITERATIVE-NEXT: br label [[TMP14]] -; IR-ITERATIVE: 14: -; IR-ITERATIVE-NEXT: ret void -; -; IR-DPP-LABEL: @global_atomic_fsub_uni_value( -; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> -; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) -; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) -; IR-DPP-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) -; IR-DPP-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 -; IR-DPP-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float -; IR-DPP-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] -; IR-DPP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 -; IR-DPP-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] -; IR-DPP: 12: -; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 -; IR-DPP-NEXT: br label [[TMP14]] -; IR-DPP: 14: -; IR-DPP-NEXT: ret void -; - %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst - ret 
void -} - - -define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) #0 { -; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_value( -; IR-ITERATIVE-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; IR-ITERATIVE-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float -; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> -; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) -; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) -; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] -; IR-ITERATIVE: 8: -; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4 -; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]] -; IR-ITERATIVE: 10: -; IR-ITERATIVE-NEXT: ret void -; IR-ITERATIVE: ComputeLoop: -; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] -; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) -; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 -; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 -; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) -; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float -; IR-ITERATIVE-NEXT: [[TMP16]] = fsub float [[ACCUMULATOR]], [[TMP15]] -; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] -; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 
-; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] -; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] -; IR-ITERATIVE: ComputeEnd: -; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 -; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]] -; -; IR-DPP-LABEL: @global_atomic_fsub_div_value( -; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() -; IR-DPP-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float -; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) -; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> -; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 -; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 -; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) -; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) -; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32 -; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 -2147483648) -; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float -; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float -; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast float [[TMP9]] to i32 -; IR-DPP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP11]], i32 273, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast i32 [[TMP12]] to float -; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP11]] to float -; IR-DPP-NEXT: [[TMP15:%.*]] = fsub float [[TMP14]], [[TMP13]] -; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast float [[TMP15]] to i32 -; IR-DPP-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP16]], i32 274, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float -; IR-DPP-NEXT: [[TMP19:%.*]] = 
bitcast i32 [[TMP16]] to float -; IR-DPP-NEXT: [[TMP20:%.*]] = fsub float [[TMP19]], [[TMP18]] -; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast float [[TMP20]] to i32 -; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP21]], i32 276, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast i32 [[TMP22]] to float -; IR-DPP-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP21]] to float -; IR-DPP-NEXT: [[TMP25:%.*]] = fsub float [[TMP24]], [[TMP23]] -; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP25]] to i32 -; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP26]], i32 280, i32 15, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float -; IR-DPP-NEXT: [[TMP29:%.*]] = bitcast i32 [[TMP26]] to float -; IR-DPP-NEXT: [[TMP30:%.*]] = fsub float [[TMP29]], [[TMP28]] -; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast float [[TMP30]] to i32 -; IR-DPP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP31]], i32 322, i32 10, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float -; IR-DPP-NEXT: [[TMP34:%.*]] = bitcast i32 [[TMP31]] to float -; IR-DPP-NEXT: [[TMP35:%.*]] = fsub float [[TMP34]], [[TMP33]] -; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast float [[TMP35]] to i32 -; IR-DPP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP36]], i32 323, i32 12, i32 15, i1 false) -; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast i32 [[TMP37]] to float -; IR-DPP-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP36]] to float -; IR-DPP-NEXT: [[TMP40:%.*]] = fsub float [[TMP39]], [[TMP38]] -; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast float [[TMP40]] to i32 -; IR-DPP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 63) -; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast i32 [[TMP42]] to float -; IR-DPP-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP43]]) -; IR-DPP-NEXT: [[TMP45:%.*]] = icmp eq i32 
[[TMP6]], 0 -; IR-DPP-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP48:%.*]] -; IR-DPP: 46: -; IR-DPP-NEXT: [[TMP47:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP44]] seq_cst, align 4 -; IR-DPP-NEXT: br label [[TMP48]] -; IR-DPP: 48: -; IR-DPP-NEXT: ret void -; - %id.x = call i32 @llvm.amdgcn.workitem.id.x() - %divValue = bitcast i32 %id.x to float - %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue seq_cst - ret void -} - -attributes #0 = {"target-cpu"="gfx906"} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp.ll @@ -0,0 +1,826 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-ITERATIVE %s +; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR-DPP %s + +declare i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_value( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; IR-ITERATIVE-NEXT: 
[[TMP9:%.*]] = uitofp i32 [[TMP8]] to float +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP14]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_value( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) +; IR-DPP-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; IR-DPP-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] +; IR-DPP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] +; IR-DPP: 12: +; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP14]] +; IR-DPP: 14: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_value( +; IR-ITERATIVE-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-ITERATIVE-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 
@llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]] +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 +; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 
[[TMP21]], label [[TMP8:%.*]], label [[TMP10]] +; +; IR-DPP-LABEL: @global_atomic_fadd_div_value( +; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-DPP-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 0) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast float [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP11]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast i32 [[TMP12]] to float +; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP11]] to float +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd float [[TMP14]], [[TMP13]] +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast float [[TMP15]] to i32 +; IR-DPP-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP16]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-DPP-NEXT: [[TMP19:%.*]] = bitcast i32 [[TMP16]] to float +; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP19]], [[TMP18]] +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast float [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP21]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast i32 [[TMP22]] to float +; IR-DPP-NEXT: 
[[TMP24:%.*]] = bitcast i32 [[TMP21]] to float +; IR-DPP-NEXT: [[TMP25:%.*]] = fadd float [[TMP24]], [[TMP23]] +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP25]] to i32 +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP26]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float +; IR-DPP-NEXT: [[TMP29:%.*]] = bitcast i32 [[TMP26]] to float +; IR-DPP-NEXT: [[TMP30:%.*]] = fadd float [[TMP29]], [[TMP28]] +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast float [[TMP30]] to i32 +; IR-DPP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP31]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float +; IR-DPP-NEXT: [[TMP34:%.*]] = bitcast i32 [[TMP31]] to float +; IR-DPP-NEXT: [[TMP35:%.*]] = fadd float [[TMP34]], [[TMP33]] +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast float [[TMP35]] to i32 +; IR-DPP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP36]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast i32 [[TMP37]] to float +; IR-DPP-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP36]] to float +; IR-DPP-NEXT: [[TMP40:%.*]] = fadd float [[TMP39]], [[TMP38]] +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast float [[TMP40]] to i32 +; IR-DPP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 63) +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast i32 [[TMP42]] to float +; IR-DPP-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP43]]) +; IR-DPP-NEXT: [[TMP45:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP48:%.*]] +; IR-DPP: 46: +; IR-DPP-NEXT: [[TMP47:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP44]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP48]] +; IR-DPP: 48: +; IR-DPP-NEXT: ret void +; + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fadd 
ptr addrspace(1) %ptr, float %divValue seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fsub_uni_value(ptr addrspace(1) %ptr) { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_value( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP14]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fsub_uni_value( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) +; IR-DPP-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; IR-DPP-NEXT: 
[[TMP9:%.*]] = uitofp i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] +; IR-DPP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] +; IR-DPP: 12: +; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP14]] +; IR-DPP: 14: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 seq_cst + ret void +} + + +define amdgpu_kernel void @global_atomic_fsub_div_value(ptr addrspace(1) %ptr) { +; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_value( +; IR-ITERATIVE-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-ITERATIVE-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ -0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 
[[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-ITERATIVE-NEXT: [[TMP16]] = fsub float [[ACCUMULATOR]], [[TMP15]] +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 +; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]] +; +; IR-DPP-LABEL: @global_atomic_fsub_div_value( +; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-DPP-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 -2147483648) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast float [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP11]], 
i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast i32 [[TMP12]] to float +; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP11]] to float +; IR-DPP-NEXT: [[TMP15:%.*]] = fsub float [[TMP14]], [[TMP13]] +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast float [[TMP15]] to i32 +; IR-DPP-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP16]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-DPP-NEXT: [[TMP19:%.*]] = bitcast i32 [[TMP16]] to float +; IR-DPP-NEXT: [[TMP20:%.*]] = fsub float [[TMP19]], [[TMP18]] +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast float [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP21]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast i32 [[TMP22]] to float +; IR-DPP-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP21]] to float +; IR-DPP-NEXT: [[TMP25:%.*]] = fsub float [[TMP24]], [[TMP23]] +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP25]] to i32 +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP26]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float +; IR-DPP-NEXT: [[TMP29:%.*]] = bitcast i32 [[TMP26]] to float +; IR-DPP-NEXT: [[TMP30:%.*]] = fsub float [[TMP29]], [[TMP28]] +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast float [[TMP30]] to i32 +; IR-DPP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP31]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float +; IR-DPP-NEXT: [[TMP34:%.*]] = bitcast i32 [[TMP31]] to float +; IR-DPP-NEXT: [[TMP35:%.*]] = fsub float [[TMP34]], [[TMP33]] +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast float [[TMP35]] to i32 +; IR-DPP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -2147483648, i32 [[TMP36]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: 
[[TMP38:%.*]] = bitcast i32 [[TMP37]] to float +; IR-DPP-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP36]] to float +; IR-DPP-NEXT: [[TMP40:%.*]] = fsub float [[TMP39]], [[TMP38]] +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast float [[TMP40]] to i32 +; IR-DPP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 63) +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast i32 [[TMP42]] to float +; IR-DPP-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP43]]) +; IR-DPP-NEXT: [[TMP45:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP48:%.*]] +; IR-DPP: 46: +; IR-DPP-NEXT: [[TMP47:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[TMP44]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP48]] +; IR-DPP: 48: +; IR-DPP-NEXT: ret void +; + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_uni_value(ptr addrspace(1) %ptr) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_value( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP10:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float 4.000000e+00 seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP10]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; +; 
IR-DPP-LABEL: @global_atomic_fmin_uni_value( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP10:%.*]] +; IR-DPP: 8: +; IR-DPP-NEXT: [[TMP9:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float 4.000000e+00 seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP10]] +; IR-DPP: 10: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_div_value(ptr addrspace(1) %ptr) { +; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_value( +; IR-ITERATIVE-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-ITERATIVE-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label 
[[TMP10:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0x7FF0000000000000, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-ITERATIVE-NEXT: [[TMP16]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 +; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]] +; +; IR-DPP-LABEL: @global_atomic_fmin_div_value( +; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-DPP-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] 
= bitcast float [[DIVVALUE]] to i32 +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 2139095040) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast float [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2139095040, i32 [[TMP11]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast i32 [[TMP12]] to float +; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP11]] to float +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.minnum.f32(float [[TMP14]], float [[TMP13]]) +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast float [[TMP15]] to i32 +; IR-DPP-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2139095040, i32 [[TMP16]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-DPP-NEXT: [[TMP19:%.*]] = bitcast i32 [[TMP16]] to float +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.minnum.f32(float [[TMP19]], float [[TMP18]]) +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast float [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2139095040, i32 [[TMP21]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast i32 [[TMP22]] to float +; IR-DPP-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP21]] to float +; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.minnum.f32(float [[TMP24]], float [[TMP23]]) +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP25]] to i32 +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2139095040, i32 [[TMP26]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float +; IR-DPP-NEXT: [[TMP29:%.*]] = bitcast i32 [[TMP26]] to float +; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.minnum.f32(float [[TMP29]], float [[TMP28]]) +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast float [[TMP30]] to i32 +; 
IR-DPP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2139095040, i32 [[TMP31]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float +; IR-DPP-NEXT: [[TMP34:%.*]] = bitcast i32 [[TMP31]] to float +; IR-DPP-NEXT: [[TMP35:%.*]] = call float @llvm.minnum.f32(float [[TMP34]], float [[TMP33]]) +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast float [[TMP35]] to i32 +; IR-DPP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2139095040, i32 [[TMP36]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast i32 [[TMP37]] to float +; IR-DPP-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP36]] to float +; IR-DPP-NEXT: [[TMP40:%.*]] = call float @llvm.minnum.f32(float [[TMP39]], float [[TMP38]]) +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast float [[TMP40]] to i32 +; IR-DPP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 63) +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast i32 [[TMP42]] to float +; IR-DPP-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP43]]) +; IR-DPP-NEXT: [[TMP45:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP48:%.*]] +; IR-DPP: 46: +; IR-DPP-NEXT: [[TMP47:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP44]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP48]] +; IR-DPP: 48: +; IR-DPP-NEXT: ret void +; + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_uni_value(ptr addrspace(1) %ptr) { +; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_value( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement 
<2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP10:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float 4.000000e+00 seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP10]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fmax_uni_value( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP10:%.*]] +; IR-DPP: 8: +; IR-DPP-NEXT: [[TMP9:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float 4.000000e+00 seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP10]] +; IR-DPP: 10: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_div_value(ptr addrspace(1) %ptr) { +; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_value( +; IR-ITERATIVE-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-ITERATIVE-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> 
[[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0xFFF0000000000000, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-ITERATIVE-NEXT: [[TMP16]] = call float @llvm.maxnum.f32(float [[ACCUMULATOR]], float [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 +; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]] +; +; IR-DPP-LABEL: @global_atomic_fmax_div_value( +; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 
@llvm.amdgcn.workitem.id.x() +; IR-DPP-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 -8388608) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast float [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -8388608, i32 [[TMP11]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast i32 [[TMP12]] to float +; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP11]] to float +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.maxnum.f32(float [[TMP14]], float [[TMP13]]) +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast float [[TMP15]] to i32 +; IR-DPP-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -8388608, i32 [[TMP16]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-DPP-NEXT: [[TMP19:%.*]] = bitcast i32 [[TMP16]] to float +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.maxnum.f32(float [[TMP19]], float [[TMP18]]) +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast float [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -8388608, i32 [[TMP21]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast i32 [[TMP22]] to float +; IR-DPP-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP21]] to float +; 
IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.maxnum.f32(float [[TMP24]], float [[TMP23]]) +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP25]] to i32 +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -8388608, i32 [[TMP26]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float +; IR-DPP-NEXT: [[TMP29:%.*]] = bitcast i32 [[TMP26]] to float +; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.maxnum.f32(float [[TMP29]], float [[TMP28]]) +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast float [[TMP30]] to i32 +; IR-DPP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -8388608, i32 [[TMP31]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float +; IR-DPP-NEXT: [[TMP34:%.*]] = bitcast i32 [[TMP31]] to float +; IR-DPP-NEXT: [[TMP35:%.*]] = call float @llvm.maxnum.f32(float [[TMP34]], float [[TMP33]]) +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast float [[TMP35]] to i32 +; IR-DPP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 -8388608, i32 [[TMP36]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast i32 [[TMP37]] to float +; IR-DPP-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP36]] to float +; IR-DPP-NEXT: [[TMP40:%.*]] = call float @llvm.maxnum.f32(float [[TMP39]], float [[TMP38]]) +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast float [[TMP40]] to i32 +; IR-DPP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 63) +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast i32 [[TMP42]] to float +; IR-DPP-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP43]]) +; IR-DPP-NEXT: [[TMP45:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP48:%.*]] +; IR-DPP: 46: +; IR-DPP-NEXT: [[TMP47:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP44]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP48]] +; IR-DPP: 48: +; IR-DPP-NEXT: ret void +; + %id.x = call i32 
@llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_value_scope_defalut( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float +; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP14]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_value_scope_defalut( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; 
IR-DPP-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) +; IR-DPP-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; IR-DPP-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] +; IR-DPP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] +; IR-DPP: 12: +; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP14]] +; IR-DPP: 14: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_value_scope_defalut( +; IR-ITERATIVE-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-ITERATIVE-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] +; 
IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]] +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 +; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]] +; +; IR-DPP-LABEL: @global_atomic_fadd_div_value_scope_defalut( +; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-DPP-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 0) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 
[[TMP7]] to float +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast float [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP11]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast i32 [[TMP12]] to float +; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP11]] to float +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd float [[TMP14]], [[TMP13]] +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast float [[TMP15]] to i32 +; IR-DPP-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP16]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-DPP-NEXT: [[TMP19:%.*]] = bitcast i32 [[TMP16]] to float +; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP19]], [[TMP18]] +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast float [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP21]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast i32 [[TMP22]] to float +; IR-DPP-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP21]] to float +; IR-DPP-NEXT: [[TMP25:%.*]] = fadd float [[TMP24]], [[TMP23]] +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP25]] to i32 +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP26]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float +; IR-DPP-NEXT: [[TMP29:%.*]] = bitcast i32 [[TMP26]] to float +; IR-DPP-NEXT: [[TMP30:%.*]] = fadd float [[TMP29]], [[TMP28]] +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast float [[TMP30]] to i32 +; IR-DPP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP31]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float +; IR-DPP-NEXT: [[TMP34:%.*]] = bitcast i32 [[TMP31]] to float +; IR-DPP-NEXT: [[TMP35:%.*]] = fadd float [[TMP34]], [[TMP33]] +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast float [[TMP35]] to i32 +; IR-DPP-NEXT: 
[[TMP37:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP36]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast i32 [[TMP37]] to float +; IR-DPP-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP36]] to float +; IR-DPP-NEXT: [[TMP40:%.*]] = fadd float [[TMP39]], [[TMP38]] +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast float [[TMP40]] to i32 +; IR-DPP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 63) +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast i32 [[TMP42]] to float +; IR-DPP-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP43]]) +; IR-DPP-NEXT: [[TMP45:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP48:%.*]] +; IR-DPP: 46: +; IR-DPP-NEXT: [[TMP47:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP44]] monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP48]] +; IR-DPP: 48: +; IR-DPP-NEXT: ret void +; + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue monotonic, align 4 + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_value_scope_agent( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) +; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float +; 
IR-ITERATIVE-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] +; IR-ITERATIVE: 12: +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP14]] +; IR-ITERATIVE: 14: +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fadd_uni_value_scope_agent( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]]) +; IR-DPP-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; IR-DPP-NEXT: [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]] +; IR-DPP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]] +; IR-DPP: 12: +; IR-DPP-NEXT: [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP14]] +; IR-DPP: 14: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) { +; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_value_scope_agent( +; IR-ITERATIVE-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-ITERATIVE-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] 
to float +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] syncscope("agent") monotonic, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-ITERATIVE-NEXT: [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]] +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 +; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: 
[[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]] +; +; IR-DPP-LABEL: @global_atomic_fadd_div_value_scope_agent( +; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-DPP-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 0) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast float [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP11]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast i32 [[TMP12]] to float +; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP11]] to float +; IR-DPP-NEXT: [[TMP15:%.*]] = fadd float [[TMP14]], [[TMP13]] +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast float [[TMP15]] to i32 +; IR-DPP-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP16]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-DPP-NEXT: [[TMP19:%.*]] = bitcast i32 [[TMP16]] to float +; IR-DPP-NEXT: [[TMP20:%.*]] = fadd float [[TMP19]], [[TMP18]] +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast float [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP21]], i32 276, i32 15, i32 15, i1 false) +; 
IR-DPP-NEXT: [[TMP23:%.*]] = bitcast i32 [[TMP22]] to float +; IR-DPP-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP21]] to float +; IR-DPP-NEXT: [[TMP25:%.*]] = fadd float [[TMP24]], [[TMP23]] +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP25]] to i32 +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP26]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float +; IR-DPP-NEXT: [[TMP29:%.*]] = bitcast i32 [[TMP26]] to float +; IR-DPP-NEXT: [[TMP30:%.*]] = fadd float [[TMP29]], [[TMP28]] +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast float [[TMP30]] to i32 +; IR-DPP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP31]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float +; IR-DPP-NEXT: [[TMP34:%.*]] = bitcast i32 [[TMP31]] to float +; IR-DPP-NEXT: [[TMP35:%.*]] = fadd float [[TMP34]], [[TMP33]] +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast float [[TMP35]] to i32 +; IR-DPP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 [[TMP36]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast i32 [[TMP37]] to float +; IR-DPP-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP36]] to float +; IR-DPP-NEXT: [[TMP40:%.*]] = fadd float [[TMP39]], [[TMP38]] +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast float [[TMP40]] to i32 +; IR-DPP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 63) +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast i32 [[TMP42]] to float +; IR-DPP-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP43]]) +; IR-DPP-NEXT: [[TMP45:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP48:%.*]] +; IR-DPP: 46: +; IR-DPP-NEXT: [[TMP47:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP44]] syncscope("agent") monotonic, align 4 +; IR-DPP-NEXT: br label [[TMP48]] +; IR-DPP: 48: +; IR-DPP-NEXT: ret void +; + %id.x = call i32 
@llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll @@ -0,0 +1,2929 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn 
-mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s + +declare i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_value: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; 
=>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: .LBB0_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_uni_value: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz .LBB0_3 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX8-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB0_2 +; GFX8-NEXT: .LBB0_3: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: 
global_atomic_fadd_uni_value: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: .LBB0_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_value: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-NEXT: .LBB0_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_value: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 
exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-NEXT: .LBB0_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_value: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-NEXT: .LBB0_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_value: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: 
s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-NEXT: .LBB0_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: buffer_wbinvl1_vol +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: .LBB0_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt 
vmcnt(0) +; GFX1064-DPP-NEXT: buffer_gl0_inv +; GFX1064-DPP-NEXT: buffer_gl1_inv +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-DPP-NEXT: .LBB0_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: buffer_gl0_inv +; GFX1032-DPP-NEXT: buffer_gl1_inv +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-DPP-NEXT: .LBB0_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1164-DPP: ; %bb.0: +; 
GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: buffer_gl0_inv +; GFX1164-DPP-NEXT: buffer_gl1_inv +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-DPP-NEXT: .LBB0_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, 
exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: buffer_gl0_inv +; GFX1132-DPP-NEXT: buffer_gl1_inv +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-DPP-NEXT: .LBB0_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_div_value: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 
v2, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_div_value: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: .LBB1_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s4, s5, s4 +; GFX8-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB1_5 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB1_4: ; %atomicrmw.start +; 
GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB1_4 +; GFX8-NEXT: .LBB1_5: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_div_value: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: .LBB1_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s4, s5, s4 +; GFX9-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], 
s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: .LBB1_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_div_value: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: s_min_u32 s4, s5, s4 +; GFX1064-NEXT: v_readlane_b32 s6, v0, s4 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB1_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; 
GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1064-NEXT: .LBB1_5: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_div_value: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB1_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: 
s_cbranch_execnz .LBB1_4 +; GFX1032-NEXT: .LBB1_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_div_value: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: s_min_u32 s4, s5, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s6, v0, s4 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB1_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 
+; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1164-NEXT: .LBB1_5: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_div_value: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB1_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1132-NEXT: .LBB1_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_div_value: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v2, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v3, s4, v4 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v5, v[3:4], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: buffer_wbinvl1_vol +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: .LBB1_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; 
GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_add_f32_e64 v1, s4, s5 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v5 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v3, v0, v[3:4], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: buffer_gl0_inv +; GFX1064-DPP-NEXT: buffer_gl1_inv +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1064-DPP-NEXT: .LBB1_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v5 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v3, v0, v[3:4], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: buffer_gl0_inv +; GFX1032-DPP-NEXT: buffer_gl1_inv +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1032-DPP-NEXT: .LBB1_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: 
s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: 
v_mov_b32_e32 v4, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v3, v4, v5 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: buffer_gl0_inv +; GFX1164-DPP-NEXT: buffer_gl1_inv +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1164-DPP-NEXT: .LBB1_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v3, v4, v5 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: buffer_gl0_inv +; GFX1132-DPP-NEXT: buffer_gl1_inv +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1132-DPP-NEXT: .LBB1_3: +; GFX1132-DPP-NEXT: s_endpgm + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; 
GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: .LBB2_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz .LBB2_3 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX8-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; 
GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB2_2 +; GFX8-NEXT: .LBB2_3: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: .LBB2_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1064: ; %bb.0: +; 
GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-NEXT: .LBB2_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: .LBB2_2: ; 
%atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-NEXT: .LBB2_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-NEXT: .LBB2_3: 
+; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; 
GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: .LBB2_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-DPP-NEXT: .LBB2_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-DPP-NEXT: .LBB2_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-DPP-NEXT: .LBB2_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: 
s_load_b32 s4, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-DPP-NEXT: .LBB2_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: 
s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: .LBB3_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s4, s5, s4 +; GFX8-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB3_5 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB3_4 +; GFX8-NEXT: .LBB3_5: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s4, s5, s4 +; GFX9-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: s_min_u32 s4, s5, s4 +; GFX1064-NEXT: 
v_readlane_b32 s6, v0, s4 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB3_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1064-NEXT: .LBB3_5: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1032-NEXT: .LBB3_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: s_min_u32 s4, s5, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s6, v0, s4 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 +; 
GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB3_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1164-NEXT: .LBB3_5: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB3_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1132-NEXT: .LBB3_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf 
bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v2, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v3, s4, v4 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v5, v[3:4], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: .LBB3_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf 
bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_add_f32_e64 v1, s4, s5 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v5 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v3, v0, v[3:4], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1064-DPP-NEXT: .LBB3_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: 
global_atomic_fadd_div_value_scope_defalut: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v5 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v3, v0, v[3:4], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1032-DPP-NEXT: .LBB3_3: +; 
GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: 
v_cmpx_eq_u32_e32 0, v3 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v3, v4, v5 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1164-DPP-NEXT: .LBB3_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v3, v4, v5 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1132-DPP-NEXT: .LBB3_3: +; GFX1132-DPP-NEXT: s_endpgm + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue monotonic, align 4 + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX7LESS: ; 
%bb.0: +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: .LBB4_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz .LBB4_3 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX8-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB4_2 +; GFX8-NEXT: .LBB4_3: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-NEXT: .LBB4_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: 
s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-NEXT: .LBB4_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start +; 
GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-NEXT: .LBB4_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-NEXT: .LBB4_3: +; GFX1164-NEXT: 
s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-NEXT: .LBB4_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-DPP-NEXT: s_mov_b64 
s[2:3], 0 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: .LBB4_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-DPP-NEXT: .LBB4_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-DPP-NEXT: .LBB4_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: 
v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-DPP-NEXT: .LBB4_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; 
GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-DPP-NEXT: .LBB4_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_add_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, 
exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: .LBB5_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s4, s5, s4 +; GFX8-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB5_5 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB5_4 +; GFX8-NEXT: .LBB5_5: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX9-NEXT: .LBB5_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s4, s5, s4 +; GFX9-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: .LBB5_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: s_min_u32 s4, s5, s4 +; GFX1064-NEXT: v_readlane_b32 s6, v0, s4 +; 
GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1064-NEXT: .LBB5_5: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: v_add_f32_e32 v2, s4, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; 
GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1032-NEXT: .LBB5_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: s_min_u32 s4, s5, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s6, v0, s4 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: v_add_f32_e32 v2, s6, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; 
GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB5_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1164-NEXT: .LBB5_5: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: 
v_add_f32_e32 v2, s4, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB5_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1132-NEXT: .LBB5_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_nop 0 +; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 
+; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v2, v1 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v2, v1 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s5 +; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_add_f32_e32 v3, s4, v4 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v5, v[3:4], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v4 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: .LBB5_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; 
GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_add_f32_e64 v1, s4, s5 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v5 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v3, v0, v[3:4], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1064-DPP-NEXT: .LBB5_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1032-DPP: 
; %bb.0: +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v5 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v3, v0, v[3:4], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1032-DPP-NEXT: .LBB5_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: 
global_atomic_fadd_div_value_scope_agent: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1164-DPP-NEXT: 
s_cbranch_execz .LBB5_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_add_f32_e32 v3, v4, v5 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1164-DPP-NEXT: .LBB5_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:1 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:2 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:4 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: v_add_f32_dpp v1, v1, v1 row_xmask:8 row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v3 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, s2 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_add_f32_e32 v3, v4, v5 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v3, v0, v[3:4], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v3 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1132-DPP-NEXT: .LBB5_3: +; GFX1132-DPP-NEXT: s_endpgm + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll @@ -0,0 +1,3199 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s +; RUN: llc 
-march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s + +declare i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_value: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; 
GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: .LBB0_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_uni_value: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz .LBB0_3 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB0_2 +; GFX8-NEXT: .LBB0_3: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_value: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB0_2: ; 
%atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: .LBB0_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_value: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-NEXT: .LBB0_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_value: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-NEXT: .LBB0_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_value: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-NEXT: .LBB0_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_value: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; 
GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-NEXT: .LBB0_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: buffer_wbinvl1_vol +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: .LBB0_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: buffer_gl0_inv +; GFX1064-DPP-NEXT: buffer_gl1_inv +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-DPP-NEXT: .LBB0_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; 
GFX1032-DPP-NEXT: buffer_gl0_inv +; GFX1032-DPP-NEXT: buffer_gl1_inv +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-DPP-NEXT: .LBB0_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: buffer_gl0_inv +; GFX1164-DPP-NEXT: buffer_gl1_inv +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 
+; GFX1164-DPP-NEXT: .LBB0_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: buffer_gl0_inv +; GFX1132-DPP-NEXT: buffer_gl1_inv +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-DPP-NEXT: .LBB0_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_div_value: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: 
s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_div_value: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX8-NEXT: .LBB1_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s4, s5, s4 +; GFX8-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], 
exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB1_5 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB1_4 +; GFX8-NEXT: .LBB1_5: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_div_value: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX9-NEXT: .LBB1_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s4, s5, s4 +; GFX9-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e64 v2, s6, s6 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz 
.LBB1_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: .LBB1_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_div_value: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1064-NEXT: s_min_u32 s4, s5, s4 +; GFX1064-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1064-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz 
.LBB1_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1064-NEXT: .LBB1_5: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_div_value: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: v_max_f32_e64 v2, s4, s4 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB1_5 +; 
GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1032-NEXT: .LBB1_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_div_value: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-NEXT: s_min_u32 s4, s5, s4 +; GFX1164-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: v_max_f32_e32 
v1, v1, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB1_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1164-NEXT: .LBB1_5: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_div_value: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e64 v2, s4, s4 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB1_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_max_f32 v2, v1, v1 :: v_dual_mov_b32 v1, s3 +; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 +; 
GFX1132-NEXT: .LBB1_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_div_value: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; 
GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: buffer_wbinvl1_vol +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: .LBB1_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 
row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 0 +; GFX1064-DPP-NEXT: v_max_f32_e64 v1, s4, s4 +; GFX1064-DPP-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1064-DPP-NEXT: 
v_max_f32_e32 v4, v4, v0 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: buffer_gl0_inv +; GFX1064-DPP-NEXT: buffer_gl1_inv +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1064-DPP-NEXT: .LBB1_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; 
GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: buffer_gl0_inv +; GFX1032-DPP-NEXT: buffer_gl1_inv +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1032-DPP-NEXT: .LBB1_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; 
GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: buffer_gl0_inv +; GFX1164-DPP-NEXT: buffer_gl1_inv +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 
exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1164-DPP-NEXT: .LBB1_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_mov_b32 v5, s3 +; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: buffer_gl0_inv +; GFX1132-DPP-NEXT: buffer_gl1_inv +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz 
.LBB1_2 +; GFX1132-DPP-NEXT: .LBB1_3: +; GFX1132-DPP-NEXT: s_endpgm + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: .LBB2_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: 
s_cbranch_execz .LBB2_3 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB2_2 +; GFX8-NEXT: .LBB2_3: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: .LBB2_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: 
global_atomic_fadd_uni_value_scope_defalut: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-NEXT: .LBB2_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] 
glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-NEXT: .LBB2_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-NEXT: .LBB2_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: 
v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: .LBB2_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-DPP-NEXT: .LBB2_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-DPP-NEXT: .LBB2_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) 
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-DPP-NEXT: .LBB2_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-DPP-NEXT: .LBB2_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) { +; 
GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX8-NEXT: .LBB3_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s4, s5, s4 +; GFX8-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, 
v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB3_5 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB3_4 +; GFX8-NEXT: .LBB3_5: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s4, s5, s4 +; GFX9-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e64 v2, s6, s6 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: 
s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1064-NEXT: s_min_u32 s4, s5, s4 +; GFX1064-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1064-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB3_5 
+; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1064-NEXT: .LBB3_5: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: v_max_f32_e64 v2, s4, s4 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1032-NEXT: .LBB3_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-NEXT: s_min_u32 s4, s5, s4 +; GFX1164-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: 
v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB3_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1164-NEXT: .LBB3_5: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1132-NEXT: v_max_f32_e64 v2, s4, s4 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB3_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_max_f32 v2, v1, v1 :: v_dual_mov_b32 v1, s3 +; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1132-NEXT: .LBB3_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 
+; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: .LBB3_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: 
v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 0 +; GFX1064-DPP-NEXT: v_max_f32_e64 v1, s4, s4 +; GFX1064-DPP-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1064-DPP-NEXT: .LBB3_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; 
GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1032-DPP-NEXT: .LBB3_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf 
bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1164-DPP-NEXT: .LBB3_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf 
bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: 
v_dual_max_f32 v0, v0, v0 :: v_dual_mov_b32 v5, s3 +; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1132-DPP-NEXT: .LBB3_3: +; GFX1132-DPP-NEXT: s_endpgm + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue monotonic, align 4 + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: 
s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: .LBB4_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz .LBB4_3 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB4_2 +; GFX8-NEXT: .LBB4_3: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-NEXT: .LBB4_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-NEXT: .LBB4_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1032: ; 
%bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-NEXT: .LBB4_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; 
GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-NEXT: .LBB4_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-NEXT: .LBB4_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: 
v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: .LBB4_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: 
s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-DPP-NEXT: .LBB4_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-DPP-NEXT: .LBB4_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; 
GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-DPP-NEXT: .LBB4_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 
v0, v1, v1 +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-DPP-NEXT: .LBB4_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX8: ; %bb.0: +; GFX8-NEXT: 
s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX8-NEXT: .LBB5_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s4, s5, s4 +; GFX8-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: v_max_f32_e32 v2, v1, v2 +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB5_5 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB5_4 +; GFX8-NEXT: .LBB5_5: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX9-NEXT: .LBB5_1: ; %ComputeLoop +; 
GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s4, s5, s4 +; GFX9-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e64 v2, s6, s6 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: .LBB5_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; 
GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1064-NEXT: s_min_u32 s4, s5, s4 +; GFX1064-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1064-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1064-NEXT: .LBB5_5: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 +; 
GFX1032-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: v_max_f32_e64 v2, s4, s4 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1032-NEXT: .LBB5_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-NEXT: s_min_u32 s4, 
s5, s4 +; GFX1164-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB5_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1164-NEXT: .LBB5_5: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: 
global_atomic_fadd_div_value_scope_agent: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e64 v2, s4, s4 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB5_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_max_f32 v2, v1, v1 :: v_dual_mov_b32 v1, s3 +; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: 
s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1132-NEXT: .LBB5_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: 
s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: .LBB5_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1064-DPP-NEXT: 
v_max_f32_e32 v1, v1, v1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 0 +; GFX1064-DPP-NEXT: v_max_f32_e64 v1, s4, s4 +; GFX1064-DPP-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; 
GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1064-DPP-NEXT: .LBB5_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, 
-1, -1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1032-DPP-NEXT: .LBB5_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: 
v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000 +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 
+; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1164-DPP-NEXT: .LBB5_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: 
v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: 
s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_mov_b32 v5, s3 +; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1132-DPP-NEXT: .LBB5_3: +; GFX1132-DPP-NEXT: s_endpgm + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll @@ -0,0 +1,3199 @@ +; NOTE: Assertions have been 
autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s + +declare i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_value: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: 
s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: .LBB0_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_uni_value: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz .LBB0_3 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB0_2 +; GFX8-NEXT: .LBB0_3: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_value: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; 
GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: .LBB0_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_value: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-NEXT: .LBB0_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_value: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-NEXT: .LBB0_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_value: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; 
GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-NEXT: .LBB0_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_value: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; 
GFX1132-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-NEXT: .LBB0_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: buffer_wbinvl1_vol +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: .LBB0_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: buffer_gl0_inv +; GFX1064-DPP-NEXT: buffer_gl1_inv +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-DPP-NEXT: .LBB0_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: buffer_gl0_inv +; GFX1032-DPP-NEXT: buffer_gl1_inv +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-DPP-NEXT: .LBB0_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: buffer_gl0_inv +; GFX1164-DPP-NEXT: buffer_gl1_inv +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-DPP-NEXT: .LBB0_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: 
global_atomic_fadd_uni_value: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: buffer_gl0_inv +; GFX1132-DPP-NEXT: buffer_gl1_inv +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-DPP-NEXT: .LBB0_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_div_value: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: 
v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_div_value: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX8-NEXT: .LBB1_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s4, s5, s4 +; GFX8-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB1_5 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB1_4 +; GFX8-NEXT: .LBB1_5: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_div_value: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX9-NEXT: .LBB1_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s4, s5, s4 +; GFX9-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e64 v2, s6, s6 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: .LBB1_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_div_value: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1064-NEXT: s_min_u32 s4, s5, s4 +; GFX1064-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1064-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB1_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GFX1064-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1064-NEXT: .LBB1_5: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_div_value: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: v_max_f32_e64 v2, s4, s4 +; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB1_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: 
v_max_f32_e32 v2, v1, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1032-NEXT: .LBB1_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_div_value: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-NEXT: s_min_u32 s4, s5, s4 +; GFX1164-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd 
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB1_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1164-NEXT: .LBB1_5: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_div_value: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 +; 
GFX1132-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e64 v2, s4, s4 +; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB1_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_max_f32 v2, v1, v1 :: v_dual_mov_b32 v1, s3 +; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1132-NEXT: .LBB1_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: 
global_atomic_fadd_div_value: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] 
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: buffer_wbinvl1_vol +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: .LBB1_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, 
v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 0 +; GFX1064-DPP-NEXT: v_max_f32_e64 v1, s4, s4 +; GFX1064-DPP-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: buffer_gl0_inv +; GFX1064-DPP-NEXT: buffer_gl1_inv +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1064-DPP-NEXT: .LBB1_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-DPP-NEXT: v_max_f32_e32 
v2, v2, v2 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: buffer_gl0_inv +; GFX1032-DPP-NEXT: buffer_gl1_inv +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1032-DPP-NEXT: .LBB1_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 
0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, 
v2, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1164-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: buffer_gl0_inv +; GFX1164-DPP-NEXT: buffer_gl1_inv +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1164-DPP-NEXT: .LBB1_3: +; 
GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7f800000 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: 
v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_mov_b32 v5, s3 +; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1132-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: buffer_gl0_inv +; GFX1132-DPP-NEXT: buffer_gl1_inv +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1132-DPP-NEXT: .LBB1_3: +; GFX1132-DPP-NEXT: s_endpgm + %id.x = call i32 
@llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: .LBB2_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz .LBB2_3 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_load_dwordx2 s[0:1], 
s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB2_2 +; GFX8-NEXT: .LBB2_3: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: .LBB2_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: 
v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-NEXT: .LBB2_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 
v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-NEXT: .LBB2_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-NEXT: .LBB2_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-NEXT: ; 
%bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; 
GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: .LBB2_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-DPP-NEXT: .LBB2_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-DPP-NEXT: .LBB2_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 
v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-DPP-NEXT: .LBB2_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-DPP-NEXT: .LBB2_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX8-NEXT: .LBB3_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s4, s5, s4 +; GFX8-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; 
GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB3_5 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB3_4 +; GFX8-NEXT: .LBB3_5: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s4, s5, s4 +; GFX9-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e64 v2, s6, s6 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: ; %bb.3: +; 
GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1064-NEXT: s_min_u32 s4, s5, s4 +; GFX1064-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1064-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB3_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GFX1064-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1064-NEXT: .LBB3_5: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: v_max_f32_e64 v2, s4, s4 +; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1032-NEXT: .LBB3_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-NEXT: s_min_u32 s4, s5, s4 +; GFX1164-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; 
GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB3_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1164-NEXT: .LBB3_5: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e64 v2, s4, s4 +; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 +; 
GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB3_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_max_f32 v2, v1, v1 :: v_dual_mov_b32 v1, s3 +; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1132-NEXT: .LBB3_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; 
GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This 
Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: .LBB3_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; 
GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 0 +; GFX1064-DPP-NEXT: v_max_f32_e64 v1, s4, s4 +; GFX1064-DPP-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1064-DPP-NEXT: .LBB3_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; 
GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; 
GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1032-DPP-NEXT: .LBB3_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; 
GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1164-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1164-DPP-NEXT: .LBB3_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7f800000 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_mov_b32 v5, s3 +; GFX1132-DPP-NEXT: .LBB3_2: ; 
%atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1132-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1132-DPP-NEXT: .LBB3_3: +; GFX1132-DPP-NEXT: s_endpgm + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue monotonic, align 4 + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: 
v_mov_b32_e32 v2, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: .LBB4_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX8: ; %bb.0: +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_cbranch_execz .LBB4_3 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB4_2 +; GFX8-NEXT: .LBB4_3: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-NEXT: .LBB4_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-NEXT: .LBB4_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: 
s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-NEXT: .LBB4_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-NEXT: 
global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-NEXT: .LBB4_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-NEXT: .LBB4_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 
s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: .LBB4_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-DPP-NEXT: .LBB4_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-DPP-NEXT: .LBB4_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-DPP-NEXT: .LBB4_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-DPP-NEXT: v_min_f32_e32 v0, 4.0, v0 +; 
GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-DPP-NEXT: .LBB4_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX7LESS-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; 
GFX8-NEXT: .LBB5_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s4, s5, s4 +; GFX8-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8-NEXT: v_mul_f32_e32 v1, 1.0, v2 +; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s6 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: v_min_f32_e32 v2, v1, v2 +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB5_5 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB5_4 +; GFX8-NEXT: .LBB5_5: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX9-NEXT: .LBB5_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: 
s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s4, s5, s4 +; GFX9-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-NEXT: v_max_f32_e64 v2, s6, s6 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX9-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: .LBB5_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: 
v_max_f32_e32 v1, v1, v1 +; GFX1064-NEXT: s_min_u32 s4, s5, s4 +; GFX1064-NEXT: v_readlane_b32 s5, v0, s4 +; GFX1064-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1064-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1064-NEXT: .LBB5_5: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 +; 
GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: v_max_f32_e64 v2, s4, s4 +; GFX1032-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1032-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1032-NEXT: .LBB5_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-NEXT: s_min_u32 s4, s5, s4 +; GFX1164-NEXT: v_readlane_b32 s5, 
v0, s4 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB5_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_max_f32_e32 v2, v1, v1 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1164-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1164-NEXT: .LBB5_5: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1132: ; %bb.0: +; 
GFX1132-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e64 v2, s4, s4 +; GFX1132-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB5_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_dual_max_f32 v2, v1, v1 :: v_dual_mov_b32 v1, s3 +; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1 +; GFX1132-NEXT: v_min_f32_e32 v0, v0, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1132-NEXT: .LBB5_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3 +; GFX9-DPP-NEXT: v_min_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 
row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX9-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_max_f32_e64 v0, s4, s4 +; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX9-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: .LBB5_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1064-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1064-DPP-NEXT: 
v_min_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 32 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 0 +; GFX1064-DPP-NEXT: v_max_f32_e64 v1, s4, s4 +; GFX1064-DPP-NEXT: v_max_f32_e64 v2, s5, s5 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1064-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1064-DPP-NEXT: .LBB5_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1032-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; 
GFX1032-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1032-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1032-DPP-NEXT: .LBB5_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 
0x7f800000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0x7f800000 +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; 
GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v0, v0 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1164-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1164-DPP-NEXT: .LBB5_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0x7f800000 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 
exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0x7f800000 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0x7f800000 +; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0x7f800000 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2 +; GFX1132-DPP-NEXT: v_min_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_mov_b32 v5, s3 +; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5 +; GFX1132-DPP-NEXT: v_min_f32_e32 v4, v4, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1132-DPP-NEXT: .LBB5_3: +; GFX1132-DPP-NEXT: s_endpgm + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll @@ -0,0 +1,3100 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s +; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP 
-verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s + +declare i32 @llvm.amdgcn.workitem.id.x() +define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_value: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], 
vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2 +; GFX7LESS-NEXT: .LBB0_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_uni_value: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz .LBB0_3 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX8-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB0_2 +; GFX8-NEXT: .LBB0_3: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_value: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: 
s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-NEXT: .LBB0_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_value: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: 
v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-NEXT: .LBB0_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_value: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-NEXT: .LBB0_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_value: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; 
GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-NEXT: .LBB0_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_value: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-NEXT: .LBB0_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: 
s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: buffer_wbinvl1_vol +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX9-DPP-NEXT: .LBB0_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: buffer_gl0_inv +; GFX1064-DPP-NEXT: buffer_gl1_inv +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1064-DPP-NEXT: .LBB0_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1032-DPP: ; 
%bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: buffer_gl0_inv +; GFX1032-DPP-NEXT: buffer_gl1_inv +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1032-DPP-NEXT: .LBB0_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: buffer_gl0_inv +; GFX1164-DPP-NEXT: buffer_gl1_inv +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1164-DPP-NEXT: .LBB0_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: buffer_gl0_inv +; GFX1132-DPP-NEXT: buffer_gl1_inv +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2 +; GFX1132-DPP-NEXT: .LBB0_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_div_value: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; 
GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_div_value: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX8-NEXT: .LBB1_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s4, s5, s4 +; GFX8-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: v_subrev_f32_e32 v2, s6, v2 +; GFX8-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB1_5 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: 
s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB1_4 +; GFX8-NEXT: .LBB1_5: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_div_value: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9-NEXT: .LBB1_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s4, s5, s4 +; GFX9-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: v_subrev_f32_e32 v2, s6, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB1_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB1_4 +; GFX9-NEXT: .LBB1_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_div_value: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 +; 
GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: s_min_u32 s4, s5, s4 +; GFX1064-NEXT: v_readlane_b32 s6, v0, s4 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: v_subrev_f32_e32 v2, s6, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB1_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: buffer_gl0_inv +; GFX1064-NEXT: buffer_gl1_inv +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1064-NEXT: .LBB1_5: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_div_value: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-NEXT: s_mov_b32 s2, 
exec_lo +; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB1_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: buffer_gl0_inv +; GFX1032-NEXT: buffer_gl1_inv +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1032-NEXT: .LBB1_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_div_value: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 
s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: s_min_u32 s4, s5, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s6, v0, s4 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: v_subrev_f32_e32 v2, s6, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB1_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: buffer_gl0_inv +; GFX1164-NEXT: buffer_gl1_inv +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz 
.LBB1_4 +; GFX1164-NEXT: .LBB1_5: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_div_value: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB1_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: buffer_gl0_inv +; GFX1132-NEXT: buffer_gl1_inv +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, 
vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB1_4 +; GFX1132-NEXT: .LBB1_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_div_value: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: 
s_cbranch_execz .LBB1_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_subrev_f32_e32 v4, s4, v5 +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v0, v[4:5], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: buffer_wbinvl1_vol +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX9-DPP-NEXT: .LBB1_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; 
GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_sub_f32_e64 v1, s4, s5 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1064-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: buffer_gl0_inv +; GFX1064-DPP-NEXT: buffer_gl1_inv +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1064-DPP-NEXT: .LBB1_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: 
v_bfrev_b32_e32 v1, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1032-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, 
v6, v[4:5], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: buffer_gl0_inv +; GFX1032-DPP-NEXT: buffer_gl1_inv +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1032-DPP-NEXT: .LBB1_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1164-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: buffer_gl0_inv +; GFX1164-DPP-NEXT: buffer_gl1_inv +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1164-DPP-NEXT: .LBB1_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_div_value: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, 
s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX1132-DPP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: buffer_gl0_inv +; GFX1132-DPP-NEXT: buffer_gl1_inv +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2 +; GFX1132-DPP-NEXT: .LBB1_3: +; GFX1132-DPP-NEXT: s_endpgm + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_defalut(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: 
v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2 +; GFX7LESS-NEXT: .LBB2_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz .LBB2_3 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX8-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: 
v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB2_2 +; GFX8-NEXT: .LBB2_3: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-NEXT: .LBB2_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, 
v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-NEXT: .LBB2_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: 
global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-NEXT: .LBB2_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-NEXT: .LBB2_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: 
s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-NEXT: .LBB2_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: 
s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX9-DPP-NEXT: .LBB2_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, 
s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1064-DPP-NEXT: .LBB2_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1032-DPP-NEXT: .LBB2_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; 
GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1164-DPP-NEXT: .LBB2_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_defalut: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2 +; GFX1132-DPP-NEXT: .LBB2_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 monotonic, align 4 + ret void +} + +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_defalut(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; 
GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX8-NEXT: .LBB3_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s4, s5, s4 +; GFX8-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: v_subrev_f32_e32 v2, s6, v2 +; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB3_5 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB3_4 +; GFX8-NEXT: .LBB3_5: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9-NEXT: .LBB3_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX9-NEXT: s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s4, s5, s4 +; GFX9-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: v_subrev_f32_e32 v2, s6, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB3_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB3_4 +; GFX9-NEXT: .LBB3_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: s_min_u32 s4, s5, s4 +; GFX1064-NEXT: v_readlane_b32 s6, v0, s4 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] 
+; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: v_subrev_f32_e32 v2, s6, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB3_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1064-NEXT: .LBB3_5: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; 
GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB3_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1032-NEXT: .LBB3_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: s_min_u32 s4, s5, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s6, v0, s4 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: v_subrev_f32_e32 v2, s6, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], 
exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB3_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1164-NEXT: .LBB3_5: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 +; GFX1132-NEXT: ; 
%bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB3_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB3_4 +; GFX1132-NEXT: .LBB3_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; 
GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_subrev_f32_e32 v4, s4, v5 +; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v0, v[4:5], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX9-DPP-NEXT: .LBB3_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1064-DPP: ; %bb.0: +; 
GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_sub_f32_e64 v1, s4, s5 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: 
v_mov_b32_e32 v5, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1064-DPP-NEXT: .LBB3_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 +; 
GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1032-DPP-NEXT: .LBB3_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: 
.LBB3_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1164-DPP-NEXT: .LBB3_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_defalut: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2 +; GFX1132-DPP-NEXT: .LBB3_3: +; GFX1132-DPP-NEXT: s_endpgm + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue monotonic, 
align 4 + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_uni_value_scope_agent(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3 +; GFX7LESS-NEXT: ; %bb.1: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2 +; GFX7LESS-NEXT: .LBB4_3: +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_cbranch_execz .LBB4_3 +; GFX8-NEXT: ; %bb.1: +; GFX8-NEXT: s_load_dwordx2 
s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX8-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz .LBB4_2 +; GFX8-NEXT: .LBB4_3: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] 
+; GFX9-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-NEXT: .LBB4_3: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-NEXT: ; %bb.1: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-NEXT: .LBB4_3: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-NEXT: ; %bb.1: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-NEXT: 
s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-NEXT: .LBB4_3: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-NEXT: ; %bb.1: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; 
GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-NEXT: .LBB4_3: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-NEXT: ; %bb.1: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-NEXT: .LBB4_3: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX9-DPP-NEXT: ; 
%bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s5, s[2:3] +; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s5 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX9-DPP-NEXT: .LBB4_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; 
GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1064-DPP-NEXT: .LBB4_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s3 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1032-DPP-NEXT: .LBB4_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-DPP-NEXT: 
v_mbcnt_lo_u32_b32 v0, s2, 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s4 +; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1164-DPP-NEXT: .LBB4_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_uni_value_scope_agent: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s4, exec_lo +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: 
s_bcnt1_i32_b32 s3, s3 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4 +; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2 +; GFX1132-DPP-NEXT: .LBB4_3: +; GFX1132-DPP-NEXT: s_endpgm + %result = atomicrmw fsub ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4 + ret void +} + + +define amdgpu_kernel void @global_atomic_fadd_div_value_scope_agent(ptr addrspace(1) %ptr) { +; GFX7LESS-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX7LESS: ; %bb.0: +; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start +; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7LESS-NEXT: v_sub_f32_e32 v1, v2, v0 +; GFX7LESS-NEXT: s_waitcnt expcnt(0) +; GFX7LESS-NEXT: v_mov_b32_e32 v4, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1 +; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc +; 
GFX7LESS-NEXT: s_waitcnt vmcnt(0) +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2 +; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7LESS-NEXT: v_mov_b32_e32 v2, v3 +; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1 +; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7LESS-NEXT: s_endpgm +; +; GFX8-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX8-NEXT: .LBB5_1: ; %ComputeLoop +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_ff1_i32_b32 s4, s3 +; GFX8-NEXT: s_ff1_i32_b32 s5, s2 +; GFX8-NEXT: s_add_i32 s4, s4, 32 +; GFX8-NEXT: s_min_u32 s4, s5, s4 +; GFX8-NEXT: v_readlane_b32 s6, v0, s4 +; GFX8-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX8-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX8-NEXT: v_subrev_f32_e32 v2, s6, v2 +; GFX8-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX8-NEXT: ; %bb.2: ; %ComputeEnd +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX8-NEXT: s_cbranch_execz .LBB5_5 +; GFX8-NEXT: ; %bb.3: +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX8-NEXT: s_mov_b64 s[2:3], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s4 +; GFX8-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX8-NEXT: v_mov_b32_e32 v3, s0 +; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX8-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v1, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_cbranch_execnz 
.LBB5_4 +; GFX8-NEXT: .LBB5_5: +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9-NEXT: .LBB5_1: ; %ComputeLoop +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_ff1_i32_b32 s4, s3 +; GFX9-NEXT: s_ff1_i32_b32 s5, s2 +; GFX9-NEXT: s_add_i32 s4, s4, 32 +; GFX9-NEXT: s_min_u32 s4, s5, s4 +; GFX9-NEXT: v_readlane_b32 s6, v0, s4 +; GFX9-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX9-NEXT: v_subrev_f32_e32 v2, s6, v2 +; GFX9-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX9-NEXT: ; %bb.2: ; %ComputeEnd +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX9-NEXT: s_cbranch_execz .LBB5_5 +; GFX9-NEXT: ; %bb.3: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v1, s4 +; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v1, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_cbranch_execnz .LBB5_4 +; GFX9-NEXT: .LBB5_5: +; GFX9-NEXT: s_endpgm +; +; GFX1064-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1064-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX1064-NEXT: s_ff1_i32_b32 s4, s3 +; GFX1064-NEXT: s_ff1_i32_b32 s5, s2 +; GFX1064-NEXT: s_add_i32 s4, s4, 32 +; GFX1064-NEXT: s_min_u32 s4, s5, s4 +; GFX1064-NEXT: v_readlane_b32 s6, v0, s4 +; GFX1064-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1064-NEXT: v_subrev_f32_e32 v2, s6, v2 +; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execz .LBB5_5 +; GFX1064-NEXT: ; %bb.3: +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: v_mov_b32_e32 v1, s2 +; GFX1064-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1064-NEXT: v_mov_b32_e32 v1, v0 +; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1064-NEXT: .LBB5_5: +; GFX1064-NEXT: s_endpgm +; +; GFX1032-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: s_ff1_i32_b32 s3, s2 +; GFX1032-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1032-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1032-NEXT: s_andn2_b32 s2, s2, s3 +; 
GFX1032-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1032-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: s_mov_b32 s2, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1032-NEXT: s_cbranch_execz .LBB5_5 +; GFX1032-NEXT: ; %bb.3: +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 +; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc +; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1032-NEXT: v_mov_b32_e32 v1, v0 +; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1032-NEXT: .LBB5_5: +; GFX1032-NEXT: s_endpgm +; +; GFX1164-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1164: ; %bb.0: +; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: s_ctz_i32_b32 s4, s3 +; GFX1164-NEXT: s_ctz_i32_b32 s5, s2 +; GFX1164-NEXT: s_add_i32 s4, s4, 32 +; GFX1164-NEXT: s_min_u32 s4, s5, s4 +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-NEXT: v_readlane_b32 s6, v0, s4 +; GFX1164-NEXT: s_lshl_b64 s[4:5], 1, s4 +; GFX1164-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
| instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 +; GFX1164-NEXT: v_subrev_f32_e32 v2, s6, v2 +; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execz .LBB5_5 +; GFX1164-NEXT: ; %bb.3: +; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-NEXT: v_mov_b32_e32 v1, s2 +; GFX1164-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1164-NEXT: s_waitcnt vmcnt(0) +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 +; GFX1164-NEXT: v_mov_b32_e32 v1, v0 +; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1164-NEXT: .LBB5_5: +; GFX1164-NEXT: s_endpgm +; +; GFX1132-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1132: ; %bb.0: +; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-NEXT: s_mov_b32 s2, exec_lo +; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_ctz_i32_b32 s3, s2 +; GFX1132-NEXT: v_readlane_b32 s4, v0, s3 +; GFX1132-NEXT: s_lshl_b32 s3, 1, s3 +; GFX1132-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX1132-NEXT: s_cmp_lg_u32 s2, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_subrev_f32_e32 v2, s4, v2 +; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1 +; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_mov_b32 s2, 0 +; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 +; GFX1132-NEXT: s_cbranch_execz .LBB5_5 +; GFX1132-NEXT: ; %bb.3: +; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-NEXT: v_mov_b32_e32 v3, 0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 +; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start +; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_sub_f32_e32 v0, v1, v2 +; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc +; GFX1132-NEXT: s_waitcnt vmcnt(0) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1 +; GFX1132-NEXT: v_mov_b32_e32 v1, v0 +; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_cbranch_execnz .LBB5_4 +; GFX1132-NEXT: .LBB5_5: +; GFX1132-NEXT: s_endpgm +; +; GFX9-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX9-DPP: ; %bb.0: +; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v4 +; GFX9-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX9-DPP-NEXT: s_not_b64 exec, exec +; GFX9-DPP-NEXT: 
s_or_saveexec_b64 s[2:3], -1 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v2, v2, v3 +; GFX9-DPP-NEXT: s_nop 1 +; GFX9-DPP-NEXT: v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf +; GFX9-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX9-DPP-NEXT: v_readlane_b32 s4, v1, 63 +; GFX9-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX9-DPP-NEXT: ; %bb.1: +; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: s_load_dword s5, s[0:1], 0x0 +; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DPP-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-DPP-NEXT: v_subrev_f32_e32 v4, s4, v5 +; GFX9-DPP-NEXT: global_atomic_cmpswap v4, v0, v[4:5], s[0:1] glc +; GFX9-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; 
GFX9-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX9-DPP-NEXT: .LBB5_3: +; GFX9-DPP-NEXT: s_endpgm +; +; GFX1064-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1064-DPP: ; %bb.0: +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064-DPP-NEXT: s_not_b64 exec, exec +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1064-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1064-DPP-NEXT: v_readlane_b32 s4, v1, 0 +; GFX1064-DPP-NEXT: v_readlane_b32 s5, v1, 32 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1064-DPP-NEXT: v_sub_f32_e64 v1, s4, s5 +; GFX1064-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3 +; 
GFX1064-DPP-NEXT: ; %bb.1: +; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0 +; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1064-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 +; GFX1064-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3] +; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1064-DPP-NEXT: .LBB5_3: +; GFX1064-DPP-NEXT: s_endpgm +; +; GFX1032-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1032-DPP: ; %bb.0: +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1032-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; 
GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1032-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1032-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1032-DPP-NEXT: ; %bb.1: +; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0 +; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1032-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 +; GFX1032-DPP-NEXT: global_atomic_cmpswap v4, v6, v[4:5], s[0:1] glc +; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2 +; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1032-DPP-NEXT: .LBB5_3: +; GFX1032-DPP-NEXT: s_endpgm +; +; GFX1164-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1164-DPP: ; %bb.0: +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: s_not_b64 exec, exec +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff +; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) 
| instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[2:3], -1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1164-DPP-NEXT: s_mov_b64 exec, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1164-DPP-NEXT: ; %bb.1: +; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; 
GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0 +; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, s2 +; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0 +; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1164-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 +; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5 +; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3] +; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3] +; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1164-DPP-NEXT: .LBB5_3: +; GFX1164-DPP-NEXT: s_endpgm +; +; GFX1132-DPP-LABEL: global_atomic_fadd_div_value_scope_agent: +; GFX1132-DPP: ; %bb.0: +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo +; GFX1132-DPP-NEXT: s_or_saveexec_b32 s2, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v2, v1 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1 +; GFX1132-DPP-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v3 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1 +; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v1, v1, v2 +; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s2 +; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1 +; GFX1132-DPP-NEXT: s_mov_b32 s2, 0 +; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4 +; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3 +; GFX1132-DPP-NEXT: ; %bb.1: +; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0 +; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0) +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, s3 +; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start +; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-DPP-NEXT: v_sub_f32_e32 v4, v5, v0 +; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc +; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0) +; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5 +; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4 +; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2 +; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, 
exec_lo, s2 +; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2 +; GFX1132-DPP-NEXT: .LBB5_3: +; GFX1132-DPP-NEXT: s_endpgm + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fsub ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4 + ret void +}