diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -207,6 +207,8 @@ case AtomicRMWInst::UMin: case AtomicRMWInst::FAdd: case AtomicRMWInst::FSub: + case AtomicRMWInst::FMax: + case AtomicRMWInst::FMin: break; } @@ -315,6 +317,12 @@ case Intrinsic::amdgcn_global_atomic_fadd: Op = AtomicRMWInst::FAdd; break; + case Intrinsic::amdgcn_global_atomic_fmax: + Op = AtomicRMWInst::FMax; + break; + case Intrinsic::amdgcn_global_atomic_fmin: + Op = AtomicRMWInst::FMin; + break; } // Only 32-bit floating point atomic ops are supported. @@ -325,7 +333,8 @@ unsigned ValIdx = 0; // TODO: Operand order is not consistent for atomic fadd intrinsics - if (Op == AtomicRMWInst::FAdd) { + if (Op == AtomicRMWInst::FAdd || Op == AtomicRMWInst::FMax || + Op == AtomicRMWInst::FMin) { ValIdx = 1; } @@ -392,6 +401,10 @@ case AtomicRMWInst::UMin: Pred = CmpInst::ICMP_ULT; break; + case AtomicRMWInst::FMax: + return B.CreateMaxNum(LHS, RHS); + case AtomicRMWInst::FMin: + return B.CreateMinNum(LHS, RHS); } Value *Cond = B.CreateICmp(Pred, LHS, RHS); return B.CreateSelect(Cond, LHS, RHS); @@ -712,6 +725,11 @@ return APFloat::getZero(Semantics, false); case AtomicRMWInst::FSub: return APFloat::getZero(Semantics, true); + case AtomicRMWInst::FMin: + case AtomicRMWInst::FMax: + // atomicrmw fmin/fmax behave like llvm.minnum/maxnum, which return the + // other operand when one is NaN, so NaN (not +/-inf) is the identity. + return APFloat::getNaN(Semantics); } } @@ -906,6 +924,8 @@ case AtomicRMWInst::Min: case AtomicRMWInst::UMax: case AtomicRMWInst::UMin: + case AtomicRMWInst::FMin: + case AtomicRMWInst::FMax: // These operations with a uniform value are idempotent: doing the atomic // operation multiple times has the same effect as doing it once. 
NewV = V; diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll --- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll @@ -282,4 +282,266 @@ ret void } +define amdgpu_kernel void @global_atomic_fmin_uni_value(ptr addrspace(1) %ptr) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_value( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP10:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float 4.000000e+00 seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP10]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fmin_uni_value( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP10:%.*]] +; IR-DPP: 8: +; IR-DPP-NEXT: [[TMP9:%.*]] = 
atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float 4.000000e+00 seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP10]] +; IR-DPP: 10: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fmin ptr addrspace(1) %ptr, float 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fmin_div_value(ptr addrspace(1) %ptr) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_value( +; IR-ITERATIVE-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-ITERATIVE-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0x7FF8000000000000, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: 
[[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-ITERATIVE-NEXT: [[TMP16]] = call float @llvm.minnum.f32(float [[ACCUMULATOR]], float [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 +; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]] +; +; IR-DPP-LABEL: @global_atomic_fmin_div_value( +; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-DPP-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 2143289344) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast float [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2143289344, i32 [[TMP11]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast i32 [[TMP12]] to float +; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP11]] to float +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.minnum.f32(float [[TMP14]], float [[TMP13]]) +; 
IR-DPP-NEXT: [[TMP16:%.*]] = bitcast float [[TMP15]] to i32 +; IR-DPP-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2143289344, i32 [[TMP16]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-DPP-NEXT: [[TMP19:%.*]] = bitcast i32 [[TMP16]] to float +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.minnum.f32(float [[TMP19]], float [[TMP18]]) +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast float [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2143289344, i32 [[TMP21]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast i32 [[TMP22]] to float +; IR-DPP-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP21]] to float +; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.minnum.f32(float [[TMP24]], float [[TMP23]]) +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP25]] to i32 +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2143289344, i32 [[TMP26]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float +; IR-DPP-NEXT: [[TMP29:%.*]] = bitcast i32 [[TMP26]] to float +; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.minnum.f32(float [[TMP29]], float [[TMP28]]) +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast float [[TMP30]] to i32 +; IR-DPP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2143289344, i32 [[TMP31]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float +; IR-DPP-NEXT: [[TMP34:%.*]] = bitcast i32 [[TMP31]] to float +; IR-DPP-NEXT: [[TMP35:%.*]] = call float @llvm.minnum.f32(float [[TMP34]], float [[TMP33]]) +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast float [[TMP35]] to i32 +; IR-DPP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2143289344, i32 [[TMP36]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast i32 [[TMP37]] to float +; IR-DPP-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP36]] to float +; 
IR-DPP-NEXT: [[TMP40:%.*]] = call float @llvm.minnum.f32(float [[TMP39]], float [[TMP38]]) +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast float [[TMP40]] to i32 +; IR-DPP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 63) +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast i32 [[TMP42]] to float +; IR-DPP-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP43]]) +; IR-DPP-NEXT: [[TMP45:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP48:%.*]] +; IR-DPP: 46: +; IR-DPP-NEXT: [[TMP47:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[TMP44]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP48]] +; IR-DPP: 48: +; IR-DPP-NEXT: ret void +; + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fmin ptr addrspace(1) %ptr, float %divValue seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_uni_value(ptr addrspace(1) %ptr) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fmax_uni_value( +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP10:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float 4.000000e+00 seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP10]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; +; IR-DPP-LABEL: @global_atomic_fmax_uni_value( +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 
@llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP7]], label [[TMP8:%.*]], label [[TMP10:%.*]] +; IR-DPP: 8: +; IR-DPP-NEXT: [[TMP9:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float 4.000000e+00 seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP10]] +; IR-DPP: 10: +; IR-DPP-NEXT: ret void +; + %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 seq_cst + ret void +} + +define amdgpu_kernel void @global_atomic_fmax_div_value(ptr addrspace(1) %ptr) #0 { +; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_value( +; IR-ITERATIVE-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-ITERATIVE-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-ITERATIVE-NEXT: br label [[COMPUTELOOP:%.*]] +; IR-ITERATIVE: 8: +; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4 +; IR-ITERATIVE-NEXT: br label [[TMP10:%.*]] +; IR-ITERATIVE: 10: +; IR-ITERATIVE-NEXT: ret void +; IR-ITERATIVE: 
ComputeLoop: +; IR-ITERATIVE-NEXT: [[ACCUMULATOR:%.*]] = phi float [ 0x7FF8000000000000, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ] +; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true) +; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32 +; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]]) +; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float +; IR-ITERATIVE-NEXT: [[TMP16]] = call float @llvm.maxnum.f32(float [[ACCUMULATOR]], float [[TMP15]]) +; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = shl i64 1, [[TMP11]] +; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = xor i64 [[TMP17]], -1 +; IR-ITERATIVE-NEXT: [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]] +; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]] +; IR-ITERATIVE: ComputeEnd: +; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-ITERATIVE-NEXT: br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]] +; +; IR-DPP-LABEL: @global_atomic_fmax_div_value( +; IR-DPP-NEXT: [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; IR-DPP-NEXT: [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float +; IR-DPP-NEXT: [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) +; IR-DPP-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; IR-DPP-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0 +; IR-DPP-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1 +; IR-DPP-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0) +; IR-DPP-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]]) +; IR-DPP-NEXT: [[TMP7:%.*]] = bitcast float [[DIVVALUE]] to i32 +; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 
@llvm.amdgcn.set.inactive.i32(i32 [[TMP7]], i32 2143289344) +; IR-DPP-NEXT: [[TMP9:%.*]] = bitcast i32 [[TMP8]] to float +; IR-DPP-NEXT: [[TMP10:%.*]] = bitcast i32 [[TMP7]] to float +; IR-DPP-NEXT: [[TMP11:%.*]] = bitcast float [[TMP9]] to i32 +; IR-DPP-NEXT: [[TMP12:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2143289344, i32 [[TMP11]], i32 273, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP13:%.*]] = bitcast i32 [[TMP12]] to float +; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast i32 [[TMP11]] to float +; IR-DPP-NEXT: [[TMP15:%.*]] = call float @llvm.maxnum.f32(float [[TMP14]], float [[TMP13]]) +; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast float [[TMP15]] to i32 +; IR-DPP-NEXT: [[TMP17:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2143289344, i32 [[TMP16]], i32 274, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast i32 [[TMP17]] to float +; IR-DPP-NEXT: [[TMP19:%.*]] = bitcast i32 [[TMP16]] to float +; IR-DPP-NEXT: [[TMP20:%.*]] = call float @llvm.maxnum.f32(float [[TMP19]], float [[TMP18]]) +; IR-DPP-NEXT: [[TMP21:%.*]] = bitcast float [[TMP20]] to i32 +; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2143289344, i32 [[TMP21]], i32 276, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP23:%.*]] = bitcast i32 [[TMP22]] to float +; IR-DPP-NEXT: [[TMP24:%.*]] = bitcast i32 [[TMP21]] to float +; IR-DPP-NEXT: [[TMP25:%.*]] = call float @llvm.maxnum.f32(float [[TMP24]], float [[TMP23]]) +; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast float [[TMP25]] to i32 +; IR-DPP-NEXT: [[TMP27:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2143289344, i32 [[TMP26]], i32 280, i32 15, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP28:%.*]] = bitcast i32 [[TMP27]] to float +; IR-DPP-NEXT: [[TMP29:%.*]] = bitcast i32 [[TMP26]] to float +; IR-DPP-NEXT: [[TMP30:%.*]] = call float @llvm.maxnum.f32(float [[TMP29]], float [[TMP28]]) +; IR-DPP-NEXT: [[TMP31:%.*]] = bitcast float [[TMP30]] to i32 +; IR-DPP-NEXT: [[TMP32:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2143289344, i32 
[[TMP31]], i32 322, i32 10, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP33:%.*]] = bitcast i32 [[TMP32]] to float +; IR-DPP-NEXT: [[TMP34:%.*]] = bitcast i32 [[TMP31]] to float +; IR-DPP-NEXT: [[TMP35:%.*]] = call float @llvm.maxnum.f32(float [[TMP34]], float [[TMP33]]) +; IR-DPP-NEXT: [[TMP36:%.*]] = bitcast float [[TMP35]] to i32 +; IR-DPP-NEXT: [[TMP37:%.*]] = call i32 @llvm.amdgcn.update.dpp.i32(i32 2143289344, i32 [[TMP36]], i32 323, i32 12, i32 15, i1 false) +; IR-DPP-NEXT: [[TMP38:%.*]] = bitcast i32 [[TMP37]] to float +; IR-DPP-NEXT: [[TMP39:%.*]] = bitcast i32 [[TMP36]] to float +; IR-DPP-NEXT: [[TMP40:%.*]] = call float @llvm.maxnum.f32(float [[TMP39]], float [[TMP38]]) +; IR-DPP-NEXT: [[TMP41:%.*]] = bitcast float [[TMP40]] to i32 +; IR-DPP-NEXT: [[TMP42:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP41]], i32 63) +; IR-DPP-NEXT: [[TMP43:%.*]] = bitcast i32 [[TMP42]] to float +; IR-DPP-NEXT: [[TMP44:%.*]] = call float @llvm.amdgcn.strict.wwm.f32(float [[TMP43]]) +; IR-DPP-NEXT: [[TMP45:%.*]] = icmp eq i32 [[TMP6]], 0 +; IR-DPP-NEXT: br i1 [[TMP45]], label [[TMP46:%.*]], label [[TMP48:%.*]] +; IR-DPP: 46: +; IR-DPP-NEXT: [[TMP47:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[TMP44]] seq_cst, align 4 +; IR-DPP-NEXT: br label [[TMP48]] +; IR-DPP: 48: +; IR-DPP-NEXT: ret void +; + %id.x = call i32 @llvm.amdgcn.workitem.id.x() + %divValue = bitcast i32 %id.x to float + %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue seq_cst + ret void +} + attributes #0 = {"target-cpu"="gfx906"}