diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -101,6 +101,8 @@
 
   void visitAtomicRMWInst(AtomicRMWInst &I);
   void visitIntrinsicInst(IntrinsicInst &I);
+
+  bool isScanStrategyIterative();
 };
 
 } // namespace
@@ -133,6 +135,21 @@
       .run(F);
 }
 
+bool AMDGPUAtomicOptimizerImpl::isScanStrategyIterative() {
+  return ScanImpl == ScanOptions::Iterative;
+}
+
+static bool isOpFP(AtomicRMWInst::BinOp Op) {
+  switch (Op) {
+  default:
+    return false;
+  case AtomicRMWInst::FAdd:
+  case AtomicRMWInst::FMax:
+  case AtomicRMWInst::FMin:
+    return true;
+  }
+}
+
 PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
@@ -202,9 +219,17 @@
   case AtomicRMWInst::Min:
   case AtomicRMWInst::UMax:
   case AtomicRMWInst::UMin:
+  case AtomicRMWInst::FAdd:
+  case AtomicRMWInst::FMax:
+  case AtomicRMWInst::FMin:
     break;
   }
 
+  // FP atomics are only supported by the Iterative scan strategy.
+  if (isOpFP(Op) && !isScanStrategyIterative()) {
+    return;
+  }
+
   const unsigned PtrIdx = 0;
   const unsigned ValIdx = 1;
 
@@ -302,6 +327,20 @@
   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
     Op = AtomicRMWInst::UMax;
     break;
+  case Intrinsic::amdgcn_global_atomic_fadd:
+    Op = AtomicRMWInst::FAdd;
+    break;
+  case Intrinsic::amdgcn_global_atomic_fmin:
+    Op = AtomicRMWInst::FMin;
+    break;
+  case Intrinsic::amdgcn_global_atomic_fmax:
+    Op = AtomicRMWInst::FMax;
+    break;
+  }
+
+  // FP atomics are only supported by the Iterative scan strategy.
+  if (isOpFP(Op) && !isScanStrategyIterative()) {
+    return;
   }
 
   const unsigned ValIdx = 0;
@@ -344,6 +383,8 @@
     llvm_unreachable("Unhandled atomic op");
   case AtomicRMWInst::Add:
     return B.CreateBinOp(Instruction::Add, LHS, RHS);
+  case AtomicRMWInst::FAdd:
+    return B.CreateBinOp(Instruction::FAdd, LHS, RHS);
   case AtomicRMWInst::Sub:
     return B.CreateBinOp(Instruction::Sub, LHS, RHS);
   case AtomicRMWInst::And:
@@ -365,6 +406,12 @@
   case AtomicRMWInst::UMin:
     Pred = CmpInst::ICMP_ULT;
     break;
+  case AtomicRMWInst::FMax:
+    Pred = CmpInst::FCMP_UGT;
+    break;
+  case AtomicRMWInst::FMin:
+    Pred = CmpInst::FCMP_ULT;
+    break;
   }
-  Value *Cond = B.CreateICmp(Pred, LHS, RHS);
+  Value *Cond = B.CreateCmp(Pred, LHS, RHS);
   return B.CreateSelect(Cond, LHS, RHS);
@@ -554,11 +601,15 @@
   // Use llvm.cttz instrinsic to find the lowest remaining active lane.
   auto *FF1 =
       B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
-  auto *LaneIdxInt = B.CreateTrunc(FF1, Ty);
+  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
 
   // Get the value required for atomic operation
-  auto *LaneValue =
+  if (Ty->isFloatingPointTy())
+    V = B.CreateBitCast(V, B.getInt32Ty());
+  Value *LaneValue =
       B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
+  if (Ty->isFloatingPointTy())
+    LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
   // kernel computations
@@ -590,6 +641,18 @@
   return {OldValue, NewAccumulator};
 }
 
+static APFloat getIdentityValueForFAtomicOp(AtomicRMWInst::BinOp Op) {
+  switch (Op) {
+  default:
+    llvm_unreachable("Unhandled atomic op");
+  case AtomicRMWInst::FAdd:
+  case AtomicRMWInst::FMax:
+    return APFloat::getSmallest(APFloat::IEEEsingle(), false);
+  case AtomicRMWInst::FMin:
+    return APFloat::getLargest(APFloat::IEEEsingle(), false);
+  }
+}
+
 static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                          unsigned BitWidth) {
   switch (Op) {
@@ -679,17 +742,22 @@
     Mbcnt =
         B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
   }
-  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);
+  Mbcnt = B.CreateIntCast(Mbcnt, B.getInt32Ty(), false);
 
-  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
+  Function *F = I.getFunction();
+  LLVMContext &C = F->getContext();
+  Value *Identity;
+  if (Ty->isIntegerTy()) {
+    Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
+  } else if (Ty->isFloatingPointTy()) {
+    Identity = ConstantFP::get(C, getIdentityValueForFAtomicOp(Op));
+  }
 
   Value *ExclScan = nullptr;
   Value *NewV = nullptr;
 
   const bool NeedResult = !I.use_empty();
 
-  Function *F = I.getFunction();
-  LLVMContext &C = F->getContext();
   BasicBlock *ComputeLoop = nullptr;
   BasicBlock *ComputeEnd = nullptr;
   // If we have a divergent value in each lane, we need to combine the value
@@ -746,13 +814,22 @@
     NewV = buildMul(B, V, Ctpop);
     break;
   }
-
+  case AtomicRMWInst::FAdd: {
+    Value *const Ctpop =
+        B.CreateIntCast(B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot),
+                        B.getInt32Ty(), false);
+    Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
+    NewV = B.CreateFMul(V, CtpopFP);
+    break;
+  }
   case AtomicRMWInst::And:
   case AtomicRMWInst::Or:
   case AtomicRMWInst::Max:
   case AtomicRMWInst::Min:
   case AtomicRMWInst::UMax:
   case AtomicRMWInst::UMin:
+  case AtomicRMWInst::FMin:
+  case AtomicRMWInst::FMax:
     // These operations with a uniform value are idempotent: doing the atomic
     // operation multiple times has the same effect as doing it once.
     NewV = V;
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) #0 {
+; IR-LABEL: @global_atomic_fadd_uni_value(
+; IR-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32>
+; IR-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; IR-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0)
+; IR-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]])
+; IR-NEXT:    [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]])
+; IR-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
+; IR-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
+; IR-NEXT:    [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]]
+; IR-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0
+; IR-NEXT:    br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]]
+; IR:       12:
+; IR-NEXT:    [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
+; IR-NEXT:    br label [[TMP14]]
+; IR:       14:
+; IR-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst
+  ret void
+}
+
+
+define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #0 {
+; IR-LABEL: @global_atomic_fadd_div_value(
+; IR-NEXT:    [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; IR-NEXT:    [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float
+; IR-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32>
+; IR-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; IR-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0)
+; IR-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]])
+; IR-NEXT:    [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    br label [[COMPUTELOOP:%.*]]
+; IR:       8:
+; IR-NEXT:    [[TMP9:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4
+; IR-NEXT:    br label [[TMP10:%.*]]
+; IR:       10:
+; IR-NEXT:    ret void
+; IR:       ComputeLoop:
+; IR-NEXT:    [[ACCUMULATOR:%.*]] = phi float [ 0x36A0000000000000, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ]
+; IR-NEXT:    [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ]
+; IR-NEXT:    [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
+; IR-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
+; IR-NEXT:    [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32
+; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]])
+; IR-NEXT:    [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float
+; IR-NEXT:    [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]]
+; IR-NEXT:    [[TMP17:%.*]] = shl i64 1, [[TMP11]]
+; IR-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP17]], -1
+; IR-NEXT:    [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]]
+; IR-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0
+; IR-NEXT:    br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR:       ComputeEnd:
+; IR-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0
+; IR-NEXT:    br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]]
+;
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %divValue = bitcast i32 %id.x to float
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue seq_cst
+  ret void
+}
+
+attributes #0 = {"target-cpu"="gfx906"}