diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -101,6 +101,8 @@
   void visitAtomicRMWInst(AtomicRMWInst &I);
   void visitIntrinsicInst(IntrinsicInst &I);
+
+  bool isScanStrategyIterative();
 };
 
 } // namespace
@@ -133,6 +135,10 @@
       .run(F);
 }
 
+bool AMDGPUAtomicOptimizerImpl::isScanStrategyIterative() {
+  return ScanImpl == ScanOptions::Iterative;
+}
+
 PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
                                                  FunctionAnalysisManager &AM) {
@@ -202,9 +208,19 @@
   case AtomicRMWInst::Min:
   case AtomicRMWInst::UMax:
   case AtomicRMWInst::UMin:
+  case AtomicRMWInst::FAdd:
+  case AtomicRMWInst::FSub:
     break;
   }
 
+  // FP atomics are supported only by the Iterative strategy.
+  if (AtomicRMWInst::isFPOperation(Op)) {
+    // TODO: Add support for the double type.
+    if (!isScanStrategyIterative() || I.getType()->isDoubleTy()) {
+      return;
+    }
+  }
+
   const unsigned PtrIdx = 0;
   const unsigned ValIdx = 1;
@@ -302,9 +318,24 @@
   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
     Op = AtomicRMWInst::UMax;
     break;
+  case Intrinsic::amdgcn_global_atomic_fadd:
+    Op = AtomicRMWInst::FAdd;
+    break;
   }
 
-  const unsigned ValIdx = 0;
+  // FP atomics are supported only by the Iterative strategy.
+  if (AtomicRMWInst::isFPOperation(Op)) {
+    // TODO: Add support for the double type.
+    if (!isScanStrategyIterative() || I.getType()->isDoubleTy()) {
+      return;
+    }
+  }
+
+  unsigned ValIdx = 0;
+
+  // TODO: Operand order is not consistent for atomic fadd intrinsics.
+  if (Op == AtomicRMWInst::FAdd) {
+    ValIdx = 1;
+  }
 
   const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
@@ -344,8 +375,12 @@
     llvm_unreachable("Unhandled atomic op");
   case AtomicRMWInst::Add:
     return B.CreateBinOp(Instruction::Add, LHS, RHS);
+  case AtomicRMWInst::FAdd:
+    return B.CreateBinOp(Instruction::FAdd, LHS, RHS);
   case AtomicRMWInst::Sub:
     return B.CreateBinOp(Instruction::Sub, LHS, RHS);
+  case AtomicRMWInst::FSub:
+    return B.CreateBinOp(Instruction::FSub, LHS, RHS);
   case AtomicRMWInst::And:
     return B.CreateBinOp(Instruction::And, LHS, RHS);
   case AtomicRMWInst::Or:
@@ -554,18 +589,32 @@
   // Use the llvm.cttz intrinsic to find the lowest remaining active lane.
   auto *FF1 =
       B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
-  auto *LaneIdxInt = B.CreateTrunc(FF1, Ty);
+  auto *LaneIdxInt = B.CreateTrunc(FF1, B.getInt32Ty());
 
   // Get the value required for atomic operation
-  auto *LaneValue =
+  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
+  if (isAtomicFloatingPointTy)
+    V = B.CreateBitCast(V, B.getInt32Ty());
+  Value *LaneValue =
       B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
+  if (isAtomicFloatingPointTy)
+    LaneValue = B.CreateBitCast(LaneValue, Ty);
 
   // Perform writelane if intermediate scan results are required later in the
   // kernel computations
   Value *OldValue = nullptr;
   if (NeedResult) {
-    OldValue = B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
-                                 {Accumulator, LaneIdxInt, OldValuePhi});
+    Value *AccumulatorBitCasted = Accumulator;
+    Value *OldValuePhiBitCasted = OldValuePhi;
+    if (isAtomicFloatingPointTy) {
+      AccumulatorBitCasted = B.CreateBitCast(Accumulator, B.getInt32Ty());
+      OldValuePhiBitCasted = B.CreateBitCast(OldValuePhi, B.getInt32Ty());
+    }
+    OldValue = B.CreateIntrinsic(
+        Intrinsic::amdgcn_writelane, {},
+        {AccumulatorBitCasted, LaneIdxInt, OldValuePhiBitCasted});
+    if (isAtomicFloatingPointTy)
+      OldValue = B.CreateBitCast(OldValue, Ty);
     OldValuePhi->addIncoming(OldValue, ComputeLoop);
   }
@@ -590,6 +639,18 @@
   return {OldValue, NewAccumulator};
 }
 
+static APFloat getIdentityValueForFAtomicOp(AtomicRMWInst::BinOp Op,
+                                            const fltSemantics &Semantics) {
+  switch (Op) {
+  default:
+    llvm_unreachable("Unhandled atomic op");
+  case AtomicRMWInst::FAdd:
+    return APFloat::getZero(Semantics, false);
+  case AtomicRMWInst::FSub:
+    return APFloat::getZero(Semantics, true);
+  }
+}
+
 static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
                                          unsigned BitWidth) {
   switch (Op) {
@@ -679,17 +740,25 @@
     Mbcnt =
         B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
   }
-  Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);
+  bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
+  if (!isAtomicFloatingPointTy)
+    Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);
 
-  Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
+  Function *F = I.getFunction();
+  LLVMContext &C = F->getContext();
+  Value *Identity;
+  if (Ty->isIntegerTy()) {
+    Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
+  } else if (isAtomicFloatingPointTy) {
+    const fltSemantics &Semantics = Ty->getScalarType()->getFltSemantics();
+    Identity = ConstantFP::get(C, getIdentityValueForFAtomicOp(Op, Semantics));
+  }
 
   Value *ExclScan = nullptr;
   Value *NewV = nullptr;
 
   const bool NeedResult = !I.use_empty();
 
-  Function *F = I.getFunction();
-  LLVMContext &C = F->getContext();
   BasicBlock *ComputeLoop = nullptr;
   BasicBlock *ComputeEnd = nullptr;
   // If we have a divergent value in each lane, we need to combine the value
@@ -746,13 +815,23 @@
       NewV = buildMul(B, V, Ctpop);
       break;
     }
-
+    case AtomicRMWInst::FAdd:
+    case AtomicRMWInst::FSub: {
+      Value *const Ctpop =
+          B.CreateIntCast(B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot),
+                          B.getInt32Ty(), false);
+      Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
+      NewV = B.CreateFMul(V, CtpopFP);
+      break;
+    }
     case AtomicRMWInst::And:
    case AtomicRMWInst::Or:
    case AtomicRMWInst::Max:
    case AtomicRMWInst::Min:
    case AtomicRMWInst::UMax:
    case AtomicRMWInst::UMin:
+    case AtomicRMWInst::FMin:
+    case AtomicRMWInst::FMax:
      // These operations with a uniform value are idempotent: doing the atomic
      // operation multiple times has the same effect as doing it once.
      NewV = V;
@@ -830,7 +909,7 @@
 
   if (NeedResult) {
     // Create a PHI node to get our new atomic result into the exit block.
-    PHINode *const PHI = B.CreatePHI(Ty, 2);
+    PHINode *PHI = B.CreatePHI(Ty, 2);
     PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
     PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
 
@@ -853,8 +932,13 @@
         B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
     BroadcastI = B.CreateBitCast(Insert, Ty);
   } else if (TyBitWidth == 32) {
-
-    BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
+    Value *CastedPhi = PHI;
+    if (isAtomicFloatingPointTy)
+      CastedPhi = B.CreateBitCast(PHI, B.getInt32Ty());
+    BroadcastI =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, CastedPhi);
+    if (isAtomicFloatingPointTy)
+      BroadcastI = B.CreateBitCast(BroadcastI, Ty);
   } else {
     llvm_unreachable("Unhandled atomic bit width");
   }
@@ -892,6 +976,12 @@
     case AtomicRMWInst::Xor:
      LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
      break;
+    case AtomicRMWInst::FAdd:
+    case AtomicRMWInst::FSub: {
+      Value *const MbcntFP = B.CreateUIToFP(Mbcnt, Ty);
+      LaneOffset = B.CreateFMul(V, MbcntFP);
+      break;
+    }
    }
  }
  Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-no-rtn.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX908_GFX11 %s
 
 define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
 ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/global-atomic-fadd.f32-rtn.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=instruction-select -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
 
 define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
 ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.fadd.ll
@@ -61,22 +61,42 @@
 define amdgpu_kernel void @global_atomic_fadd_f32_off_ss(ptr addrspace(1) %ptr, float %data) {
 ; GFX908-LABEL: global_atomic_fadd_f32_off_ss:
 ; GFX908:       ; %bb.0:
-; GFX908-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX908-NEXT:    s_mov_b64 s[0:1], exec
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX908-NEXT:    s_cbranch_execz .LBB3_2
+; GFX908-NEXT:  ; %bb.1:
+; GFX908-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX908-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX908-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v0, s0
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v0, s2
-; GFX908-NEXT:    global_atomic_add_f32 v1, v0, s[0:1] offset:2048
+; GFX908-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX908-NEXT:    global_atomic_add_f32 v1, v0, s[2:3] offset:2048
+; GFX908-NEXT:  .LBB3_2:
 ; GFX908-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: global_atomic_fadd_f32_off_ss:
 ; GFX90A:       ; %bb.0:
-; GFX90A-NEXT:    s_load_dword s2, s[4:5], 0x8
-; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX90A-NEXT:    s_mov_b64 s[0:1], exec
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX90A-NEXT:    s_cbranch_execz .LBB3_2
+; GFX90A-NEXT:  ; %bb.1:
+; GFX90A-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x0
+; GFX90A-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v0, s0
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s2
-; GFX90A-NEXT:    global_atomic_add_f32 v1, v0, s[0:1] offset:2048
+; GFX90A-NEXT:    v_mul_f32_e32 v0, s6, v0
+; GFX90A-NEXT:    global_atomic_add_f32 v1, v0, s[2:3] offset:2048
+; GFX90A-NEXT:  .LBB3_2:
 ; GFX90A-NEXT:    s_endpgm
   %gep = getelementptr float, ptr addrspace(1) %ptr, i64 512
   %ret = call float @llvm.amdgcn.global.atomic.fadd.f32.p1.f32(ptr addrspace(1) %gep, float %data)
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx908.ll
@@ -30,15 +30,26 @@
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GCN-NEXT:    s_cbranch_execz .LBB0_2
+; GCN-NEXT:    s_cbranch_execz .LBB0_4
 ; GCN-NEXT:  ; %bb.1: ; %if
+; GCN-NEXT:    s_mov_b64 s[8:9], exec
+; GCN-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32 v0, s9, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GCN-NEXT:    s_cbranch_execz .LBB0_3
+; GCN-NEXT:  ; %bb.2:
+; GCN-NEXT:    s_bcnt1_i32_b64 s8, s[8:9]
+; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v1, s8
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    v_mov_b32_e32 v1, 2.0
+; GCN-NEXT:    v_add_f32_e32 v1, v1, v1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    global_atomic_add_f32 v0, v1, s[2:3] offset:28
+; GCN-NEXT:  .LBB0_3:
+; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GCN-NEXT:    global_load_dword v0, v[0:1], off glc
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:  .LBB0_2: ; %endif
+; GCN-NEXT:  .LBB0_4: ; %Flow
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x3d0000
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-no-rtn.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
-; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX908_GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX908_GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX908_GFX11 %s
 
 define amdgpu_ps void @global_atomic_fadd_f32_no_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
 ; GFX908_GFX11-LABEL: name: global_atomic_fadd_f32_no_rtn_intrinsic
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
--- a/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomic-fadd.f32-rtn.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
-; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX90A_GFX940 %s
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -stop-after=amdgpu-isel -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -check-prefix=GFX11 %s
 
 define amdgpu_ps float @global_atomic_fadd_f32_rtn_intrinsic(ptr addrspace(1) %ptr, float %data) {
 ; GFX90A_GFX940-LABEL: name: global_atomic_fadd_f32_rtn_intrinsic
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -203,13 +203,29 @@
 ;
 ; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee:
 ; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b64 s[4:5], exec
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT:    ; implicit-def: $vgpr1
+; GFX90A-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX90A-NEXT:    s_cbranch_execz .LBB1_2
+; GFX90A-NEXT:  ; %bb.1:
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NEXT:    v_mul_f32_e32 v2, 4.0, v2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_add_f32 v0, v0, v1, s[0:1] glc
+; GFX90A-NEXT:    global_atomic_add_f32 v1, v1, v2, s[0:1] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:  .LBB1_2:
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX90A-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX90A-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX90A-NEXT:    v_add_f32_e32 v0, s0, v0
 ; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX90A-NEXT:    s_endpgm
 ;
@@ -243,14 +259,29 @@
 ;
 ; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee:
 ; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX11-NEXT:    ; implicit-def: $vgpr1
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB1_2
+; GFX11-NEXT:  ; %bb.1:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
+; GFX11-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, s3
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mul_f32 v1, 4.0, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_add_f32 v0, v0, v1, s[0:1] glc
+; GFX11-NEXT:    global_atomic_add_f32 v1, v2, v1, s[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:  .LBB1_2:
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX11-NEXT:    v_add_f32_e32 v0, s0, v0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -287,24 +318,44 @@
 ;
 ; GFX908-LABEL: global_atomic_fadd_noret_f32:
 ; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_mov_b64 s[2:3], exec
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT:    s_cbranch_execz .LBB2_2
+; GFX908-NEXT:  ; %bb.1:
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 0
-; GFX908-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT:    v_mul_f32_e32 v1, 4.0, v1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:  .LBB2_2:
 ; GFX908-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: global_atomic_fadd_noret_f32:
 ; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b64 s[2:3], exec
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT:    s_cbranch_execz .LBB2_2
+; GFX90A-NEXT:  ; %bb.1:
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT:    v_mul_f32_e32 v1, 4.0, v1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:  .LBB2_2:
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: global_atomic_fadd_noret_f32:
@@ -335,14 +386,23 @@
 ;
 ; GFX11-LABEL: global_atomic_fadd_noret_f32:
 ; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB2_2
+; GFX11-NEXT:  ; %bb.1:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
+; GFX11-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX11-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:  .LBB2_2:
 ; GFX11-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   ret void
@@ -375,24 +435,44 @@
 ;
 ; GFX908-LABEL: global_atomic_fadd_noret_f32_ieee:
 ; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_mov_b64 s[2:3], exec
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT:    s_cbranch_execz .LBB3_2
+; GFX908-NEXT:  ; %bb.1:
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 0
-; GFX908-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX908-NEXT:    v_mul_f32_e32 v1, 4.0, v1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:  .LBB3_2:
 ; GFX908-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: global_atomic_fadd_noret_f32_ieee:
 ; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b64 s[2:3], exec
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT:    s_cbranch_execz .LBB3_2
+; GFX90A-NEXT:  ; %bb.1:
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT:    v_mul_f32_e32 v1, 4.0, v1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:  .LBB3_2:
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee:
@@ -423,14 +503,23 @@
 ;
 ; GFX11-LABEL: global_atomic_fadd_noret_f32_ieee:
 ; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB3_2
+; GFX11-NEXT:  ; %bb.1:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
+; GFX11-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX11-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:  .LBB3_2:
 ; GFX11-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   ret void
@@ -491,13 +580,28 @@
 ;
 ; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent:
 ; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b64 s[4:5], exec
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT:    ; implicit-def: $vgpr1
+; GFX90A-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX90A-NEXT:    s_cbranch_execz .LBB4_2
+; GFX90A-NEXT:  ; %bb.1:
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NEXT:    v_mul_f32_e32 v2, 4.0, v2
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_atomic_add_f32 v0, v0, v1, s[0:1] glc
+; GFX90A-NEXT:    global_atomic_add_f32 v1, v1, v2, s[0:1] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:  .LBB4_2:
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX90A-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX90A-NEXT:    v_mad_f32 v0, v0, 4.0, s0
 ; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX90A-NEXT:    s_endpgm
 ;
@@ -531,14 +635,29 @@
 ;
 ; GFX11-LABEL: global_atomic_fadd_ret_f32_agent:
 ; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX11-NEXT:    ; implicit-def: $vgpr1
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB4_2
+; GFX11-NEXT:  ; %bb.1:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
+; GFX11-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, s3
+; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mul_f32 v1, 4.0, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_add_f32 v0, v0, v1, s[0:1] glc
+; GFX11-NEXT:    global_atomic_add_f32 v1, v2, v1, s[0:1] glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:  .LBB4_2:
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX11-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX11-NEXT:    v_add_f32_e32 v0, s0, v0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -751,24 +870,44 @@
 define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 {
 ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_mov_b64 s[2:3], exec
+; GCN-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GCN-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GCN-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GCN-NEXT:    s_cbranch_execz .LBB7_2
+; GCN-NEXT:  ; %bb.1:
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GCN-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GCN-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
-; GCN-NEXT:    v_mov_b32_e32 v1, 4.0
+; GCN-NEXT:    v_mul_f32_e32 v1, 4.0, v1
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_wbinvl1_vol
+; GCN-NEXT:  .LBB7_2:
 ; GCN-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget:
 ; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b64 s[2:3], exec
+; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX11-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX11-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX11-NEXT:    s_cbranch_execz .LBB7_2
+; GFX11-NEXT:  ; %bb.1:
 ; GFX11-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX11-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX11-NEXT:    v_mul_f32_e32 v1, 4.0, v1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_wbinvl1_vol
+; GFX11-NEXT:  .LBB7_2:
 ; GFX11-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   ret void
@@ -929,24 +1068,42 @@
 ;
 ; GFX908-LABEL: infer_as_before_atomic:
 ; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_mov_b64 s[2:3], exec
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT:    s_cbranch_execz .LBB9_2
+; GFX908-NEXT:  ; %bb.1:
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 0
-; GFX908-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX908-NEXT:  .LBB9_2:
 ; GFX908-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: infer_as_before_atomic:
 ; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_mov_b64 s[2:3], exec
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT:    s_cbranch_execz .LBB9_2
+; GFX90A-NEXT:  ; %bb.1:
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
-; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX90A-NEXT:  .LBB9_2:
 ; GFX90A-NEXT:    s_endpgm
 ;
 ; GFX10-LABEL: infer_as_before_atomic:
@@ -975,12 +1132,21 @@
 ;
 ; GFX11-LABEL: infer_as_before_atomic:
 ; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_mov_b32 s2, exec_lo
+; GFX11-NEXT:    s_mov_b32 s3, exec_lo
+; GFX11-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX11-NEXT:    s_cbranch_execz .LBB9_2
+; GFX11-NEXT:  ; %bb.1:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
-; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 1.0
+; GFX11-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX11-NEXT:  .LBB9_2:
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify' %s | FileCheck -check-prefix=IR %s
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) #0 {
+; IR-LABEL: @global_atomic_fadd_uni_value(
+; IR-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32>
+; IR-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; IR-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0)
+; IR-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]])
+; IR-NEXT:    [[TMP7:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP1]])
+; IR-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
+; IR-NEXT:    [[TMP9:%.*]] = uitofp i32 [[TMP8]] to float
+; IR-NEXT:    [[TMP10:%.*]] = fmul float 4.000000e+00, [[TMP9]]
+; IR-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[TMP6]], 0
+; IR-NEXT:    br i1 [[TMP11]], label [[TMP12:%.*]], label [[TMP14:%.*]]
+; IR:       12:
+; IR-NEXT:    [[TMP13:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP10]] seq_cst, align 4
+; IR-NEXT:    br label [[TMP14]]
+; IR:       14:
+; IR-NEXT:    ret void
+;
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst
+  ret void
+}
+
+
+define amdgpu_kernel void @global_atomic_fadd_div_value(ptr addrspace(1) %ptr) #0 {
+; IR-LABEL: @global_atomic_fadd_div_value(
+; IR-NEXT:    [[ID_X:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; IR-NEXT:    [[DIVVALUE:%.*]] = bitcast i32 [[ID_X]] to float
+; IR-NEXT:    [[TMP1:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32>
+; IR-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP2]], i32 0
+; IR-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[TMP2]], i32 1
+; IR-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP3]], i32 0)
+; IR-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP4]], i32 [[TMP5]])
+; IR-NEXT:    [[TMP7:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT:    br label [[COMPUTELOOP:%.*]]
+; IR:       8:
+; IR-NEXT:    [[TMP9:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP16:%.*]] seq_cst, align 4
+; IR-NEXT:    br label [[TMP10:%.*]]
+; IR:       10:
+; IR-NEXT:    ret void
+; IR:       ComputeLoop:
+; IR-NEXT:    [[ACCUMULATOR:%.*]] = phi float [ 0.000000e+00, [[TMP0:%.*]] ], [ [[TMP16]], [[COMPUTELOOP]] ]
+; IR-NEXT:    [[ACTIVEBITS:%.*]] = phi i64 [ [[TMP7]], [[TMP0]] ], [ [[TMP19:%.*]], [[COMPUTELOOP]] ]
+; IR-NEXT:    [[TMP11:%.*]] = call i64 @llvm.cttz.i64(i64 [[ACTIVEBITS]], i1 true)
+; IR-NEXT:    [[TMP12:%.*]] = trunc i64 [[TMP11]] to i32
+; IR-NEXT:    [[TMP13:%.*]] = bitcast float [[DIVVALUE]] to i32
+; IR-NEXT:    [[TMP14:%.*]] = call i32 @llvm.amdgcn.readlane(i32 [[TMP13]], i32 [[TMP12]])
+; IR-NEXT:    [[TMP15:%.*]] = bitcast i32 [[TMP14]] to float
+; IR-NEXT:    [[TMP16]] = fadd float [[ACCUMULATOR]], [[TMP15]]
+; IR-NEXT:    [[TMP17:%.*]] = shl i64 1, [[TMP11]]
+; IR-NEXT:    [[TMP18:%.*]] = xor i64 [[TMP17]], -1
+; IR-NEXT:    [[TMP19]] = and i64 [[ACTIVEBITS]], [[TMP18]]
+; IR-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[TMP19]], 0
+; IR-NEXT:    br i1 [[TMP20]], label [[COMPUTEEND:%.*]], label [[COMPUTELOOP]]
+; IR:       ComputeEnd:
+; IR-NEXT:    [[TMP21:%.*]] = icmp eq i32 [[TMP6]], 0
+; IR-NEXT:    br i1 [[TMP21]], label [[TMP8:%.*]], label [[TMP10]]
+;
+  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
+  %divValue = bitcast i32 %id.x to float
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue seq_cst
+  ret void
+}
+
+attributes #0 = {"target-cpu"="gfx906"}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.fadd.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs | FileCheck %s -check-prefix=GCN
-; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GCN
+; RUN: llc < %s -march=amdgcn -mcpu=gfx908 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN
+; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=DPP | FileCheck %s -check-prefix=GCN
 
 declare float @llvm.amdgcn.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i1)
 declare <2 x half> @llvm.amdgcn.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i1)
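
Note on the divergent-value path: the ComputeLoop IR checked in global_atomics_iterative_scan_fp.ll is easier to follow next to a scalar model. The standalone C++ sketch below is not part of the patch; the 8-lane wave, the mask value, and the variable names (LaneValue, ExclScan, Accumulator, ActiveBits) are illustrative stand-ins for the VGPR lanes, the per-lane OldValuePhi result, and the pass's IR values. It computes what the emitted loop computes for a divergent atomicrmw fadd:

// Scalar model of the Iterative strategy's ComputeLoop for atomicrmw fadd.
// Uses GCC/Clang builtins; everything here is an illustrative assumption.
#include <cstdint>
#include <cstdio>

int main() {
  float LaneValue[8] = {1, 2, 3, 4, 5, 6, 7, 8}; // per-lane operand V
  float ExclScan[8] = {};     // per-lane exclusive-scan result (OldValuePhi)
  uint64_t ActiveBits = 0xAF; // llvm.amdgcn.ballot of the active lanes
  float Accumulator = 0.0f;   // FAdd identity chosen by the pass

  while (ActiveBits) {
    // llvm.cttz: elect the lowest remaining active lane.
    int LaneIdx = __builtin_ctzll(ActiveBits);
    // writelane: publish the running total as this lane's exclusive scan.
    ExclScan[LaneIdx] = Accumulator;
    // readlane + fadd: fold this lane's value into the running total.
    Accumulator += LaneValue[LaneIdx];
    // shl/xor/and in the IR: clear the processed lane's bit.
    ActiveBits &= ActiveBits - 1;
  }

  // One elected lane issues the single atomic with the wave total; lane i's
  // atomicrmw result is then the atomic's old value plus ExclScan[i].
  printf("wave total = %g\n", Accumulator);
  return 0;
}

Because amdgcn.readlane and amdgcn.writelane only operate on i32, the patch brackets them with bitcasts when the accumulated type is float, which is why the new code casts V, Accumulator, and OldValuePhi to i32 and casts the results back.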
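
The uniform-value path never builds the loop: when the operand is wave-uniform, the new FAdd/FSub case computes the wave's whole contribution as V times the ballot's population count (and, when the result is needed, each lane's offset as V times its mbcnt rank). A minimal sketch of that arithmetic, again with an illustrative mask value:

// Scalar model of the uniform-value fast path (ctpop + uitofp + fmul).
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Ballot = 0x00FF00FF00FF00FF; // active-lane mask (illustrative)
  float V = 4.0f;                       // uniform operand of atomicrmw fadd
  // Mirrors the new case: NewV = fmul(V, uitofp(ctpop(Ballot)))
  float NewV = V * static_cast<float>(__builtin_popcountll(Ballot));
  printf("single atomic applies %g\n", NewV); // one atomic for the wave
  return 0;
}

This is the shape visible in the updated llc checks: s_bcnt1_i32_b64 of the saved exec mask, v_cvt_f32_ubyte0 to convert the count, and a v_mul_f32 by the uniform value before the lone global_atomic_add_f32.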