diff --git a/clang/test/CodeGenCUDA/fp-atomics-optremarks.cu b/clang/test/CodeGenCUDA/fp-atomics-optremarks.cu
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCUDA/fp-atomics-optremarks.cu
@@ -0,0 +1,16 @@
+// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -fcuda-is-device \
+// RUN:   -target-cpu gfx90a -Rpass=.* -S -o - 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=GFX90A-CAS
+
+// REQUIRES: amdgpu-registered-target
+
+#include "Inputs/cuda.h"
+#include <stdatomic.h>
+
+// GFX90A-CAS: An FP atomic instruction was expanded into a CAS loop.
+// GFX90A-CAS-LABEL: _Z14atomic_add_casPf
+// GFX90A-CAS:  flat_atomic_cmpswap v0, v[2:3], v[4:5] glc
+// GFX90A-CAS:  s_cbranch_execnz
+__device__ float atomic_add_cas(float *p) {
+  return __atomic_fetch_add(p, 1.0f, memory_order_relaxed);
+}
diff --git a/clang/test/CodeGenOpenCL/fp-atomics-optremarks-gfx90a.cl b/clang/test/CodeGenOpenCL/fp-atomics-optremarks-gfx90a.cl
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/fp-atomics-optremarks-gfx90a.cl
@@ -0,0 +1,28 @@
+// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN:   -Rpass=si-lower -munsafe-fp-atomics %s -S -o - 2>&1 \
+// RUN:   | FileCheck %s -check-prefix=GFX90A-HW
+
+typedef enum memory_order {
+  memory_order_relaxed = __ATOMIC_RELAXED,
+  memory_order_acquire = __ATOMIC_ACQUIRE,
+  memory_order_release = __ATOMIC_RELEASE,
+  memory_order_acq_rel = __ATOMIC_ACQ_REL,
+  memory_order_seq_cst = __ATOMIC_SEQ_CST
+} memory_order;
+
+typedef enum memory_scope {
+  memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
+  memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
+  memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
+  memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
+#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
+  memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
+#endif
+} memory_scope;
+
+// GFX90A-HW: An unsafe hardware instruction was generated.
+// GFX90A-HW-LABEL: test_atomic_add
+// GFX90A-HW: global_atomic_add_f64
+double test_atomic_add(global atomic_double *d, double a) {
+  return __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group);
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,6 +19,7 @@
 #include "SIRegisterInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -12117,6 +12118,27 @@
   return DenormMode == DenormalMode::getIEEE();
 }
 
+static TargetLowering::AtomicExpansionKind
+atomicExpandReturn(AtomicRMWInst *RMW,
+                   TargetLowering::AtomicExpansionKind Kind, bool UnsafeFlag) {
+  // Emit the remark with a stack-local emitter, then pass Kind through.
+  OptimizationRemarkEmitter ORE(RMW->getFunction());
+  if (Kind == TargetLowering::AtomicExpansionKind::CmpXChg) {
+    ORE.emit([&]() {
+      OptimizationRemark Remark(DEBUG_TYPE, "Passed", RMW->getFunction());
+      Remark << "An FP atomic instruction was expanded into a CAS loop.";
+      return Remark;
+    });
+  } else if (Kind == TargetLowering::AtomicExpansionKind::None && UnsafeFlag) {
+    ORE.emit([&]() {
+      OptimizationRemark Remark(DEBUG_TYPE, "Passed", RMW->getFunction());
+      Remark << "An unsafe hardware instruction was generated.";
+      return Remark;
+    });
+  }
+  return Kind;
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   switch (RMW->getOperation()) {
@@ -12132,35 +12154,41 @@
       return AtomicExpansionKind::CmpXChg;
 
     unsigned AS = RMW->getPointerAddressSpace();
-
+    bool UnsafeFPAtomicFlag = RMW->getFunction()
+                                  ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+                                  .getValueAsBool();
     if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
         Subtarget->hasAtomicFaddInsts()) {
       // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
       // floating point atomic instructions. May generate more efficient code,
       // but may not respect rounding and denormal modes, and may give incorrect
       // results for certain memory destinations.
-      if (RMW->getFunction()
-              ->getFnAttribute("amdgpu-unsafe-fp-atomics")
-              .getValueAsString() != "true")
-        return AtomicExpansionKind::CmpXChg;
+      if (!UnsafeFPAtomicFlag)
+        return atomicExpandReturn(RMW, AtomicExpansionKind::CmpXChg,
+                                  UnsafeFPAtomicFlag);
 
       if (Subtarget->hasGFX90AInsts()) {
         if (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS)
-          return AtomicExpansionKind::CmpXChg;
+          return atomicExpandReturn(RMW, AtomicExpansionKind::CmpXChg,
+                                    UnsafeFPAtomicFlag);
 
         auto SSID = RMW->getSyncScopeID();
         if (SSID == SyncScope::System ||
             SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
-          return AtomicExpansionKind::CmpXChg;
+          return atomicExpandReturn(RMW, AtomicExpansionKind::CmpXChg,
+                                    UnsafeFPAtomicFlag);
 
-        return AtomicExpansionKind::None;
+        return atomicExpandReturn(RMW, AtomicExpansionKind::None,
+                                  UnsafeFPAtomicFlag);
       }
 
       if (AS == AMDGPUAS::FLAT_ADDRESS)
-        return AtomicExpansionKind::CmpXChg;
+        return atomicExpandReturn(RMW, AtomicExpansionKind::CmpXChg,
+                                  UnsafeFPAtomicFlag);
 
-      return RMW->use_empty() ? AtomicExpansionKind::None
-                              : AtomicExpansionKind::CmpXChg;
+      auto Kind = RMW->use_empty() ? AtomicExpansionKind::None
+                                   : AtomicExpansionKind::CmpXChg;
+      return atomicExpandReturn(RMW, Kind, UnsafeFPAtomicFlag);
     }
 
     // DS FP atomics do respect the denormal mode, but the rounding mode is fixed
@@ -12168,17 +12196,17 @@
     // The only exception is DS_ADD_F64 which never flushes regardless of mode.
     if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) {
       if (!Ty->isDoubleTy())
-        return AtomicExpansionKind::None;
+        return atomicExpandReturn(RMW, AtomicExpansionKind::None,
+                                  UnsafeFPAtomicFlag);
 
-      return (fpModeMatchesGlobalFPAtomicMode(RMW) ||
-              RMW->getFunction()
-                  ->getFnAttribute("amdgpu-unsafe-fp-atomics")
-                  .getValueAsString() == "true")
-                 ? AtomicExpansionKind::None
-                 : AtomicExpansionKind::CmpXChg;
+      auto Kind = (fpModeMatchesGlobalFPAtomicMode(RMW) || UnsafeFPAtomicFlag)
+                      ? AtomicExpansionKind::None
+                      : AtomicExpansionKind::CmpXChg;
+      return atomicExpandReturn(RMW, Kind, UnsafeFPAtomicFlag);
     }
 
-    return AtomicExpansionKind::CmpXChg;
+    return atomicExpandReturn(RMW, AtomicExpansionKind::CmpXChg,
+                              UnsafeFPAtomicFlag);
   }
   default:
     break;
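
Note for reviewers: when AtomicExpansionKind::CmpXChg is returned here, AtomicExpandPass later rewrites the atomicrmw fadd into a compare-exchange loop; the s_cbranch_execnz the CUDA test checks for is the backedge of that loop. A minimal C++ sketch of the equivalent loop, with std::atomic standing in for the hardware CAS (the function name and parameters are illustrative, not part of this patch):

#include <atomic>

// The shape of the CAS loop that AtomicExpandPass emits for an FP atomic add:
// compute the sum against the most recently observed value, retry until the
// compare-exchange succeeds.
float atomic_fadd_via_cas(std::atomic<float> *P, float V) {
  float Old = P->load(std::memory_order_relaxed);
  float New;
  do {
    New = Old + V; // a failed CAS reloads Old, so the sum is recomputed
  } while (!P->compare_exchange_weak(Old, New, std::memory_order_relaxed));
  return Old; // like atomicrmw fadd, yields the value before the add
}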