diff --git a/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl b/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl
--- a/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/atomics-remarks-gfx90a.cl
@@ -1,11 +1,19 @@
 // RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \
 // RUN: -Rpass=atomic-expand -S -o - 2>&1 | \
-// RUN: FileCheck %s --check-prefix=REMARK
+// RUN: FileCheck %s --check-prefix=GFX90A-CAS-REMARK
 
 // RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \
 // RUN: -Rpass=atomic-expand -S -emit-llvm -o - 2>&1 | \
 // RUN: FileCheck %s --check-prefix=GFX90A-CAS
 
+// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN: -Rpass=si-lower -munsafe-fp-atomics %s -S -o - 2>&1 | \
+// RUN: FileCheck %s --check-prefix=GFX90A-HW-REMARK
+
+// RUN: %clang_cc1 %s -cl-std=CL2.0 -O0 -triple=amdgcn-amd-amdhsa -target-cpu gfx90a \
+// RUN: -Rpass=si-lower -S -emit-llvm -o - 2>&1 | \
+// RUN: FileCheck %s --check-prefix=GFX90A-HW
+
 // REQUIRES: amdgpu-registered-target
 
 typedef enum memory_order {
@@ -26,10 +34,10 @@
 #endif
 } memory_scope;
 
-// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at workgroup-one-as memory scope [-Rpass=atomic-expand]
-// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope [-Rpass=atomic-expand]
-// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at one-as memory scope [-Rpass=atomic-expand]
-// REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope [-Rpass=atomic-expand]
+// GFX90A-CAS-REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at workgroup-one-as memory scope [-Rpass=atomic-expand]
+// GFX90A-CAS-REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at agent-one-as memory scope [-Rpass=atomic-expand]
+// GFX90A-CAS-REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at one-as memory scope [-Rpass=atomic-expand]
+// GFX90A-CAS-REMARK: remark: A compare and swap loop was generated for an atomic fadd operation at wavefront-one-as memory scope [-Rpass=atomic-expand]
 // GFX90A-CAS-LABEL: @atomic_cas
 // GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("workgroup-one-as") monotonic
 // GFX90A-CAS: atomicrmw fadd float addrspace(1)* {{.*}} syncscope("agent-one-as") monotonic
@@ -41,3 +49,19 @@
   float ret3 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_all_svm_devices);
   float ret4 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_sub_group);
 }
+
+// GFX90A-HW-REMARK: Hardware instruction generated for atomic fadd operation at memory scope workgroup-one-as due to an unsafe request. [-Rpass=si-lower]
+// GFX90A-HW-REMARK: Hardware instruction generated for atomic fadd operation at memory scope agent-one-as due to an unsafe request. [-Rpass=si-lower]
+// GFX90A-HW-REMARK: Hardware instruction generated for atomic fadd operation at memory scope wavefront-one-as due to an unsafe request. [-Rpass=si-lower]
+// GFX90A-HW-REMARK: global_atomic_add_f32 v0, v[0:1], v2, off glc
+// GFX90A-HW-REMARK: global_atomic_add_f32 v0, v[0:1], v2, off glc
+// GFX90A-HW-REMARK: global_atomic_add_f32 v0, v[0:1], v2, off glc
+// GFX90A-HW-LABEL: @atomic_unsafe_hw
+// GFX90A-HW: atomicrmw fadd float addrspace(1)* %{{.*}}, float %{{.*}} syncscope("workgroup-one-as") monotonic, align 4
+// GFX90A-HW: atomicrmw fadd float addrspace(1)* %{{.*}}, float %{{.*}} syncscope("agent-one-as") monotonic, align 4
+// GFX90A-HW: atomicrmw fadd float addrspace(1)* %{{.*}}, float %{{.*}} syncscope("wavefront-one-as") monotonic, align 4
+void atomic_unsafe_hw(__global atomic_float *d, float a) {
+  float ret1 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group);
+  float ret2 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_device);
+  float ret3 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_sub_group);
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -45,6 +45,12 @@
     unsigned &NumIntermediates, MVT &RegisterVT) const override;
 
 private:
+  /// Emit an optimization remark stating that \p RMW is lowered to a hardware
+  /// atomic instruction because of an unsafe request, then return \p Kind
+  /// unchanged so the call can wrap a return value.
+  TargetLowering::AtomicExpansionKind
+  reportUnsafeHWInst(AtomicRMWInst *RMW,
+                     TargetLowering::AtomicExpansionKind Kind) const;
   SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
                                    SDValue Chain, uint64_t Offset) const;
   SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,6 +19,7 @@
 #include "SIRegisterInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -12118,6 +12119,26 @@
   return DenormMode == DenormalMode::getIEEE();
 }
 
+TargetLowering::AtomicExpansionKind SITargetLowering::reportUnsafeHWInst(
+    AtomicRMWInst *RMW, TargetLowering::AtomicExpansionKind Kind) const {
+  OptimizationRemarkEmitter ORE(RMW->getFunction());
+  // Look up the scope name inside the builder so the lookup lives with the
+  // remark text instead of running unconditionally before emit().
+  ORE.emit([&]() {
+    SmallVector<StringRef> SSNs;
+    RMW->getContext().getSyncScopeNames(SSNs);
+    // An empty sync scope name is reported as "system".
+    StringRef MemScope = SSNs[RMW->getSyncScopeID()];
+    if (MemScope.empty())
+      MemScope = "system";
+    return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
+           << "Hardware instruction generated for atomic "
+           << RMW->getOperationName(RMW->getOperation())
+           << " operation at memory scope " << MemScope
+           << " due to an unsafe request.";
+  });
+  return Kind;
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   switch (RMW->getOperation()) {
@@ -12154,14 +12175,15 @@
             SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
           return AtomicExpansionKind::CmpXChg;
 
-        return AtomicExpansionKind::None;
+        return reportUnsafeHWInst(RMW, AtomicExpansionKind::None);
       }
 
       if (AS == AMDGPUAS::FLAT_ADDRESS)
         return AtomicExpansionKind::CmpXChg;
 
-      return RMW->use_empty() ? AtomicExpansionKind::None
-                              : AtomicExpansionKind::CmpXChg;
+      return RMW->use_empty()
+                 ? reportUnsafeHWInst(RMW, AtomicExpansionKind::None)
+                 : AtomicExpansionKind::CmpXChg;
     }
 
     // DS FP atomics do repect the denormal mode, but the rounding mode is fixed
@@ -12171,6 +12193,14 @@
       if (!Ty->isDoubleTy())
         return AtomicExpansionKind::None;
 
+      // Report an unsafe request only when the FP mode does not match and the
+      // function really carries the unsafe attribute; otherwise fall through.
+      if (!fpModeMatchesGlobalFPAtomicMode(RMW) &&
+          RMW->getFunction()
+                  ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+                  .getValueAsString() == "true")
+        return reportUnsafeHWInst(RMW, AtomicExpansionKind::None);
+
       return (fpModeMatchesGlobalFPAtomicMode(RMW) ||
               RMW->getFunction()
                       ->getFnAttribute("amdgpu-unsafe-fp-atomics")
diff --git a/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll
--- a/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics-remarks-gfx90a.ll
@@ -1,6 +1,9 @@
 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=atomic-expand \
 ; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-CAS
 
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=si-lower \
+; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-HW
+
 ; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope
 ; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at agent memory scope
 ; GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at workgroup memory scope
@@ -100,4 +103,97 @@
 entry:
   %ret = atomicrmw fadd float* %p, float %q syncscope("singlethread-one-as") monotonic, align 4
   ret void
-}
\ No newline at end of file
+}
+
+; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope system due to an unsafe request.
+; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope agent due to an unsafe request.
+; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope workgroup due to an unsafe request.
+; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope wavefront due to an unsafe request.
+; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope singlethread due to an unsafe request.
+; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope agent-one-as due to an unsafe request.
+; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope workgroup-one-as due to an unsafe request.
+; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope wavefront-one-as due to an unsafe request.
+; GFX90A-HW: Hardware instruction generated for atomic fadd operation at memory scope singlethread-one-as due to an unsafe request.
+
+; GFX90A-HW-LABEL: atomic_add_unsafe_hw:
+; GFX90A-HW: ds_add_f64 v2, v[0:1]
+; GFX90A-HW: s_endpgm
+define amdgpu_kernel void @atomic_add_unsafe_hw(double addrspace(3)* %ptr) #0 {
+main_body:
+  %ret = atomicrmw fadd double addrspace(3)* %ptr, double 4.0 seq_cst
+  ret void
+}
+
+; GFX90A-HW-LABEL: atomic_add_unsafe_hw_agent:
+; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3]
+; GFX90A-HW: s_endpgm
+define amdgpu_kernel void @atomic_add_unsafe_hw_agent(float addrspace(1)* %ptr, float %val) #0 {
+main_body:
+  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("agent") monotonic, align 4
+  ret void
+}
+
+; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wg:
+; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3]
+; GFX90A-HW: s_endpgm
+define amdgpu_kernel void @atomic_add_unsafe_hw_wg(float addrspace(1)* %ptr, float %val) #0 {
+main_body:
+  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("workgroup") monotonic, align 4
+  ret void
+}
+
+; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wavefront:
+; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3]
+; GFX90A-HW: s_endpgm
+define amdgpu_kernel void @atomic_add_unsafe_hw_wavefront(float addrspace(1)* %ptr, float %val) #0 {
+main_body:
+  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("wavefront") monotonic, align 4
+  ret void
+}
+
+; GFX90A-HW-LABEL: atomic_add_unsafe_hw_single_thread:
+; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3]
+; GFX90A-HW: s_endpgm
+define amdgpu_kernel void @atomic_add_unsafe_hw_single_thread(float addrspace(1)* %ptr, float %val) #0 {
+main_body:
+  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("singlethread") monotonic, align 4
+  ret void
+}
+
+; GFX90A-HW-LABEL: atomic_add_unsafe_hw_aoa:
+; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3]
+; GFX90A-HW: s_endpgm
+define amdgpu_kernel void @atomic_add_unsafe_hw_aoa(float addrspace(1)* %ptr, float %val) #0 {
+main_body:
+  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("agent-one-as") monotonic, align 4
+  ret void
+}
+
+; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wgoa:
+; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3]
+; GFX90A-HW: s_endpgm
+define amdgpu_kernel void @atomic_add_unsafe_hw_wgoa(float addrspace(1)* %ptr, float %val) #0 {
+main_body:
+  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("workgroup-one-as") monotonic, align 4
+  ret void
+}
+
+; GFX90A-HW-LABEL: atomic_add_unsafe_hw_wfoa:
+; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3]
+; GFX90A-HW: s_endpgm
+define amdgpu_kernel void @atomic_add_unsafe_hw_wfoa(float addrspace(1)* %ptr, float %val) #0 {
+main_body:
+  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("wavefront-one-as") monotonic, align 4
+  ret void
+}
+
+; GFX90A-HW-LABEL: atomic_add_unsafe_hw_stoa:
+; GFX90A-HW: global_atomic_add_f32 v0, v1, s[2:3]
+; GFX90A-HW: s_endpgm
+define amdgpu_kernel void @atomic_add_unsafe_hw_stoa(float addrspace(1)* %ptr, float %val) #0 {
+main_body:
+  %ret = atomicrmw fadd float addrspace(1)* %ptr, float %val syncscope("singlethread-one-as") monotonic, align 4
+  ret void
+}
+
+attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }