diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -17,6 +17,8 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/AtomicExpandUtils.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/CodeGen/TargetLowering.h"
@@ -58,6 +60,7 @@
 
   class AtomicExpand: public FunctionPass {
     const TargetLowering *TLI = nullptr;
+    OptimizationRemarkEmitter *ORE = nullptr;
 
   public:
     static char ID; // Pass identification, replacement for typeid
@@ -69,6 +72,7 @@
     bool runOnFunction(Function &F) override;
 
   private:
+    void getAnalysisUsage(AnalysisUsage &AU) const override;
     bool bracketInstWithFences(Instruction *I, AtomicOrdering Order);
     IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
     LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
@@ -165,11 +169,18 @@
          Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
 }
 
+/// Require the optimization remark emitter; runOnFunction() fetches it so
+/// that expansions can be reported through remarks.
+void AtomicExpand::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+}
+
 bool AtomicExpand::runOnFunction(Function &F) {
   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
   if (!TPC)
     return false;
 
+  ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
   auto &TM = TPC->getTM();
   if (!TM.getSubtargetImpl(F)->enableAtomicExpand())
     return false;
@@ -570,7 +581,10 @@
 }
 
 bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
-  switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
+  LLVMContext &Ctx = AI->getModule()->getContext();
+  TargetLowering::AtomicExpansionKind Kind =
+      TLI->shouldExpandAtomicRMWInIR(AI);
+  switch (Kind) {
   case TargetLoweringBase::AtomicExpansionKind::None:
     return false;
   case TargetLoweringBase::AtomicExpansionKind::LLSC: {
@@ -600,6 +614,20 @@
       expandPartwordAtomicRMW(AI,
                               TargetLoweringBase::AtomicExpansionKind::CmpXChg);
     } else {
+      // Report that this atomicrmw becomes a compare-and-swap loop, naming
+      // the operation and its sync scope. A scope whose registered name is
+      // empty is reported as "system".
+      SmallVector<StringRef> SSNs;
+      Ctx.getSyncScopeNames(SSNs);
+      auto MemScope = SSNs[AI->getSyncScopeID()].empty()
+                          ? "system"
+                          : SSNs[AI->getSyncScopeID()];
+      ORE->emit([&]() {
+        return OptimizationRemark(DEBUG_TYPE, "Passed", AI)
+               << "A compare and swap loop was generated for an "
+               << AI->getOperationName(AI->getOperation()) << " operation at "
+               << MemScope << " memory scope";
+      });
       expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
     }
     return true;
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-remarks-gfx90a.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-remarks-gfx90a.ll
@@ -0,0 +1,103 @@
+; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=atomic-expand \
+; RUN:      %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-CAS
+
+; GFX90A-CAS: A compare and swap loop was generated for an fadd operation at system memory scope
+; GFX90A-CAS: A compare and swap loop was generated for an fadd operation at agent memory scope
+; GFX90A-CAS: A compare and swap loop was generated for an fadd operation at workgroup memory scope
+; GFX90A-CAS: A compare and swap loop was generated for an fadd operation at wavefront memory scope
+; GFX90A-CAS: A compare and swap loop was generated for an fadd operation at singlethread memory scope
+; GFX90A-CAS: A compare and swap loop was generated for an fadd operation at one-as memory scope
+; GFX90A-CAS: A compare and swap loop was generated for an fadd operation at agent-one-as memory scope
+; GFX90A-CAS: A compare and swap loop was generated for an fadd operation at workgroup-one-as memory scope
+; GFX90A-CAS: A compare and swap loop was generated for an fadd operation at wavefront-one-as memory scope
+; GFX90A-CAS: A compare and swap loop was generated for an fadd operation at singlethread-one-as memory scope
+
+; GFX90A-CAS-LABEL: atomic_add_cas:
+; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-CAS: s_cbranch_execnz
+define dso_local void @atomic_add_cas(float* %p, float %q) {
+entry:
+  %ret = atomicrmw fadd float* %p, float %q monotonic, align 4
+  ret void
+}
+
+; GFX90A-CAS-LABEL: atomic_add_cas_agent:
+; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-CAS: s_cbranch_execnz
+define dso_local void @atomic_add_cas_agent(float* %p, float %q) {
+entry:
+  %ret = atomicrmw fadd float* %p, float %q syncscope("agent") monotonic, align 4
+  ret void
+}
+
+; GFX90A-CAS-LABEL: atomic_add_cas_workgroup:
+; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-CAS: s_cbranch_execnz
+define dso_local void @atomic_add_cas_workgroup(float* %p, float %q) {
+entry:
+  %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup") monotonic, align 4
+  ret void
+}
+
+; GFX90A-CAS-LABEL: atomic_add_cas_wavefront:
+; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-CAS: s_cbranch_execnz
+define dso_local void @atomic_add_cas_wavefront(float* %p, float %q) {
+entry:
+  %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront") monotonic, align 4
+  ret void
+}
+
+; GFX90A-CAS-LABEL: atomic_add_cas_singlethread:
+; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-CAS: s_cbranch_execnz
+define dso_local void @atomic_add_cas_singlethread(float* %p, float %q) {
+entry:
+  %ret = atomicrmw fadd float* %p, float %q syncscope("singlethread") monotonic, align 4
+  ret void
+}
+
+; GFX90A-CAS-LABEL: atomic_add_cas_one_as:
+; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-CAS: s_cbranch_execnz
+define dso_local void @atomic_add_cas_one_as(float* %p, float %q) {
+entry:
+  %ret = atomicrmw fadd float* %p, float %q syncscope("one-as") monotonic, align 4
+  ret void
+}
+
+; GFX90A-CAS-LABEL: atomic_add_cas_agent_one_as:
+; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-CAS: s_cbranch_execnz
+define dso_local void @atomic_add_cas_agent_one_as(float* %p, float %q) {
+entry:
+  %ret = atomicrmw fadd float* %p, float %q syncscope("agent-one-as") monotonic, align 4
+  ret void
+}
+
+; GFX90A-CAS-LABEL: atomic_add_cas_workgroup_one_as:
+; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-CAS: s_cbranch_execnz
+define dso_local void @atomic_add_cas_workgroup_one_as(float* %p, float %q) {
+entry:
+  %ret = atomicrmw fadd float* %p, float %q syncscope("workgroup-one-as") monotonic, align 4
+  ret void
+}
+
+; GFX90A-CAS-LABEL: atomic_add_cas_wavefront_one_as:
+; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-CAS: s_cbranch_execnz
+define dso_local void @atomic_add_cas_wavefront_one_as(float* %p, float %q) {
+entry:
+  %ret = atomicrmw fadd float* %p, float %q syncscope("wavefront-one-as") monotonic, align 4
+  ret void
+}
+
+; GFX90A-CAS-LABEL: atomic_add_cas_singlethread_one_as:
+; GFX90A-CAS: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-CAS: s_cbranch_execnz
+define dso_local void @atomic_add_cas_singlethread_one_as(float* %p, float %q) {
+entry:
+  %ret = atomicrmw fadd float* %p, float %q syncscope("singlethread-one-as") monotonic, align 4
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -45,6 +45,11 @@
 ; GCN-O0-NEXT:    Lower OpenCL enqueued blocks
 ; GCN-O0-NEXT:    Lower uses of LDS variables from non-kernel functions
 ; GCN-O0-NEXT:    FunctionPass Manager
+; GCN-O0-NEXT:      Dominator Tree Construction
+; GCN-O0-NEXT:      Natural Loop Information
+; GCN-O0-NEXT:      Lazy Branch Probability Analysis
+; GCN-O0-NEXT:      Lazy Block Frequency Analysis
+; GCN-O0-NEXT:      Optimization Remark Emitter
 ; GCN-O0-NEXT:      Expand Atomic instructions
 ; GCN-O0-NEXT:      Lower constant intrinsics
 ; GCN-O0-NEXT:      Remove unreachable blocks from the CFG
@@ -182,6 +187,11 @@
 ; GCN-O1-NEXT:    Lower uses of LDS variables from non-kernel functions
 ; GCN-O1-NEXT:    FunctionPass Manager
 ; GCN-O1-NEXT:      Infer address spaces
+; GCN-O1-NEXT:      Dominator Tree Construction
+; GCN-O1-NEXT:      Natural Loop Information
+; GCN-O1-NEXT:      Lazy Branch Probability Analysis
+; GCN-O1-NEXT:      Lazy Block Frequency Analysis
+; GCN-O1-NEXT:      Optimization Remark Emitter
 ; GCN-O1-NEXT:      Expand Atomic instructions
 ; GCN-O1-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-NEXT:      Dominator Tree Construction
@@ -434,6 +444,11 @@
 ; GCN-O1-OPTS-NEXT:    Lower uses of LDS variables from non-kernel functions
 ; GCN-O1-OPTS-NEXT:    FunctionPass Manager
 ; GCN-O1-OPTS-NEXT:      Infer address spaces
+; GCN-O1-OPTS-NEXT:      Dominator Tree Construction
+; GCN-O1-OPTS-NEXT:      Natural Loop Information
+; GCN-O1-OPTS-NEXT:      Lazy Branch Probability Analysis
+; GCN-O1-OPTS-NEXT:      Lazy Block Frequency Analysis
+; GCN-O1-OPTS-NEXT:      Optimization Remark Emitter
 ; GCN-O1-OPTS-NEXT:      Expand Atomic instructions
 ; GCN-O1-OPTS-NEXT:      AMDGPU Promote Alloca
 ; GCN-O1-OPTS-NEXT:      Dominator Tree Construction
@@ -719,6 +734,11 @@
 ; GCN-O2-NEXT:    Lower uses of LDS variables from non-kernel functions
 ; GCN-O2-NEXT:    FunctionPass Manager
 ; GCN-O2-NEXT:      Infer address spaces
+; GCN-O2-NEXT:      Dominator Tree Construction
+; GCN-O2-NEXT:      Natural Loop Information
+; GCN-O2-NEXT:      Lazy Branch Probability Analysis
+; GCN-O2-NEXT:      Lazy Block Frequency Analysis
+; GCN-O2-NEXT:      Optimization Remark Emitter
 ; GCN-O2-NEXT:      Expand Atomic instructions
 ; GCN-O2-NEXT:      AMDGPU Promote Alloca
 ; GCN-O2-NEXT:      Dominator Tree Construction
@@ -1006,6 +1026,11 @@
 ; GCN-O3-NEXT:    Lower uses of LDS variables from non-kernel functions
 ; GCN-O3-NEXT:    FunctionPass Manager
 ; GCN-O3-NEXT:      Infer address spaces
+; GCN-O3-NEXT:      Dominator Tree Construction
+; GCN-O3-NEXT:      Natural Loop Information
+; GCN-O3-NEXT:      Lazy Branch Probability Analysis
+; GCN-O3-NEXT:      Lazy Block Frequency Analysis
+; GCN-O3-NEXT:      Optimization Remark Emitter
 ; GCN-O3-NEXT:      Expand Atomic instructions
 ; GCN-O3-NEXT:      AMDGPU Promote Alloca
 ; GCN-O3-NEXT:      Dominator Tree Construction
diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll
--- a/llvm/test/CodeGen/X86/O0-pipeline.ll
+++ b/llvm/test/CodeGen/X86/O0-pipeline.ll
@@ -10,13 +10,18 @@
 ; CHECK-NEXT: Target Pass Configuration
 ; CHECK-NEXT: Machine Module Information
 ; CHECK-NEXT: Target Transform Information
+; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Assumption Cache Tracker
-; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Expand Atomic instructions
 ; CHECK-NEXT:       Lower AMX intrinsics
 ; CHECK-NEXT:       Lower AMX type for load/store
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -16,15 +16,20 @@
 ; CHECK-NEXT: Target Pass Configuration
 ; CHECK-NEXT: Machine Module Information
 ; CHECK-NEXT: Target Transform Information
+; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Type-Based Alias Analysis
 ; CHECK-NEXT: Scoped NoAlias Alias Analysis
 ; CHECK-NEXT: Assumption Cache Tracker
-; CHECK-NEXT: Profile summary info
 ; CHECK-NEXT: Create Garbage Collector Module Metadata
 ; CHECK-NEXT: Machine Branch Probability Analysis
 ; CHECK-NEXT:   ModulePass Manager
 ; CHECK-NEXT:     Pre-ISel Intrinsic Lowering
 ; CHECK-NEXT:     FunctionPass Manager
+; CHECK-NEXT:       Dominator Tree Construction
+; CHECK-NEXT:       Natural Loop Information
+; CHECK-NEXT:       Lazy Branch Probability Analysis
+; CHECK-NEXT:       Lazy Block Frequency Analysis
+; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Expand Atomic instructions
 ; CHECK-NEXT:       Lower AMX intrinsics
 ; CHECK-NEXT:       Lower AMX type for load/store