diff --git a/clang/test/CodeGenCUDA/fp-atomics-optremarks.cu b/clang/test/CodeGenCUDA/fp-atomics-optremarks.cu new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenCUDA/fp-atomics-optremarks.cu @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 %s -triple=amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: -target-cpu gfx90a -Rpass=atomic-expand -S -o - 2>&1 | \ +// RUN: FileCheck %s --check-prefix=GFX90A-CAS + +// REQUIRES: amdgpu-registered-target + +#include "Inputs/cuda.h" +#include <stdatomic.h> + +// GFX90A-CAS: A hardware CAS instruction generated: if the memory is known to be coarse-grain allocated then a hardware floating-point atomic could be requested +// GFX90A-CAS-LABEL: _Z14atomic_add_casPf +// GFX90A-CAS: flat_atomic_cmpswap v0, v[2:3], v[4:5] glc +// GFX90A-CAS: s_cbranch_execnz +__device__ float atomic_add_cas(float *p) { + return __atomic_fetch_add(p, 1.0f, memory_order_relaxed); +} diff --git a/clang/test/CodeGenOpenCL/fp-atomics-optremarks-gfx90a.cl b/clang/test/CodeGenOpenCL/fp-atomics-optremarks-gfx90a.cl new file mode 100644 --- /dev/null +++ b/clang/test/CodeGenOpenCL/fp-atomics-optremarks-gfx90a.cl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a \ +// RUN: -Rpass=si-lower -munsafe-fp-atomics %s -S -o - 2>&1 \ +// RUN: | FileCheck %s -check-prefix=GFX90A-HW + +// REQUIRES: amdgpu-registered-target + +typedef enum memory_order { + memory_order_relaxed = __ATOMIC_RELAXED, + memory_order_acquire = __ATOMIC_ACQUIRE, + memory_order_release = __ATOMIC_RELEASE, + memory_order_acq_rel = __ATOMIC_ACQ_REL, + memory_order_seq_cst = __ATOMIC_SEQ_CST +} memory_order; + +typedef enum memory_scope { + memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, + memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP, + memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE, + memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES, +#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups) + memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP +#endif +} 
memory_scope; + +// GFX90A-HW: A hardware floating-point atomic instruction generated: only safe if the memory is known to be coarse-grain allocated +// GFX90A-HW-LABEL: test_atomic_add +// GFX90A-HW: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc +float test_atomic_add(__global atomic_double *d, double a) { + return __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group); +} diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -28,6 +28,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/RuntimeLibcalls.h" @@ -2004,7 +2005,9 @@ /// Returns how the IR-level AtomicExpand pass should expand the given /// AtomicRMW, if at all. Default is to never expand. - virtual AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { + virtual AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW, + OptimizationRemarkEmitter *ORE) const { return RMW->isFloatingPointOperation() ? 
AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None; } diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp --- a/llvm/lib/CodeGen/AtomicExpandPass.cpp +++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp @@ -17,6 +17,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/AtomicExpandUtils.h" #include "llvm/CodeGen/RuntimeLibcalls.h" #include "llvm/CodeGen/TargetLowering.h" @@ -58,6 +59,7 @@ class AtomicExpand: public FunctionPass { const TargetLowering *TLI = nullptr; + OptimizationRemarkEmitter *ORE = nullptr; public: static char ID; // Pass identification, replacement for typeid @@ -69,6 +71,9 @@ bool runOnFunction(Function &F) override; private: + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool emitAtomicExpansionRemarks(AtomicRMWInst *RMW, + TargetLowering::AtomicExpansionKind Kind); bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); @@ -165,11 +170,16 @@ Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8; } +void AtomicExpand::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); +} + bool AtomicExpand::runOnFunction(Function &F) { auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); if (!TPC) return false; + ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); auto &TM = TPC->getTM(); if (!TM.getSubtargetImpl(F)->enableAtomicExpand()) return false; @@ -569,8 +579,27 @@ } } +bool AtomicExpand::emitAtomicExpansionRemarks( + AtomicRMWInst *RMW, TargetLowering::AtomicExpansionKind Kind) { + if (Kind == TargetLowering::AtomicExpansionKind::CmpXChg) { + ORE->emit([&]() { + OptimizationRemark Remark(DEBUG_TYPE, "Passed", RMW->getFunction()); + Remark + << "A hardware CAS instruction generated: if the memory is " + "known to be coarse-grain allocated then a hardware floating-point" + " 
atomic could be requested"; + return Remark; + }); + return true; + } + return false; +} + bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) { - switch (TLI->shouldExpandAtomicRMWInIR(AI)) { + TargetLowering::AtomicExpansionKind Kind = + TLI->shouldExpandAtomicRMWInIR(AI, ORE); + emitAtomicExpansionRemarks(AI, Kind); + switch (Kind) { case TargetLoweringBase::AtomicExpansionKind::None: return false; case TargetLoweringBase::AtomicExpansionKind::LLSC: { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -15,6 +15,7 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/TargetLowering.h" @@ -326,7 +327,9 @@ return MVT::i32; } - AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; + AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *, + OptimizationRemarkEmitter *ORE) const override; bool isConstantUnsignedBitfieldExtactLegal(unsigned Opc, LLT Ty1, LLT Ty2) const override; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -18,6 +18,7 @@ #include "AMDGPUMachineFunction.h" #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/IntrinsicsAMDGPU.h" @@ -4835,7 +4836,8 @@ } TargetLowering::AtomicExpansionKind -AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { +AMDGPUTargetLowering::shouldExpandAtomicRMWInIR( + AtomicRMWInst *RMW, OptimizationRemarkEmitter *ORE) const { switch 
(RMW->getOperation()) { case AtomicRMWInst::Nand: case AtomicRMWInst::FAdd: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -451,7 +451,9 @@ const SelectionDAG &DAG, bool SNaN = false, unsigned Depth = 0) const override; - AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override; + AtomicExpansionKind + shouldExpandAtomicRMWInIR(AtomicRMWInst *, + OptimizationRemarkEmitter *ORE) const override; virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -19,6 +19,7 @@ #include "SIRegisterInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" @@ -12113,8 +12114,22 @@ return DenormMode == DenormalMode::getIEEE(); } -TargetLowering::AtomicExpansionKind -SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { +static TargetLowering::AtomicExpansionKind +reportAtomicExpand(TargetLowering::AtomicExpansionKind Kind, + OptimizationRemarkEmitter *ORE, + OptimizationRemark OptRemark) { + ORE->emit([&]() { + OptRemark + << "A hardware floating-point atomic instruction generated: " + "only safe if the memory is known to be coarse-grain allocated"; + return OptRemark; + }); + return Kind; +} + +TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR( + AtomicRMWInst *RMW, OptimizationRemarkEmitter *ORE) const { + OptimizationRemark OptRemark(DEBUG_TYPE, "Passed", RMW->getFunction()); switch (RMW->getOperation()) { case AtomicRMWInst::FAdd: { Type 
*Ty = RMW->getType(); @@ -12149,14 +12164,15 @@ SSID == RMW->getContext().getOrInsertSyncScopeID("one-as")) return AtomicExpansionKind::CmpXChg; - return AtomicExpansionKind::None; + return reportAtomicExpand(AtomicExpansionKind::None, ORE, OptRemark); } if (AS == AMDGPUAS::FLAT_ADDRESS) return AtomicExpansionKind::CmpXChg; - return RMW->use_empty() ? AtomicExpansionKind::None - : AtomicExpansionKind::CmpXChg; + if (RMW->use_empty()) + return reportAtomicExpand(AtomicExpansionKind::None, ORE, OptRemark); + return AtomicExpansionKind::CmpXChg; } // DS FP atomics do repect the denormal mode, but the rounding mode is fixed @@ -12166,12 +12182,15 @@ if (!Ty->isDoubleTy()) return AtomicExpansionKind::None; - return (fpModeMatchesGlobalFPAtomicMode(RMW) || - RMW->getFunction() - ->getFnAttribute("amdgpu-unsafe-fp-atomics") - .getValueAsString() == "true") - ? AtomicExpansionKind::None - : AtomicExpansionKind::CmpXChg; + if (fpModeMatchesGlobalFPAtomicMode(RMW)) + return AtomicExpansionKind::None; + + if (RMW->getFunction() + ->getFnAttribute("amdgpu-unsafe-fp-atomics") + .getValueAsBool()) { + return reportAtomicExpand(AtomicExpansionKind::None, ORE, OptRemark); + } + return AtomicExpansionKind::CmpXChg; } return AtomicExpansionKind::CmpXChg; @@ -12180,7 +12199,7 @@ break; } - return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); + return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW, ORE); } const TargetRegisterClass * diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -14,6 +14,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H #define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/CodeGen/TargetLowering.h" namespace llvm { @@ -1586,7 +1587,8 @@ shouldExpandAtomicLoadInIR(LoadInst *LI) const override; bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override; 
TargetLoweringBase::AtomicExpansionKind - shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override; + shouldExpandAtomicRMWInIR(AtomicRMWInst *AI, + OptimizationRemarkEmitter *ORE) const override; LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -29,6 +29,7 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/Analysis/ObjCARCUtil.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/IntrinsicLowering.h" @@ -48,9 +49,9 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -29170,7 +29171,8 @@ } TargetLowering::AtomicExpansionKind -X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { +X86TargetLowering::shouldExpandAtomicRMWInIR( + AtomicRMWInst *AI, OptimizationRemarkEmitter *ORE) const { unsigned NativeWidth = Subtarget.is64Bit() ? 
64 : 32; Type *MemType = AI->getType(); diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-remarks-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-remarks-gfx90a.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-remarks-gfx90a.ll @@ -0,0 +1,12 @@ +; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs --pass-remarks=atomic-expand \ +; RUN: %s -o - 2>&1 | FileCheck %s --check-prefix=GFX90A-CAS + +; GFX90A-CAS: A hardware CAS instruction generated: if the memory is known to be coarse-grain allocated then a hardware floating-point atomic could be requested +; GFX90A-CAS-LABEL: _Z14atomic_add_casPf: +; GFX90A-CAS: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc +; GFX90A-CAS: s_cbranch_execnz +define dso_local void @_Z14atomic_add_casPf(float* %p) { +entry: + %ret = atomicrmw fadd float* %p, float 7.0 monotonic, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -44,6 +44,11 @@ ; GCN-O0-NEXT: Lower OpenCL enqueued blocks ; GCN-O0-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O0-NEXT: FunctionPass Manager +; GCN-O0-NEXT: Dominator Tree Construction +; GCN-O0-NEXT: Natural Loop Information +; GCN-O0-NEXT: Lazy Branch Probability Analysis +; GCN-O0-NEXT: Lazy Block Frequency Analysis +; GCN-O0-NEXT: Optimization Remark Emitter ; GCN-O0-NEXT: Expand Atomic instructions ; GCN-O0-NEXT: Lower constant intrinsics ; GCN-O0-NEXT: Remove unreachable blocks from the CFG @@ -180,6 +185,11 @@ ; GCN-O1-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Infer address spaces +; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Natural Loop Information +; GCN-O1-NEXT: Lazy Branch Probability Analysis +; GCN-O1-NEXT: Lazy Block Frequency Analysis +; GCN-O1-NEXT: Optimization Remark Emitter ; GCN-O1-NEXT: Expand Atomic instructions ; GCN-O1-NEXT: AMDGPU Promote Alloca ; GCN-O1-NEXT: Dominator 
Tree Construction @@ -431,6 +441,11 @@ ; GCN-O1-OPTS-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Infer address spaces +; GCN-O1-OPTS-NEXT: Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Natural Loop Information +; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis +; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis +; GCN-O1-OPTS-NEXT: Optimization Remark Emitter ; GCN-O1-OPTS-NEXT: Expand Atomic instructions ; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca ; GCN-O1-OPTS-NEXT: Dominator Tree Construction @@ -715,6 +730,11 @@ ; GCN-O2-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Infer address spaces +; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Natural Loop Information +; GCN-O2-NEXT: Lazy Branch Probability Analysis +; GCN-O2-NEXT: Lazy Block Frequency Analysis +; GCN-O2-NEXT: Optimization Remark Emitter ; GCN-O2-NEXT: Expand Atomic instructions ; GCN-O2-NEXT: AMDGPU Promote Alloca ; GCN-O2-NEXT: Dominator Tree Construction @@ -1001,6 +1021,11 @@ ; GCN-O3-NEXT: Lower uses of LDS variables from non-kernel functions ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Infer address spaces +; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Natural Loop Information +; GCN-O3-NEXT: Lazy Branch Probability Analysis +; GCN-O3-NEXT: Lazy Block Frequency Analysis +; GCN-O3-NEXT: Optimization Remark Emitter ; GCN-O3-NEXT: Expand Atomic instructions ; GCN-O3-NEXT: AMDGPU Promote Alloca ; GCN-O3-NEXT: Dominator Tree Construction diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -10,13 +10,18 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module 
Metadata ; CHECK-NEXT: Assumption Cache Tracker -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -16,15 +16,20 @@ ; CHECK-NEXT: Target Pass Configuration ; CHECK-NEXT: Machine Module Information ; CHECK-NEXT: Target Transform Information +; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Type-Based Alias Analysis ; CHECK-NEXT: Scoped NoAlias Alias Analysis ; CHECK-NEXT: Assumption Cache Tracker -; CHECK-NEXT: Profile summary info ; CHECK-NEXT: Create Garbage Collector Module Metadata ; CHECK-NEXT: Machine Branch Probability Analysis ; CHECK-NEXT: ModulePass Manager ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager +; CHECK-NEXT: Dominator Tree Construction +; CHECK-NEXT: Natural Loop Information +; CHECK-NEXT: Lazy Branch Probability Analysis +; CHECK-NEXT: Lazy Block Frequency Analysis +; CHECK-NEXT: Optimization Remark Emitter ; CHECK-NEXT: Expand Atomic instructions ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store