Index: llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -14,11 +14,8 @@ #include "AMDGPU.h" #include "Utils/AMDGPUBaseInfo.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SetVector.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/IR/InstVisitor.h" #include "llvm/InitializePasses.h" @@ -31,8 +28,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass, public InstVisitor { LegacyDivergenceAnalysis *DA; - MemoryDependenceResults *MDR; - LoopInfo *LI; + MemorySSA *MSSA; DenseMap noClobberClones; bool isEntryFunc; @@ -47,8 +43,7 @@ } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); - AU.addRequired(); - AU.addRequired(); + AU.addRequired(); AU.setPreservesAll(); } @@ -62,8 +57,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE, "Add AMDGPU uniform metadata", false, false) INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis) -INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE, "Add AMDGPU uniform metadata", false, false) @@ -77,37 +71,8 @@ } bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) { - // 1. get Loop for the Load->getparent(); - // 2. if it exists, collect all the BBs from the most outer - // loop and check for the writes. If NOT - start DFS over all preds. - // 3. Start DFS over all preds from the most outer loop header. - SetVector Checklist; - BasicBlock *Start = Load->getParent(); - Checklist.insert(Start); - const Value *Ptr = Load->getPointerOperand(); - const Loop *L = LI->getLoopFor(Start); - if (L) { - const Loop *P = L; - do { - L = P; - P = P->getParentLoop(); - } while (P); - Checklist.insert(L->block_begin(), L->block_end()); - Start = L->getHeader(); - } - - Checklist.insert(idf_begin(Start), idf_end(Start)); - for (auto &BB : Checklist) { - BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ? - BasicBlock::iterator(Load) : BB->end(); - auto Q = MDR->getPointerDependencyFrom( - MemoryLocation::getBeforeOrAfter(Ptr), true, StartIt, BB, Load); - if (Q.isClobber() || Q.isUnknown() || - // Store defines the load and thus clobbers it. - (Q.isDef() && Q.getInst()->mayWriteToMemory())) - return true; - } - return false; + const MemoryAccess *MA = MSSA->getWalker()->getClobberingMemoryAccess(Load); + return !MSSA->isLiveOnEntryDef(MA); } void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) { @@ -172,9 +137,8 @@ if (skipFunction(F)) return false; - DA = &getAnalysis(); - MDR = &getAnalysis().getMemDep(); - LI = &getAnalysis().getLoopInfo(); + DA = &getAnalysis(); + MSSA = &getAnalysis().getMSSA(); isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv()); visit(F); Index: llvm/test/CodeGen/AMDGPU/coalescer_remat.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/coalescer_remat.ll +++ llvm/test/CodeGen/AMDGPU/coalescer_remat.ll @@ -12,8 +12,8 @@ ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0 ; It's probably OK if this is slightly higher: -; CHECK: ; NumVgprs: 4 -define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) { +; CHECK: ; NumVgprs: 8 +define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* noalias %out, <4 x float> addrspace(1)* noalias %in, i32 %flag) { entry: %cmpflag = icmp eq i32 %flag, 1 br i1 %cmpflag, label %loop, label %exit Index: llvm/test/CodeGen/AMDGPU/global-smrd-unknown.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-smrd-unknown.ll +++ llvm/test/CodeGen/AMDGPU/global-smrd-unknown.ll @@ -4,7 +4,7 @@ ; GCN: flat_load_dword ; GCN: flat_load_dword ; GCN: flat_store_dword -define amdgpu_kernel void @unknown_memdep_analysis(float addrspace(1)* nocapture readonly %arg, float %arg1) #0 { +define void @unknown_memdep_analysis(float addrspace(1)* nocapture readonly %arg, float %arg1) #0 { bb: %tmp53 = load float, float addrspace(1)* undef, align 4 %tmp54 = getelementptr inbounds float, float addrspace(1)* %arg, i32 31 Index: llvm/test/CodeGen/AMDGPU/llc-pipeline.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -95,9 +95,8 @@ ; GCN-O0-NEXT: Code sinking ; GCN-O0-NEXT: Post-Dominator Tree Construction ; GCN-O0-NEXT: Legacy Divergence Analysis -; GCN-O0-NEXT: Phi Values Analysis ; GCN-O0-NEXT: Function Alias Analysis Results -; GCN-O0-NEXT: Memory Dependence Analysis +; GCN-O0-NEXT: Memory SSA ; GCN-O0-NEXT: AMDGPU Annotate Uniform Values ; GCN-O0-NEXT: SI annotate control flow ; GCN-O0-NEXT: Natural Loop Information @@ -275,9 +274,8 @@ ; GCN-O1-NEXT: Code sinking ; GCN-O1-NEXT: Post-Dominator Tree Construction ; GCN-O1-NEXT: Legacy Divergence Analysis -; GCN-O1-NEXT: Phi Values Analysis ; GCN-O1-NEXT: Function Alias Analysis Results -; GCN-O1-NEXT: Memory Dependence Analysis +; GCN-O1-NEXT: Memory SSA ; GCN-O1-NEXT: AMDGPU Annotate Uniform Values ; GCN-O1-NEXT: SI annotate control flow ; GCN-O1-NEXT: Natural Loop Information @@ -550,9 +548,8 @@ ; GCN-O1-OPTS-NEXT: Code sinking ; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction ; GCN-O1-OPTS-NEXT: Legacy Divergence Analysis -; GCN-O1-OPTS-NEXT: Phi Values Analysis ; GCN-O1-OPTS-NEXT: Function Alias Analysis Results -; GCN-O1-OPTS-NEXT: Memory Dependence Analysis +; GCN-O1-OPTS-NEXT: Memory SSA ; GCN-O1-OPTS-NEXT: AMDGPU Annotate Uniform Values ; GCN-O1-OPTS-NEXT: SI annotate control flow ; GCN-O1-OPTS-NEXT: Natural Loop Information @@ -833,9 +830,8 @@ ; GCN-O2-NEXT: Code sinking ; GCN-O2-NEXT: Post-Dominator Tree Construction ; GCN-O2-NEXT: Legacy Divergence Analysis -; GCN-O2-NEXT: Phi Values Analysis ; GCN-O2-NEXT: Function Alias Analysis Results -; GCN-O2-NEXT: Memory Dependence Analysis +; GCN-O2-NEXT: Memory SSA ; GCN-O2-NEXT: AMDGPU Annotate Uniform Values ; GCN-O2-NEXT: SI annotate control flow ; GCN-O2-NEXT: Natural Loop Information @@ -1129,9 +1125,8 @@ ; GCN-O3-NEXT: Code sinking ; GCN-O3-NEXT: Post-Dominator Tree Construction ; GCN-O3-NEXT: Legacy Divergence Analysis -; GCN-O3-NEXT: Phi Values Analysis ; GCN-O3-NEXT: Function Alias Analysis Results -; GCN-O3-NEXT: Memory Dependence Analysis +; GCN-O3-NEXT: Memory SSA ; GCN-O3-NEXT: AMDGPU Annotate Uniform Values ; GCN-O3-NEXT: SI annotate control flow ; GCN-O3-NEXT: Natural Loop Information Index: llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll +++ llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll @@ -1,11 +1,9 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s | FileCheck %s ; FIXME: The wide loads and bundles introduce so much spilling. -define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrspace(4)* %wei_ptr, float addrspace(1)* %out_ptr) { +define protected amdgpu_kernel void @excess_soft_clause_reg_pressure(float addrspace(4)* %wei_ptr, float addrspace(1)* %out_ptr, float addrspace(1)* %in) { ; CHECK-LABEL: excess_soft_clause_reg_pressure: ; CHECK: BB0_1: ; %for.cond28.preheader -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 ; CHECK: global_load_dword ; CHECK-NEXT: global_load_dword ; CHECK-NEXT: global_load_dword @@ -14,6 +12,7 @@ ; CHECK: s_load_dwordx16 ; CHECK-NEXT: s_load_dwordx16 ; CHECK-NEXT: s_load_dwordx16 +; CHECK-NEXT: s_load_dwordx16 ; CHECK: v_writelane_b32 ; CHECK-NEXT: v_writelane_b32 @@ -32,7 +31,6 @@ ; CHECK-NEXT: v_writelane_b32 ; CHECK-NEXT: v_writelane_b32 ; CHECK-NEXT: s_load_dwordx16 -; CHECK-NEXT: s_load_dwordx16 ; CHECK: v_writelane_b32 ; CHECK-NEXT: v_writelane_b32 @@ -50,7 +48,6 @@ ; CHECK-NEXT: v_writelane_b32 ; CHECK-NEXT: v_writelane_b32 ; CHECK-NEXT: v_writelane_b32 -; CHECK-NEXT: s_load_dwordx16 ; CHECK: v_readlane_b32 ; CHECK-NEXT: v_readlane_b32 @@ -70,8 +67,10 @@ ; CHECK-NEXT: v_readlane_b32 ; CHECK: s_load_dwordx16 -; CHECK-NEXT: s_load_dwordx16 -; CHECK-NEXT: v_readlane_b32 +; CHECK: s_load_dwordx16 +; CHECK: s_load_dwordx16 + +; CHECK: v_readlane_b32 ; CHECK-NEXT: v_readlane_b32 ; CHECK-NEXT: v_readlane_b32 ; CHECK-NEXT: v_readlane_b32 @@ -100,6 +99,7 @@ %conv.frozen = freeze i32 %conv %div = udiv i32 %conv.frozen, 49 %add.ptr22 = getelementptr inbounds float, float addrspace(4)* %wei_ptr, i64 undef + %in.ptr1 = getelementptr inbounds float, float addrspace(1)* %in, i32 %i5 br label %for.cond28.preheader for.cond28.preheader: ; preds = %for.cond28.preheader, %entry @@ -135,7 +135,7 @@ %accum.sroa.118.0 = phi float [ 0.000000e+00, %entry ], [ %i259, %for.cond28.preheader ] %accum.sroa.122.0 = phi float [ 0.000000e+00, %entry ], [ %i263, %for.cond28.preheader ] %accum.sroa.126.0 = phi float [ 0.000000e+00, %entry ], [ %i267, %for.cond28.preheader ] - %i_ptr.0288 = phi float addrspace(1)* [ undef, %entry ], [ %add.ptr47.3, %for.cond28.preheader ] + %i_ptr.0288 = phi float addrspace(1)* [ %in.ptr1, %entry ], [ %add.ptr47.3, %for.cond28.preheader ] %w_ptr.0287 = phi float addrspace(4)* [ %add.ptr22, %entry ], [ %add.ptr74, %for.cond28.preheader ] %ci.0286 = phi i32 [ 0, %entry ], [ %inc116, %for.cond28.preheader ] %i8 = load float, float addrspace(1)* %i_ptr.0288, align 4