diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -41,7 +41,11 @@
   unsigned InstCost;
   unsigned IAMInstCost; // Indirect access memory instruction count
   unsigned LSMInstCost; // Large stride memory instruction count
-  FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {}
+  bool HasDenseGlobalMemAcc; // Set if at least 1 basic block has relatively
+                             // high global memory access
+  FuncInfo()
+      : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0),
+        HasDenseGlobalMemAcc(false) {}
 };
 
 typedef ValueMap<const Function *, FuncInfo> FuncInfoMap;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -116,6 +116,7 @@
   bool isGlobalAddr(const Value *V) const;
   bool isLocalAddr(const Value *V) const;
+  bool isGlobalLoadUsedInBB(const Instruction &) const;
 };
 
 static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
@@ -196,6 +197,24 @@
   return false;
 }
 
+// Returns true if the global load `I` is used in its own basic block.
+bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
+  const auto *Ld = dyn_cast<LoadInst>(&I);
+  if (!Ld)
+    return false;
+  if (!isGlobalAddr(Ld->getPointerOperand()))
+    return false;
+
+  for (const User *Usr : Ld->users()) {
+    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
+      if (UsrInst->getParent() == I.getParent())
+        return true;
+    }
+  }
+
+  return false;
+}
+
 AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
   AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];
 
@@ -203,9 +222,14 @@
   for (auto &B : F) {
     LastAccess = MemAccessInfo();
+    unsigned UsedGlobalLoadsInBB = 0;
     for (auto &I : B) {
       if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
         unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
+        // TODO: Check if the global load and its user are close to each other
+        // instead (Or do this analysis in GCNSchedStrategy?).
+        if (isGlobalLoadUsedInBB(I))
+          UsedGlobalLoadsInBB += Size;
         if (isIndirectAccess(&I))
           FI.IAMInstCost += Size;
         if (isLargeStride(&I))
@@ -245,6 +269,16 @@
         ++FI.InstCost;
       }
     }
+
+    if (!FI.HasDenseGlobalMemAcc) {
+      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
+      if (GlobalMemAccPercentage > 50) {
+        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
+                          << B.getName() << " has " << GlobalMemAccPercentage
+                          << "% global memory access\n");
+        FI.HasDenseGlobalMemAcc = true;
+      }
+    }
   }
 
   return &FI;
 }
@@ -286,6 +320,11 @@
 }
 
 bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+  // Reverting optimal scheduling in favour of occupancy with basic block(s)
+  // having dense global memory access can potentially hurt performance.
+  if (FI.HasDenseGlobalMemAcc)
+    return true;
+
   return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/perfhint.ll b/llvm/test/CodeGen/AMDGPU/perfhint.ll
--- a/llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ b/llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -20,7 +20,7 @@
 }
 
 ; GCN-LABEL: {{^}}test_membound_1:
-; GCN: MemoryBound: 0
+; GCN: MemoryBound: 1
 define amdgpu_kernel void @test_membound_1(<2 x double> addrspace(1)* nocapture readonly %ptr.0, <2 x double> addrspace(1)* nocapture %ptr.1, <2 x double> %arg.0, i32 %arg.1, <4 x double> %arg.2) {
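
A minimal sketch of the kind of block the new heuristic flags (the kernel name and body below are hypothetical, not part of perfhint.ll). The <2 x double> global load contributes divideCeil(128, 32) = 4 to UsedGlobalLoadsInBB because its user (the fadd) sits in the same basic block, and the block holds 4 instructions in total, so GlobalMemAccPercentage = 4 * 100 / 4 = 100 > 50: HasDenseGlobalMemAcc is set and isMemBound() now returns true regardless of the MemInstCost/InstCost ratio.

; Hypothetical example, not from the patch's test suite.
define amdgpu_kernel void @dense_global_acc(<2 x double> addrspace(1)* %in,
                                            <2 x double> addrspace(1)* %out) {
  %v = load <2 x double>, <2 x double> addrspace(1)* %in  ; global load, used below
  %s = fadd <2 x double> %v, %v                           ; same-block user of %v
  store <2 x double> %s, <2 x double> addrspace(1)* %out
  ret void
}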