Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -133,6 +133,9 @@
 void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
 extern char &AMDGPUUseNativeCallsID;
 
+void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
+extern char &AMDGPUPerfHintAnalysisID;
+
 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -31,6 +31,7 @@
 class AMDGPUTargetStreamer;
 class MCOperand;
+class SIMachineFunctionInfo;
 class SISubtarget;
 
 class AMDGPUAsmPrinter final : public AsmPrinter {
@@ -145,7 +146,8 @@
   void emitCommonFunctionComments(uint32_t NumVGPR,
                                   uint32_t NumSGPR,
                                   uint64_t ScratchSize,
-                                  uint64_t CodeSize);
+                                  uint64_t CodeSize,
+                                  const SIMachineFunctionInfo *MFI);
 
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -281,11 +281,14 @@
     uint32_t NumVGPR,
     uint32_t NumSGPR,
     uint64_t ScratchSize,
-    uint64_t CodeSize) {
+    uint64_t CodeSize,
+    const SIMachineFunctionInfo *MFI) {
   OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
   OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
   OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
   OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
+  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
+                              false);
 }
 
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -340,6 +343,8 @@
     OutStreamer->SwitchSection(CommentSection);
 
     if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+      const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
       if (!MFI->isEntryFunction()) {
         OutStreamer->emitRawComment(" Function info:", false);
         SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
@@ -347,7 +352,7 @@
           Info.NumVGPR,
           Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
           Info.PrivateSegmentSize,
-          getFunctionCodeSize(MF));
+          getFunctionCodeSize(MF), MFI);
         return false;
       }
 
@@ -355,7 +360,7 @@
       emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
                                  CurrentProgramInfo.NumSGPR,
                                  CurrentProgramInfo.ScratchSize,
-                                 getFunctionCodeSize(MF));
+                                 getFunctionCodeSize(MF), MFI);
 
       OutStreamer->emitRawComment(
         " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -384,6 +389,9 @@
         " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),
         false);
 
+      OutStreamer->emitRawComment(
+        " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
+
       if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
         OutStreamer->emitRawComment(
           " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -16,6 +16,7 @@
 #include "AMDGPUArgumentUsageInfo.h"
 #include "AMDGPUISelLowering.h" // For AMDGPUISD
 #include "AMDGPUInstrInfo.h"
+#include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPURegisterInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
@@ -84,6 +85,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AMDGPUArgumentUsageInfo>();
+    AU.addRequired<AMDGPUPerfHintAnalysis>();
     AU.addRequired<DivergenceAnalysis>();
     SelectionDAGISel::getAnalysisUsage(AU);
   }
@@ -241,6 +243,7 @@
 
 INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel",
                       "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
 INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
+INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
 INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel",
                     "AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
Index: lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -0,0 +1,55 @@
+//===-- AMDGPUPerfHintAnalysis.h - Attach performance hints to functions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes if a function is potentially memory bound and if a kernel
+/// may benefit from limiting the number of waves to reduce cache thrashing.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+struct AMDGPUPerfHintAnalysis : public FunctionPass {
+  static char ID;
+
+public:
+  AMDGPUPerfHintAnalysis() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+
+  bool isMemoryBound(const Function *F) const;
+
+  bool needsWaveLimiter(const Function *F) const;
+
+  struct FuncInfo {
+    unsigned MemInstCount;
+    unsigned InstCount;
+    unsigned IAMInstCount; // Indirect access memory instruction count
+    unsigned LSMInstCount; // Large stride memory instruction count
+    FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0),
+                 LSMInstCount(0) {}
+  };
+
+  typedef ValueMap<const Function *, FuncInfo> FuncInfoMap;
+
+private:
+
+  FuncInfoMap FIM;
+};
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUPERFHINTANALYSIS_H
Index: lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -0,0 +1,401 @@
+//===- AMDGPUPerfHintAnalysis.cpp - attach performance hints to functions -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Analyzes if a function is potentially memory bound and if a kernel
+/// may benefit from limiting the number of waves to reduce cache thrashing.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUPerfHintAnalysis.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/CommandLine.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-perf-hint"
+
+static cl::opt<float> MemBoundThresh("amdgpu-membound-thresh",
+                                     cl::init(50),
+                                     cl::Hidden,
+                                     cl::value_desc("fp value"),
+                                     cl::desc("Function mem bound threshold"));
+
+static cl::opt<float> LimitWaveThresh("amdgpu-limit-wave-thresh",
+                                      cl::init(50),
+                                      cl::Hidden,
+                                      cl::value_desc("fp value"),
+                                      cl::desc("Kernel limit wave threshold"));
+
+static cl::opt<float> IAWeight("amdgpu-indirect-access-weight",
+                               cl::init(1000),
+                               cl::Hidden,
+                               cl::value_desc("fp value"),
+                               cl::desc("Indirect access memory instruction weight"));
+
+static cl::opt<float> LSWeight("amdgpu-large-stride-weight",
+                               cl::init(1000),
+                               cl::Hidden,
+                               cl::value_desc("fp value"),
+                               cl::desc("Large stride memory access weight"));
+
+static cl::opt<unsigned> LargeStrideThresh("amdgpu-large-stride-thresh",
+                                           cl::init(64),
+                                           cl::Hidden,
+                                           cl::value_desc("int value"),
+                                           cl::desc("Large stride memory access threshold"));
+
+STATISTIC(NumMemBound, "Number of functions marked as memory bound");
+STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
+
+char llvm::AMDGPUPerfHintAnalysis::ID = 0;
+char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID;
+
+INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE,
+                "Analysis if a function is memory bound", true, true)
+
+namespace {
+
+struct AMDGPUPerfHint {
+  friend AMDGPUPerfHintAnalysis;
+
+public:
+  AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_)
+      : FIM(FIM_), DL(nullptr) {}
+
+  void runOnFunction(Function &F);
+
+private:
+
+  struct MemAccessInfo {
+    const Value *V;
+    const Value *Base;
+    int64_t Offset;
+    MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
+    bool isLargeStride(MemAccessInfo &Reference) const;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    Printable print() const {
+      return Printable([this](raw_ostream &OS) {
+        OS << "Value: " << *V << '\n'
+           << "Base: " << *Base << " Offset: " << Offset << '\n';
+      });
+    }
+#endif
+  };
+
+  MemAccessInfo makeMemAccessInfo(Instruction *) const;
+
+  MemAccessInfo LastAccess; // Last memory access info
+
+  AMDGPUPerfHintAnalysis::FuncInfoMap &FIM;
+
+  const DataLayout *DL;
+  AMDGPUAS AS;
+
+  void visit(const Function &F);
+  static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F);
+  static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F);
+
+  bool isIndirectAccess(const Instruction *Inst) const;
+
+  /// Check if the instruction is large stride.
+  /// The purpose is to identify memory access pattern like:
+  /// x = a[i];
+  /// y = a[i+1000];
+  /// z = a[i+2000];
+  /// In the above example, the second and third memory access will be marked
+  /// as large stride memory access.
+  bool isLargeStride(const Instruction *Inst);
+
+  bool isGlobalAddr(const Value *V) const;
+  bool isLocalAddr(const Value *V) const;
+  bool isConstantAddr(const Value *V) const;
+};
+
+static const Value *getMemoryInstrPtr(const Instruction *Inst) {
+  if (auto LI = dyn_cast<LoadInst>(Inst)) {
+    return LI->getPointerOperand();
+  }
+  if (auto SI = dyn_cast<StoreInst>(Inst)) {
+    return SI->getPointerOperand();
+  }
+  if (auto AI = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+    return AI->getPointerOperand();
+  }
+  if (auto AI = dyn_cast<AtomicRMWInst>(Inst)) {
+    return AI->getPointerOperand();
+  }
+  if (auto MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
+    return MI->getRawDest();
+  }
+
+  return nullptr;
+}
+
+bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
+  LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
+  DenseSet<const Value *> WorkSet;
+  DenseSet<const Value *> Visited;
+  if (const Value *MO = getMemoryInstrPtr(Inst)) {
+    if (isGlobalAddr(MO))
+      WorkSet.insert(MO);
+  }
+
+  while (!WorkSet.empty()) {
+    const Value *V = *WorkSet.begin();
+    WorkSet.erase(WorkSet.begin());
+    if (!Visited.insert(V).second)
+      continue;
+    LLVM_DEBUG(dbgs() << " check: " << *V << '\n');
+
+    if (auto LD = dyn_cast<LoadInst>(V)) {
+      auto M = LD->getPointerOperand();
+      if (isGlobalAddr(M) ||
+          isLocalAddr(M) ||
+          isConstantAddr(M)) {
+        LLVM_DEBUG(dbgs() << " is IA\n");
+        return true;
+      }
+      continue;
+    }
+
+    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
+      auto P = GEP->getPointerOperand();
+      WorkSet.insert(P);
+      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
+        WorkSet.insert(GEP->getOperand(I));
+      continue;
+    }
+
+    if (auto U = dyn_cast<UnaryInstruction>(V)) {
+      WorkSet.insert(U->getOperand(0));
+      continue;
+    }
+
+    if (auto BO = dyn_cast<BinaryOperator>(V)) {
+      WorkSet.insert(BO->getOperand(0));
+      WorkSet.insert(BO->getOperand(1));
+      continue;
+    }
+
+    if (auto S = dyn_cast<SelectInst>(V)) {
+      WorkSet.insert(S->getFalseValue());
+      WorkSet.insert(S->getTrueValue());
+      continue;
+    }
+
+    if (auto E = dyn_cast<ExtractElementInst>(V)) {
+      WorkSet.insert(E->getVectorOperand());
+      continue;
+    }
+
+    if (auto Phi = dyn_cast<PHINode>(V)) {
+      for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
+        WorkSet.insert(Phi->getIncomingValue(I));
+      continue;
+    }
+
+    LLVM_DEBUG(dbgs() << " dropped\n");
+  }
+
+  LLVM_DEBUG(dbgs() << " is not IA\n");
+  return false;
+}
+
+void AMDGPUPerfHint::visit(const Function &F) {
+  auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo()));
+  if (!FIP.second)
+    return;
+
+  AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second;
+
+  LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');
+
+  for (auto &B : F) {
+    LastAccess = MemAccessInfo();
+    for (auto &I : B) {
+      if (getMemoryInstrPtr(&I)) {
+        if (isIndirectAccess(&I))
+          ++FI.IAMInstCount;
+        if (isLargeStride(&I))
+          ++FI.LSMInstCount;
+        ++FI.MemInstCount;
+        ++FI.InstCount;
+      } else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
+        Function *Callee = CI->getCalledFunction();
+        if (!Callee || Callee->isDeclaration()) {
+          ++FI.InstCount;
+          continue;
+        }
+        if (&F == Callee) // Handle immediate recursion
+          continue;
+
+        visit(*Callee);
+
+        AMDGPUPerfHintAnalysis::FuncInfoMap::iterator Loc = FIM.find(Callee);
+        assert(Loc != FIM.end() && "No func info");
+        FI.MemInstCount += Loc->second.MemInstCount;
+        FI.InstCount += Loc->second.InstCount;
+        FI.IAMInstCount += Loc->second.IAMInstCount;
+        FI.LSMInstCount += Loc->second.LSMInstCount;
+      } else if (const auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+        APInt Off(DL->getIndexSizeInBits(GEP->getPointerAddressSpace()), 0);
+        if (GEP->accumulateConstantOffset(*DL, Off)) {
+          if (Off.isIntN(12))
+            // Offset will likely be folded into load or store
+            continue;
+        }
+        ++FI.InstCount;
+      } else {
+        ++FI.InstCount;
+      }
+    }
+  }
+}
+
+void AMDGPUPerfHint::runOnFunction(Function &F) {
+  if (FIM.find(&F) != FIM.end())
+    return;
+
+  const Module &M = *F.getParent();
+  DL = &M.getDataLayout();
+  AS = AMDGPU::getAMDGPUAS(M);
+
+  visit(F);
+
+  AMDGPUPerfHintAnalysis::FuncInfoMap::iterator Loc = FIM.find(&F);
+  assert(Loc != FIM.end() && "No func info");
+  LLVM_DEBUG(dbgs() << F.getName()
+                    << " MemInst: " << Loc->second.MemInstCount << '\n'
+                    << " IAMInst: " << Loc->second.IAMInstCount << '\n'
+                    << " LSMInst: " << Loc->second.LSMInstCount << '\n'
+                    << " TotalInst: " << Loc->second.InstCount << '\n');
+
+  auto &FI = Loc->second;
+
+  if (isMemBound(FI)) {
+    LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
+    NumMemBound++;
+  }
+
+  if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) {
+    LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n");
+    NumLimitWave++;
+  }
+}
+
+bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+  return static_cast<double>(FI.MemInstCount) / FI.InstCount * 100 >
+         MemBoundThresh;
+}
+
+bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+  return static_cast<double>(FI.MemInstCount +
+                             FI.IAMInstCount * IAWeight +
+                             FI.LSMInstCount * LSWeight)
+         / FI.InstCount * 100 > LimitWaveThresh;
+}
+
+bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
+  if (auto PT = dyn_cast<PointerType>(V->getType())) {
+    unsigned As = PT->getAddressSpace();
+    // Flat likely points to global too.
+    return As == AS.GLOBAL_ADDRESS || As == AS.FLAT_ADDRESS;
+  }
+  return false;
+}
+
+bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
+  if (auto PT = dyn_cast<PointerType>(V->getType()))
+    return PT->getAddressSpace() == AS.LOCAL_ADDRESS;
+  return false;
+}
+
+bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
+  LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');
+
+  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
+  bool IsLargeStride = MAI.isLargeStride(LastAccess);
+  if (MAI.Base)
+    LastAccess = std::move(MAI);
+
+  return IsLargeStride;
+}
+
+AMDGPUPerfHint::MemAccessInfo
+AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
+  MemAccessInfo MAI;
+  const Value *MO = getMemoryInstrPtr(Inst);
+
+  LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
+  // Do not treat local-addr memory access as large stride.
+  if (isLocalAddr(MO))
+    return MAI;
+
+  MAI.V = MO;
+  MAI.Base = llvm::GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
+  return MAI;
+}
+
+bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
+  if (auto PT = dyn_cast<PointerType>(V->getType())) {
+    unsigned As = PT->getAddressSpace();
+    return As == AS.CONSTANT_ADDRESS || As == AS.CONSTANT_ADDRESS_32BIT;
+  }
+  return false;
+}
+
+bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
+    MemAccessInfo &Reference) const {
+  if (!Base || !Reference.Base || Base != Reference.Base)
+    return false;
+
+  uint64_t Diff = Offset > Reference.Offset ?
+      Offset - Reference.Offset : Reference.Offset - Offset;
+  bool Result = Diff > LargeStrideThresh;
+  LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n"
+                    << print() << "<=>\n"
+                    << Reference.print() << "Result:" << Result << '\n');
+  return Result;
+}
+} // namespace
+
+bool AMDGPUPerfHintAnalysis::runOnFunction(Function &F) {
+  AMDGPUPerfHint Analyzer(FIM);
+  Analyzer.runOnFunction(F);
+  return false;
+}
+
+bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const {
+  auto FI = FIM.find(F);
+  if (FI == FIM.end())
+    return false;
+
+  return AMDGPUPerfHint::isMemBound(FI->second);
+}
+
+bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const {
+  auto FI = FIM.find(F);
+  if (FI == FIM.end())
+    return false;
+
+  return AMDGPUPerfHint::needLimitWave(FI->second);
+}
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -57,6 +57,7 @@
   AMDGPUUnifyDivergentExitNodes.cpp
   AMDGPUUnifyMetadata.cpp
   AMDGPUInline.cpp
+  AMDGPUPerfHintAnalysis.cpp
   AMDILCFGStructurizer.cpp
   GCNHazardRecognizer.cpp
   GCNIterativeScheduler.cpp
Index: lib/Target/AMDGPU/GCNSchedStrategy.cpp
===================================================================
--- lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -372,13 +372,23 @@
   // We could not keep current target occupancy because of the just scheduled
   // region. Record new occupancy for next scheduling cycle.
   unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+  // Allow memory bound functions to drop to 4 waves if not limited by an
+  // attribute.
+  unsigned MinMemBoundWaves = std::max(MFI.getMinWavesPerEU(), 4u);
+  if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
+      WavesAfter >= MinMemBoundWaves &&
+      (MFI.isMemoryBound() || MFI.needsWaveLimiter())) {
+    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
+                      << MinMemBoundWaves << " waves\n");
+    NewOccupancy = WavesAfter;
+  }
   if (NewOccupancy < MinOccupancy) {
     MinOccupancy = NewOccupancy;
     LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                       << MinOccupancy << ".\n");
   }
 
-  if (WavesAfter >= WavesBefore) {
+  if (WavesAfter >= MinOccupancy) {
     Pressure[RegionIdx] = PressureAfter;
     return;
   }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -178,6 +178,12 @@
   // user arguments. This is an offset from the KernargSegmentPtr.
   bool ImplicitArgPtr : 1;
 
+  // Function may be memory bound.
+  bool MemoryBound : 1;
+
+  // Kernel may need limited waves per EU for better performance.
+  bool WaveLimiter : 1;
+
   // The hard-wired high half of the address of the global information table
   // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since
   // current hardware only allows a 16 bit value.
@@ -389,6 +395,14 @@
     return ImplicitBufferPtr;
   }
 
+  bool isMemoryBound() const {
+    return MemoryBound;
+  }
+
+  bool needsWaveLimiter() const {
+    return WaveLimiter;
+  }
+
   AMDGPUFunctionArgInfo &getArgInfo() {
     return ArgInfo;
   }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -9,6 +9,7 @@
 
 #include "SIMachineFunctionInfo.h"
 #include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUPerfHintAnalysis.h"
 #include "AMDGPUSubtarget.h"
 #include "SIRegisterInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
@@ -16,6 +17,7 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Function.h"
@@ -47,6 +49,8 @@
     WorkItemIDZ(false),
     ImplicitBufferPtr(false),
     ImplicitArgPtr(false),
+    MemoryBound(false),
+    WaveLimiter(false),
     GITPtrHigh(0xffffffff),
     HighBitsOf32BitAddress(0) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -170,6 +174,14 @@
   S = A.getValueAsString();
   if (!S.empty())
     S.consumeInteger(0, HighBitsOf32BitAddress);
+
+  if (auto Resolver = MF.getMMI().getResolver()) {
+    if (AMDGPUPerfHintAnalysis *PHA = static_cast<AMDGPUPerfHintAnalysis *>(
+          Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) {
+      MemoryBound = PHA->isMemoryBound(&MF.getFunction());
+      WaveLimiter = PHA->needsWaveLimiter(&MF.getFunction());
+    }
+  }
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
Index: test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
===================================================================
--- test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 
 @var = addrspace(1) global float 0.0
 
@@ -17,9 +17,7 @@
 ; CHECK:   KernargSegmentAlign: 8
 ; CHECK:   WavefrontSize:       64
 ; CHECK:   NumSGPRs:            6
-; GFX700:  NumVGPRs:            4
-; GFX803:  NumVGPRs:            6
-; GFX900:  NumVGPRs:            6
+; CHECK:   NumVGPRs:            3
 ; CHECK:   MaxFlatWorkGroupSize: 256
 define amdgpu_kernel void @test(
     half addrspace(1)* %r,
Index: test/CodeGen/AMDGPU/perfhint.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/perfhint.ll
@@ -0,0 +1,85 @@
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_membound:
+; GCN: MemoryBound: 1
+; GCN: WaveLimiterHint : 1
+define amdgpu_kernel void @test_membound(<4 x i32> addrspace(1)* nocapture readonly %arg, <4 x i32> addrspace(1)* nocapture %arg1) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp2 = zext i32 %tmp to i64
+  %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp2
+  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 16
+  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp2
+  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp5, align 16
+  %tmp6 = add nuw nsw i64 %tmp2, 1
+  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp6
+  %tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
+  %tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
+  store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
+  %tmp10 = add nuw nsw i64 %tmp2, 2
+  %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10
+  %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16
+  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10
+  store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16
+  %tmp14 = add nuw nsw i64 %tmp2, 3
+  %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14
+  %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16
+  %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14
+  store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_large_stride:
+; GCN: MemoryBound: 0
+; GCN: WaveLimiterHint : 1
+define amdgpu_kernel void @test_large_stride(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4096
+  %tmp1 = load i32, i32 addrspace(1)* %tmp, align 4
+  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  store i32 %tmp1, i32 addrspace(1)* %tmp2, align 4
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8192
+  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %tmp4, i32 addrspace(1)* %tmp5, align 4
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 12288
+  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  store i32 %tmp7, i32 addrspace(1)* %tmp8, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_indirect:
+; GCN: MemoryBound: 0
+; GCN: WaveLimiterHint : 1
+define amdgpu_kernel void @test_indirect(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  %tmp3 = bitcast i32 addrspace(1)* %arg to <4 x i32> addrspace(1)*
+  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
+  %tmp5 = extractelement <4 x i32> %tmp4, i32 0
+  %tmp6 = sext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
+  %tmp8 = load i32, i32 addrspace(1)* %tmp7, align 4
+  store i32 %tmp8, i32 addrspace(1)* %arg, align 4
+  %tmp9 = extractelement <4 x i32> %tmp4, i32 1
+  %tmp10 = sext i32 %tmp9 to i64
+  %tmp11 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp10
+  %tmp12 = load i32, i32 addrspace(1)* %tmp11, align 4
+  store i32 %tmp12, i32 addrspace(1)* %tmp, align 4
+  %tmp13 = extractelement <4 x i32> %tmp4, i32 2
+  %tmp14 = sext i32 %tmp13 to i64
+  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp14
+  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
+  store i32 %tmp16, i32 addrspace(1)* %tmp1, align 4
+  %tmp17 = extractelement <4 x i32> %tmp4, i32 3
+  %tmp18 = sext i32 %tmp17 to i64
+  %tmp19 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp18
+  %tmp20 = load i32, i32 addrspace(1)* %tmp19, align 4
+  store i32 %tmp20, i32 addrspace(1)* %tmp2, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()