Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h @@ -136,6 +136,9 @@ void initializeAMDGPUUseNativeCallsPass(PassRegistry &); extern char &AMDGPUUseNativeCallsID; +void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &); +extern char &AMDGPUPerfHintAnalysisID; + // Passes common to R600 and SI FunctionPass *createAMDGPUPromoteAlloca(); void initializeAMDGPUPromoteAllocaPass(PassRegistry&); Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -29,6 +29,7 @@ namespace llvm { +class AMDGPUMachineFunction; class AMDGPUTargetStreamer; class MCOperand; class SISubtarget; @@ -144,7 +145,8 @@ void emitCommonFunctionComments(uint32_t NumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, - uint64_t CodeSize); + uint64_t CodeSize, + const AMDGPUMachineFunction* MFI); public: explicit AMDGPUAsmPrinter(TargetMachine &TM, Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -278,11 +278,14 @@ uint32_t NumVGPR, uint32_t NumSGPR, uint64_t ScratchSize, - uint64_t CodeSize) { + uint64_t CodeSize, + const AMDGPUMachineFunction *MFI) { OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); + OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()), + false); } bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { @@ -339,7 +342,7 @@ Info.NumVGPR, Info.getTotalNumSGPRs(MF.getSubtarget()), Info.PrivateSegmentSize, - getFunctionCodeSize(MF)); + getFunctionCodeSize(MF), MFI); return false; } @@ -347,7 +350,7 @@ emitCommonFunctionComments(CurrentProgramInfo.NumVGPR, CurrentProgramInfo.NumSGPR, CurrentProgramInfo.ScratchSize, - getFunctionCodeSize(MF)); + getFunctionCodeSize(MF), MFI); OutStreamer->emitRawComment( " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); @@ -376,6 +379,9 @@ " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount), false); + OutStreamer->emitRawComment( + " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false); + if (MF.getSubtarget().debuggerEmitPrologue()) { OutStreamer->emitRawComment( " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -16,6 +16,7 @@ #include "AMDGPUArgumentUsageInfo.h" #include "AMDGPUISelLowering.h" // For AMDGPUISD #include "AMDGPUInstrInfo.h" +#include "AMDGPUPerfHintAnalysis.h" #include "AMDGPURegisterInfo.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" @@ -85,6 +86,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); AU.addRequired(); SelectionDAGISel::getAnalysisUsage(AU); } @@ -242,6 +244,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "isel", "AMDGPU DAG->DAG Pattern Instruction 
Selection", false, false) INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo) +INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis) INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "isel", "AMDGPU DAG->DAG Pattern Instruction Selection", false, false) Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -36,6 +36,12 @@ bool NoSignedZerosFPMath; + // Function may be memory bound. + bool MemoryBound; + + // Kernel may need limited waves per EU for better performance. + bool WaveLimiter; + public: AMDGPUMachineFunction(const MachineFunction &MF); @@ -78,6 +84,14 @@ return NoSignedZerosFPMath; } + bool isMemoryBound() const { + return MemoryBound; + } + + bool needsWaveLimiter() const { + return WaveLimiter; + } + unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV); }; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -9,6 +9,8 @@ #include "AMDGPUMachineFunction.h" #include "AMDGPUSubtarget.h" +#include "AMDGPUPerfHintAnalysis.h" +#include "llvm/CodeGen/MachineModuleInfo.h" using namespace llvm; @@ -20,9 +22,19 @@ LDSSize(0), ABIArgOffset(0), IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), - NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) { + NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath), + MemoryBound(false), + WaveLimiter(false) { // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset, // except reserved size is not correctly aligned. + + if (auto *Resolver = MF.getMMI().getResolver()) { + if (AMDGPUPerfHintAnalysis *PHA = static_cast( + Resolver->getAnalysisIfAvailable(&AMDGPUPerfHintAnalysisID, true))) { + MemoryBound = PHA->isMemoryBound(&MF.getFunction()); + WaveLimiter = PHA->needsWaveLimiter(&MF.getFunction()); + } + } } unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL, Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h @@ -0,0 +1,55 @@ +//===- AMDGPUPerfHintAnalysis.h - analysis of functions memory traffic ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Analyzes if a function potentially memory bound and if a kernel +/// kernel may benefit from limiting number of waves to reduce cache thrashing. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H +#define LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H +#include "llvm/IR/ValueMap.h" +#include "llvm/Pass.h" + +namespace llvm { + +struct AMDGPUPerfHintAnalysis : public FunctionPass { + static char ID; + +public: + AMDGPUPerfHintAnalysis() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + } + + bool isMemoryBound(const Function *F) const; + + bool needsWaveLimiter(const Function *F) const; + + struct FuncInfo { + unsigned MemInstCount; + unsigned InstCount; + unsigned IAMInstCount; // Indirect access memory instruction count + unsigned LSMInstCount; // Large stride memory instruction count + FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0), + LSMInstCount(0) {} + }; + + typedef ValueMap<const Function*, FuncInfo> FuncInfoMap; + +private: + + FuncInfoMap FIM; +}; +} // namespace llvm +#endif // LLVM_LIB_TARGET_AMDGPU_MDGPUPERFHINTANALYSIS_H Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp @@ -0,0 +1,404 @@ +//===- AMDGPUPerfHintAnalysis.cpp - analysis of functions memory traffic --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Analyzes if a function is potentially memory bound and if a kernel +/// may benefit from limiting the number of waves to reduce cache thrashing. 
+/// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUPerfHintAnalysis.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-perf-hint" + +static cl::opt + MemBoundThresh("amdgpu-membound-threshold", cl::init(50), cl::Hidden, + cl::desc("Function mem bound threshold in %")); + +static cl::opt + LimitWaveThresh("amdgpu-limit-wave-threshold", cl::init(50), cl::Hidden, + cl::desc("Kernel limit wave threshold in %")); + +static cl::opt + IAWeight("amdgpu-indirect-access-weight", cl::init(1000), cl::Hidden, + cl::desc("Indirect access memory instruction weight")); + +static cl::opt + LSWeight("amdgpu-large-stride-weight", cl::init(1000), cl::Hidden, + cl::desc("Large stride memory access weight")); + +static cl::opt + LargeStrideThresh("amdgpu-large-stride-threshold", cl::init(64), cl::Hidden, + cl::desc("Large stride memory access threshold")); + +STATISTIC(NumMemBound, "Number of functions marked as memory bound"); +STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave"); + +char llvm::AMDGPUPerfHintAnalysis::ID = 0; +char &llvm::AMDGPUPerfHintAnalysisID = AMDGPUPerfHintAnalysis::ID; + +INITIALIZE_PASS(AMDGPUPerfHintAnalysis, DEBUG_TYPE, + "Analysis if a function is memory bound", true, true) + +namespace { + +struct AMDGPUPerfHint { + friend AMDGPUPerfHintAnalysis; + +public: + AMDGPUPerfHint(AMDGPUPerfHintAnalysis::FuncInfoMap &FIM_, + const TargetLowering *TLI_) + : FIM(FIM_), DL(nullptr), TLI(TLI_) {} + + void runOnFunction(Function &F); + +private: + struct MemAccessInfo { + const Value *V; + const Value *Base; + int64_t Offset; + MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {} + bool isLargeStride(MemAccessInfo &Reference) const; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + Printable print() const { + return Printable([this](raw_ostream &OS) { + OS << "Value: " << *V << '\n' + << "Base: " << *Base << " Offset: " << Offset << '\n'; + }); + } +#endif + }; + + MemAccessInfo makeMemAccessInfo(Instruction *) const; + + MemAccessInfo LastAccess; // Last memory access info + + AMDGPUPerfHintAnalysis::FuncInfoMap &FIM; + + const DataLayout *DL; + + AMDGPUAS AS; + + const TargetLowering *TLI; + + AMDGPUPerfHintAnalysis::FuncInfoMap::iterator visit(const Function &F); + static bool isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &F); + static bool needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &F); + + bool isIndirectAccess(const Instruction *Inst) const; + + /// Check if the instruction is large stride. + /// The purpose is to identify memory access pattern like: + /// x = a[i]; + /// y = a[i+1000]; + /// z = a[i+2000]; + /// In the above example, the second and third memory access will be marked + /// large stride memory access. 
+ bool isLargeStride(const Instruction *Inst); + + bool isGlobalAddr(const Value *V) const; + bool isLocalAddr(const Value *V) const; + bool isConstantAddr(const Value *V) const; +}; + +static const Value *getMemoryInstrPtr(const Instruction *Inst) { + if (auto LI = dyn_cast(Inst)) { + return LI->getPointerOperand(); + } + if (auto SI = dyn_cast(Inst)) { + return SI->getPointerOperand(); + } + if (auto AI = dyn_cast(Inst)) { + return AI->getPointerOperand(); + } + if (auto AI = dyn_cast(Inst)) { + return AI->getPointerOperand(); + } + if (auto MI = dyn_cast(Inst)) { + return MI->getRawDest(); + } + + return nullptr; +} + +bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const { + LLVM_DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n'); + SmallSet WorkSet; + SmallSet Visited; + if (const Value *MO = getMemoryInstrPtr(Inst)) { + if (isGlobalAddr(MO)) + WorkSet.insert(MO); + } + + while (!WorkSet.empty()) { + const Value *V = *WorkSet.begin(); + WorkSet.erase(*WorkSet.begin()); + if (!Visited.insert(V).second) + continue; + LLVM_DEBUG(dbgs() << " check: " << *V << '\n'); + + if (auto LD = dyn_cast(V)) { + auto M = LD->getPointerOperand(); + if (isGlobalAddr(M) || isLocalAddr(M) || isConstantAddr(M)) { + LLVM_DEBUG(dbgs() << " is IA\n"); + return true; + } + continue; + } + + if (auto GEP = dyn_cast(V)) { + auto P = GEP->getPointerOperand(); + WorkSet.insert(P); + for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I) + WorkSet.insert(GEP->getOperand(I)); + continue; + } + + if (auto U = dyn_cast(V)) { + WorkSet.insert(U->getOperand(0)); + continue; + } + + if (auto BO = dyn_cast(V)) { + WorkSet.insert(BO->getOperand(0)); + WorkSet.insert(BO->getOperand(1)); + continue; + } + + if (auto S = dyn_cast(V)) { + WorkSet.insert(S->getFalseValue()); + WorkSet.insert(S->getTrueValue()); + continue; + } + + if (auto E = dyn_cast(V)) { + WorkSet.insert(E->getVectorOperand()); + continue; + } + + if (auto Phi = dyn_cast(V)) { + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) + WorkSet.insert(Phi->getIncomingValue(I)); + continue; + } + + LLVM_DEBUG(dbgs() << " dropped\n"); + } + + LLVM_DEBUG(dbgs() << " is not IA\n"); + return false; +} + +AMDGPUPerfHintAnalysis::FuncInfoMap::iterator +AMDGPUPerfHint::visit(const Function &F) { + auto FIP = FIM.insert(std::make_pair(&F, AMDGPUPerfHintAnalysis::FuncInfo())); + if (!FIP.second) + return FIP.first; + + AMDGPUPerfHintAnalysis::FuncInfo &FI = FIP.first->second; + + LLVM_DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n'); + + for (auto &B : F) { + LastAccess = MemAccessInfo(); + for (auto &I : B) { + if (getMemoryInstrPtr(&I)) { + if (isIndirectAccess(&I)) + ++FI.IAMInstCount; + if (isLargeStride(&I)) + ++FI.LSMInstCount; + ++FI.MemInstCount; + ++FI.InstCount; + continue; + } + CallSite CS(const_cast(&I)); + if (CS) { + Function *Callee = CS.getCalledFunction(); + if (!Callee || Callee->isDeclaration()) { + ++FI.InstCount; + continue; + } + if (&F == Callee) // Handle immediate recursion + continue; + + auto Loc = visit(*Callee); + + assert(Loc != FIM.end() && "No func info"); + FI.MemInstCount += Loc->second.MemInstCount; + FI.InstCount += Loc->second.InstCount; + FI.IAMInstCount += Loc->second.IAMInstCount; + FI.LSMInstCount += Loc->second.LSMInstCount; + } else if (auto *GEP = dyn_cast(&I)) { + TargetLoweringBase::AddrMode AM; + auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL); + AM.BaseGV = dyn_cast_or_null(const_cast(Ptr)); + AM.HasBaseReg = !AM.BaseGV; + if 
(TLI->isLegalAddressingMode(*DL, AM, GEP->getResultElementType(), + GEP->getPointerAddressSpace())) + // Offset will likely be folded into load or store + continue; + ++FI.InstCount; + } else { + ++FI.InstCount; + } + } + } + + return FIP.first; +} + +void AMDGPUPerfHint::runOnFunction(Function &F) { + if (FIM.find(&F) != FIM.end()) + return; + + const Module &M = *F.getParent(); + DL = &M.getDataLayout(); + AS = AMDGPU::getAMDGPUAS(M); + + auto Loc = visit(F); + + assert(Loc != FIM.end() && "No func info"); + LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Loc->second.MemInstCount + << '\n' + << " IAMInst: " << Loc->second.IAMInstCount << '\n' + << " LSMInst: " << Loc->second.LSMInstCount << '\n' + << " TotalInst: " << Loc->second.InstCount << '\n'); + + auto &FI = Loc->second; + + if (isMemBound(FI)) { + LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n"); + NumMemBound++; + } + + if (AMDGPU::isEntryFunctionCC(F.getCallingConv()) && needLimitWave(FI)) { + LLVM_DEBUG(dbgs() << F.getName() << " needs limit wave\n"); + NumLimitWave++; + } +} + +bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { + return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh; +} + +bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) { + return ((FI.MemInstCount + FI.IAMInstCount * IAWeight + + FI.LSMInstCount * LSWeight) * + 100 / FI.InstCount) > LimitWaveThresh; +} + +bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const { + if (auto PT = dyn_cast(V->getType())) { + unsigned As = PT->getAddressSpace(); + // Flat likely points to global too. + return As == AS.GLOBAL_ADDRESS || As == AS.FLAT_ADDRESS; + } + return false; +} + +bool AMDGPUPerfHint::isLocalAddr(const Value *V) const { + if (auto PT = dyn_cast(V->getType())) + return PT->getAddressSpace() == AS.LOCAL_ADDRESS; + return false; +} + +bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) { + LLVM_DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n'); + + MemAccessInfo MAI = makeMemAccessInfo(const_cast(Inst)); + bool IsLargeStride = MAI.isLargeStride(LastAccess); + if (MAI.Base) + LastAccess = std::move(MAI); + + return IsLargeStride; +} + +AMDGPUPerfHint::MemAccessInfo +AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const { + MemAccessInfo MAI; + const Value *MO = getMemoryInstrPtr(Inst); + + LLVM_DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n'); + // Do not treat local-addr memory access as large stride. + if (isLocalAddr(MO)) + return MAI; + + MAI.V = MO; + MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL); + return MAI; +} + +bool AMDGPUPerfHint::isConstantAddr(const Value *V) const { + if (auto PT = dyn_cast(V->getType())) { + unsigned As = PT->getAddressSpace(); + return As == AS.CONSTANT_ADDRESS || As == AS.CONSTANT_ADDRESS_32BIT; + } + return false; +} + +bool AMDGPUPerfHint::MemAccessInfo::isLargeStride( + MemAccessInfo &Reference) const { + + if (!Base || !Reference.Base || Base != Reference.Base) + return false; + + uint64_t Diff = Offset > Reference.Offset ? 
Offset - Reference.Offset + : Reference.Offset - Offset; + bool Result = Diff > LargeStrideThresh; + LLVM_DEBUG(dbgs() << "[isLargeStride compare]\n" + << print() << "<=>\n" + << Reference.print() << "Result:" << Result << '\n'); + return Result; +} +} // namespace + +bool AMDGPUPerfHintAnalysis::runOnFunction(Function &F) { + auto *TPC = getAnalysisIfAvailable(); + if (!TPC) + return false; + + const TargetMachine &TM = TPC->getTM(); + const TargetSubtargetInfo *ST = TM.getSubtargetImpl(F); + + AMDGPUPerfHint Analyzer(FIM, ST->getTargetLowering()); + Analyzer.runOnFunction(F); + return false; +} + +bool AMDGPUPerfHintAnalysis::isMemoryBound(const Function *F) const { + auto FI = FIM.find(F); + if (FI == FIM.end()) + return false; + + return AMDGPUPerfHint::isMemBound(FI->second); +} + +bool AMDGPUPerfHintAnalysis::needsWaveLimiter(const Function *F) const { + auto FI = FIM.find(F); + if (FI == FIM.end()) + return false; + + return AMDGPUPerfHint::needLimitWave(FI->second); +} Index: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt +++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt @@ -58,6 +58,7 @@ AMDGPUUnifyDivergentExitNodes.cpp AMDGPUUnifyMetadata.cpp AMDGPUInline.cpp + AMDGPUPerfHintAnalysis.cpp AMDILCFGStructurizer.cpp GCNHazardRecognizer.cpp GCNIterativeScheduler.cpp Index: llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ llvm/trunk/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -372,13 +372,23 @@ // We could not keep current target occupancy because of the just scheduled // region. Record new occupancy for next scheduling cycle. unsigned NewOccupancy = std::max(WavesAfter, WavesBefore); + // Allow memory bound functions to drop to 4 waves if not limited by an + // attribute. 
+ unsigned MinMemBoundWaves = std::max(MFI.getMinWavesPerEU(), 4u); + if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy && + WavesAfter >= MinMemBoundWaves && + (MFI.isMemoryBound() || MFI.needsWaveLimiter())) { + LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to " + << MinMemBoundWaves << " waves\n"); + NewOccupancy = WavesAfter; + } if (NewOccupancy < MinOccupancy) { MinOccupancy = NewOccupancy; LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to " << MinOccupancy << ".\n"); } - if (WavesAfter >= WavesBefore) { + if (WavesAfter >= MinOccupancy) { Pressure[RegionIdx] = PressureAfter; return; } Index: llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll +++ llvm/trunk/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s @var = addrspace(1) global float 0.0 @@ -17,9 +17,7 @@ ; CHECK: KernargSegmentAlign: 8 ; CHECK: WavefrontSize: 64 ; CHECK: NumSGPRs: 6 -; GFX700: NumVGPRs: 4 -; GFX803: NumVGPRs: 6 -; GFX900: NumVGPRs: 6 +; CHECK: NumVGPRs: 3 ; CHECK: MaxFlatWorkGroupSize: 256 define amdgpu_kernel void @test( half addrspace(1)* %r, Index: llvm/trunk/test/CodeGen/AMDGPU/perfhint.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/perfhint.ll +++ llvm/trunk/test/CodeGen/AMDGPU/perfhint.ll @@ -0,0 +1,85 @@ +; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: {{^}}test_membound: +; MemoryBound: 1 +; WaveLimiterHint : 1 +define amdgpu_kernel void @test_membound(<4 x i32> addrspace(1)* nocapture readonly %arg, <4 x i32> addrspace(1)* nocapture %arg1) { +bb: + %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() + %tmp2 = zext i32 %tmp to i64 + %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp2 + %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 16 + %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp2 + store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp5, align 16 + %tmp6 = add nuw nsw i64 %tmp2, 1 + %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp6 + %tmp8 = load <4 x i32>, <4 x i32> 
addrspace(1)* %tmp7, align 16 + %tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6 + store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16 + %tmp10 = add nuw nsw i64 %tmp2, 2 + %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10 + %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16 + %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10 + store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16 + %tmp14 = add nuw nsw i64 %tmp2, 3 + %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14 + %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16 + %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14 + store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16 + ret void +} + +; GCN-LABEL: {{^}}test_large_stride: +; MemoryBound: 0 +; WaveLimiterHint : 1 +define amdgpu_kernel void @test_large_stride(i32 addrspace(1)* nocapture %arg) { +bb: + %tmp = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4096 + %tmp1 = load i32, i32 addrspace(1)* %tmp, align 4 + %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + store i32 %tmp1, i32 addrspace(1)* %tmp2, align 4 + %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8192 + %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4 + %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + store i32 %tmp4, i32 addrspace(1)* %tmp5, align 4 + %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 12288 + %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4 + %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3 + store i32 %tmp7, i32 addrspace(1)* %tmp8, align 4 + ret void +} + +; GCN-LABEL: {{^}}test_indirect: +; MemoryBound: 0 +; WaveLimiterHint : 1 +define amdgpu_kernel void @test_indirect(i32 addrspace(1)* nocapture %arg) { +bb: + %tmp = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + %tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3 + %tmp3 = bitcast i32 addrspace(1)* %arg to <4 x i32> addrspace(1)* + %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4 + %tmp5 = extractelement <4 x i32> %tmp4, i32 0 + %tmp6 = sext i32 %tmp5 to i64 + %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6 + %tmp8 = load i32, i32 addrspace(1)* %tmp7, align 4 + store i32 %tmp8, i32 addrspace(1)* %arg, align 4 + %tmp9 = extractelement <4 x i32> %tmp4, i32 1 + %tmp10 = sext i32 %tmp9 to i64 + %tmp11 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp10 + %tmp12 = load i32, i32 addrspace(1)* %tmp11, align 4 + store i32 %tmp12, i32 addrspace(1)* %tmp, align 4 + %tmp13 = extractelement <4 x i32> %tmp4, i32 2 + %tmp14 = sext i32 %tmp13 to i64 + %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp14 + %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4 + store i32 %tmp16, i32 addrspace(1)* %tmp1, align 4 + %tmp17 = extractelement <4 x i32> %tmp4, i32 3 + %tmp18 = sext i32 %tmp17 to i64 + %tmp19 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp18 + %tmp20 = load i32, i32 addrspace(1)* %tmp19, align 4 + store i32 %tmp20, i32 addrspace(1)* %tmp2, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x()
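Note: as a quick reference for how the MemoryBound / WaveLimiterHint values checked above are derived, the following standalone C++ sketch mirrors the default heuristics of AMDGPUPerfHint::isMemBound and AMDGPUPerfHint::needLimitWave from this patch, using the cl::opt defaults (membound threshold 50%, limit-wave threshold 50%, indirect-access weight 1000, large-stride weight 1000). The instruction counts used in main() are hypothetical and are not taken from the test kernels.

// Standalone sketch of the perf-hint heuristics added by this patch.
// The sample counts in main() are made up purely for illustration.
#include <cstdio>

struct FuncInfo {
  unsigned MemInstCount = 0; // memory instructions
  unsigned InstCount = 0;    // all instructions
  unsigned IAMInstCount = 0; // indirect access memory instructions
  unsigned LSMInstCount = 0; // large stride memory instructions
};

// Defaults of -amdgpu-membound-threshold, -amdgpu-limit-wave-threshold,
// -amdgpu-indirect-access-weight and -amdgpu-large-stride-weight.
constexpr unsigned MemBoundThresh = 50;
constexpr unsigned LimitWaveThresh = 50;
constexpr unsigned IAWeight = 1000;
constexpr unsigned LSWeight = 1000;

// A function is considered memory bound when more than half of its
// instructions access memory.
bool isMemBound(const FuncInfo &FI) {
  return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
}

// A kernel gets the wave limiter hint when the weighted memory traffic
// (indirect and large-stride accesses count much more heavily) exceeds
// the threshold.
bool needLimitWave(const FuncInfo &FI) {
  return (FI.MemInstCount + FI.IAMInstCount * IAWeight +
          FI.LSMInstCount * LSWeight) * 100 / FI.InstCount > LimitWaveThresh;
}

int main() {
  FuncInfo FI;
  FI.InstCount = 20;
  FI.MemInstCount = 8;  // 40% of the instructions touch memory
  FI.LSMInstCount = 3;  // three large-stride accesses
  std::printf("MemoryBound: %d\n", isMemBound(FI));       // 0 (40% is below 50%)
  std::printf("WaveLimiterHint: %d\n", needLimitWave(FI)); // 1 (weighted ratio is 15040%)
  return 0;
}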