Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -133,6 +133,10 @@
 void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
 extern char &AMDGPUUseNativeCallsID;
 
+ModulePass *createAMDGPUPerfHintPass();
+void initializeAMDGPUPerfHintPass(PassRegistry &);
+extern char &AMDGPUPerfHintID;
+
 // Passes common to R600 and SI
 FunctionPass *createAMDGPUPromoteAlloca();
 void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -31,6 +31,7 @@
 class AMDGPUTargetStreamer;
 class MCOperand;
+class SIMachineFunctionInfo;
 class SISubtarget;
 
 class AMDGPUAsmPrinter final : public AsmPrinter {
@@ -145,7 +146,8 @@
   void emitCommonFunctionComments(uint32_t NumVGPR,
                                   uint32_t NumSGPR,
                                   uint64_t ScratchSize,
-                                  uint64_t CodeSize);
+                                  uint64_t CodeSize,
+                                  const SIMachineFunctionInfo *MFI);
 
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -281,11 +281,14 @@
     uint32_t NumVGPR,
     uint32_t NumSGPR,
     uint64_t ScratchSize,
-    uint64_t CodeSize) {
+    uint64_t CodeSize,
+    const SIMachineFunctionInfo *MFI) {
   OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
   OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
   OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
   OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
+  OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
+                              false);
 }
 
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -340,6 +343,8 @@
     OutStreamer->SwitchSection(CommentSection);
 
     if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
+      const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
       if (!MFI->isEntryFunction()) {
         OutStreamer->emitRawComment(" Function info:", false);
         SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
@@ -347,7 +352,7 @@
           Info.NumVGPR,
           Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
           Info.PrivateSegmentSize,
-          getFunctionCodeSize(MF));
+          getFunctionCodeSize(MF), MFI);
         return false;
       }
 
@@ -355,7 +360,7 @@
       emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
                                  CurrentProgramInfo.NumSGPR,
                                  CurrentProgramInfo.ScratchSize,
-                                 getFunctionCodeSize(MF));
+                                 getFunctionCodeSize(MF), MFI);
 
       OutStreamer->emitRawComment(
         " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
@@ -384,6 +389,9 @@
         " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),
         false);
 
+      OutStreamer->emitRawComment(
+        " WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
+
       if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
         OutStreamer->emitRawComment(
           " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
Index: lib/Target/AMDGPU/AMDGPUPerfHint.cpp
===================================================================
--- /dev/null
+++ lib/Target/AMDGPU/AMDGPUPerfHint.cpp
@@ -0,0 +1,416 @@
+//===-- AMDGPUPerfHint.cpp - Attach performance hints to functions -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Adds the amdgpu-hint-memory-bound attribute to potentially memory
+/// bound functions, and amdgpu-hint-wave-limiter to kernels which may benefit
+/// from limiting the number of waves to reduce cache conflicts.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIDefines.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/CommandLine.h"
+#include "Utils/AMDGPUBaseInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-perf-hint"
+
+static cl::opt<float> MemBoundThresh("amdgpu-membound-thresh",
+                                     cl::init(50),
+                                     cl::Hidden,
+                                     cl::value_desc("fp value"),
+                                     cl::desc("Function mem bound threshold"));
+
+static cl::opt<float> LimitWaveThresh("amdgpu-limit-wave-thresh",
+                                      cl::init(50),
+                                      cl::Hidden,
+                                      cl::value_desc("fp value"),
+                                      cl::desc("Kernel limit wave threshold"));
+
+static cl::opt<float> IAWeight("amdgpu-indirect-access-weight",
+                               cl::init(1000),
+                               cl::Hidden,
+                               cl::value_desc("fp value"),
+                               cl::desc("Indirect access memory instruction weight"));
+
+static cl::opt<float> LSWeight("amdgpu-large-stride-weight",
+                               cl::init(1000),
+                               cl::Hidden,
+                               cl::value_desc("fp value"),
+                               cl::desc("Large stride memory access weight"));
+
+static cl::opt<unsigned> LargeStrideThresh("amdgpu-large-stride-thresh",
+                                           cl::init(64),
+                                           cl::Hidden,
+                                           cl::value_desc("int value"),
+                                           cl::desc("Large stride memory access threshold"));
+
+STATISTIC(NumMemBound, "Number of functions marked as memory bound");
+STATISTIC(NumLimitWave, "Number of functions marked as needing limit wave");
+
+namespace {
+
+struct AMDGPUPerfHint : public ModulePass {
+  static char ID;
+
+public:
+  AMDGPUPerfHint() : ModulePass(ID), DL(nullptr) {}
+
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+
+private:
+
+  struct MemAccessInfo {
+    const Value *V;
+    const Value *Base;
+    int64_t Offset;
+    MemAccessInfo() : V(nullptr), Base(nullptr), Offset(0) {}
+    bool isLargeStride(MemAccessInfo &Reference) const;
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    Printable print() const {
+      return Printable([this](raw_ostream &OS) {
+        OS << "Value: " << *V << '\n'
+           << "Base: " << *Base << " Offset: " << Offset << '\n';
+      });
+    }
+#endif
+  };
+
+  MemAccessInfo makeMemAccessInfo(Instruction *) const;
+
+  MemAccessInfo LastAccess; // Last memory access info
+
+  struct FuncInfo {
+    unsigned MemInstCount;
+    unsigned InstCount;
+    unsigned IAMInstCount; // Indirect access memory instruction count
+    unsigned LSMInstCount; // Large stride memory instruction count
+    FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0),
+                 LSMInstCount(0) {}
+  };
+
+  typedef ValueMap<const Function *, FuncInfo> FuncInfoMap;
+  FuncInfoMap FIM;
+  const DataLayout *DL;
+  AMDGPUAS AS;
+
+  void visit(Function &F);
+  bool isMemBound(const FuncInfo &F) const;
+  bool needLimitWave(const FuncInfo &F) const;
+  bool markMemoryBound(Function &F, const FuncInfo &FI);
+  bool markLimitWave(Function &F, const FuncInfo &FI);
+
+  /// Set function attribute \p Name to \p Value if the function does not
+  /// already have it.
+  void setFunctionAttribIfNone(Function &F, StringRef Name, int Value);
+  bool isIndirectAccess(const Instruction *Inst) const;
+
+  /// Check if the instruction is a large stride access.
+  /// The purpose is to identify memory access patterns like:
+  /// x = a[i];
+  /// y = a[i+1000];
+  /// z = a[i+2000];
+  /// In the above example, the second and third memory accesses will be
+  /// marked as large stride memory accesses.
+  bool isLargeStride(const Instruction *Inst);
+
+  bool isGlobalAddr(const Value *V) const;
+  bool isLocalAddr(const Value *V) const;
+  bool isConstantAddr(const Value *V) const;
+};
+} // namespace
+
+char AMDGPUPerfHint::ID = 0;
+char &llvm::AMDGPUPerfHintID = AMDGPUPerfHint::ID;
+
+INITIALIZE_PASS(AMDGPUPerfHint, DEBUG_TYPE,
+                "Add performance hint attribute to function", false, false)
+
+ModulePass *llvm::createAMDGPUPerfHintPass() {
+  return new AMDGPUPerfHint();
+}
+
+// Mark a function as memory bound by setting the corresponding attribute.
+bool AMDGPUPerfHint::markMemoryBound(Function &F, const FuncInfo &FI) {
+  if (!isMemBound(FI))
+    return false;
+
+  DEBUG(dbgs() << F.getName() << " is memory bound\n");
+  setFunctionAttribIfNone(F, ATTR_MEMBOUND_HINT, 1);
+
+  NumMemBound++;
+
+  return true;
+}
+
+bool AMDGPUPerfHint::isIndirectAccess(const Instruction *Inst) const {
+  DEBUG(dbgs() << "[isIndirectAccess] " << *Inst << '\n');
+  DenseSet<const Value *> WorkSet;
+  DenseSet<const Value *> Visited;
+  if (auto LI = dyn_cast<LoadInst>(Inst)) {
+    auto MO = LI->getPointerOperand();
+    if (isGlobalAddr(MO))
+      WorkSet.insert(MO);
+  } else if (auto SI = dyn_cast<StoreInst>(Inst)) {
+    auto MO = SI->getPointerOperand();
+    if (isGlobalAddr(MO))
+      WorkSet.insert(MO);
+  }
+
+  while (!WorkSet.empty()) {
+    const Value *V = *WorkSet.begin();
+    WorkSet.erase(WorkSet.begin());
+    if (Visited.count(V))
+      continue;
+    Visited.insert(V);
+    DEBUG(dbgs() << "  check: " << *V << '\n');
+
+    if (auto LD = dyn_cast<LoadInst>(V)) {
+      auto M = LD->getPointerOperand();
+      if (isGlobalAddr(M) ||
+          isLocalAddr(M) ||
+          isConstantAddr(M)) {
+        DEBUG(dbgs() << "  is IA\n");
+        return true;
+      }
+      continue;
+    }
+
+    if (auto GEP = dyn_cast<GetElementPtrInst>(V)) {
+      auto P = GEP->getPointerOperand();
+      WorkSet.insert(P);
+      for (unsigned I = 1, E = GEP->getNumIndices() + 1; I != E; ++I)
+        WorkSet.insert(GEP->getOperand(I));
+      continue;
+    }
+
+    if (auto U = dyn_cast<UnaryInstruction>(V)) {
+      WorkSet.insert(U->getOperand(0));
+      continue;
+    }
+
+    if (auto BO = dyn_cast<BinaryOperator>(V)) {
+      WorkSet.insert(BO->getOperand(0));
+      WorkSet.insert(BO->getOperand(1));
+      continue;
+    }
+
+    if (auto S = dyn_cast<SelectInst>(V)) {
+      WorkSet.insert(S->getFalseValue());
+      WorkSet.insert(S->getTrueValue());
+      continue;
+    }
+
+    if (auto E = dyn_cast<ExtractElementInst>(V)) {
+      WorkSet.insert(E->getVectorOperand());
+      continue;
+    }
+
+    if (auto Phi = dyn_cast<PHINode>(V)) {
+      for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
+        WorkSet.insert(Phi->getIncomingValue(I));
+      continue;
+    }
+
+    DEBUG(dbgs() << "  dropped\n");
+  }
+
+  DEBUG(dbgs() << "  is not IA\n");
+  return false;
+}
+
+void AMDGPUPerfHint::visit(Function &F) {
+  if (FIM.count(&F))
+    return;
+
+  DEBUG(dbgs() << "[AMDGPUPerfHint] process " << F.getName() << '\n');
+  FuncInfo FI;
+
+  for (auto &B : F) {
+    LastAccess = MemAccessInfo();
+    for (auto &I : B) {
+      if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+        if (isIndirectAccess(&I))
+          ++FI.IAMInstCount;
+        if (isLargeStride(&I))
+          ++FI.LSMInstCount;
+        ++FI.MemInstCount;
+        ++FI.InstCount;
+      } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+        Function *Callee = CI->getCalledFunction();
+        if (!Callee || Callee->isDeclaration()) {
+          ++FI.InstCount;
+          continue;
+        }
+        if (&F == Callee) // Handle immediate recursion
+          continue;
+
+        visit(*Callee);
+
+        FuncInfoMap::iterator Loc = FIM.find(Callee);
+        assert(Loc != FIM.end() && "No func info");
+        FI.MemInstCount += Loc->second.MemInstCount;
+        FI.InstCount += Loc->second.InstCount;
+        FI.IAMInstCount += Loc->second.IAMInstCount;
+        FI.LSMInstCount += Loc->second.LSMInstCount;
+      } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+        APInt Off(DL->getIndexSizeInBits(GEP->getPointerAddressSpace()), 0);
+        if (GEP->accumulateConstantOffset(*DL, Off)) {
+          if (Off.isIntN(12))
+            // Offset will likely be folded into the load or store.
+            continue;
+        }
+        ++FI.InstCount;
+      } else {
+        ++FI.InstCount;
+      }
+    }
+  }
+  FIM[&F] = FI;
+}
+
+bool AMDGPUPerfHint::isMemBound(const FuncInfo &FI) const {
+  return static_cast<double>(FI.MemInstCount) / FI.InstCount * 100 >
+         MemBoundThresh;
+}
+
+bool AMDGPUPerfHint::runOnModule(Module &M) {
+  bool Changed = false;
+  DL = &M.getDataLayout();
+  AS = AMDGPU::getAMDGPUAS(M);
+
+  for (auto &I : M) {
+    if (I.isDeclaration())
+      continue;
+
+    visit(I);
+
+    FuncInfoMap::iterator Loc = FIM.find(&I);
+    assert(Loc != FIM.end() && "No func info");
+    DEBUG(dbgs() << I.getName() << '\n'
+                 << " MemInst: " << Loc->second.MemInstCount << '\n'
+                 << " IAMInst: " << Loc->second.IAMInstCount << '\n'
+                 << " LSMInst: " << Loc->second.LSMInstCount << '\n'
+                 << " TotalInst: " << Loc->second.InstCount << '\n');
+
+    auto &FI = Loc->second;
+    Changed |= markMemoryBound(I, FI);
+    if (AMDGPU::isEntryFunctionCC(I.getCallingConv()))
+      Changed |= markLimitWave(I, FI);
+  }
+
+  return Changed;
+}
+
+bool AMDGPUPerfHint::needLimitWave(const FuncInfo &FI) const {
+  return static_cast<double>(FI.MemInstCount +
+                             FI.IAMInstCount * IAWeight +
+                             FI.LSMInstCount * LSWeight)
+         / FI.InstCount * 100 > LimitWaveThresh;
+}
+
+bool AMDGPUPerfHint::markLimitWave(Function &F, const FuncInfo &FI) {
+  if (!needLimitWave(FI))
+    return false;
+
+  DEBUG(dbgs() << F.getName() << " needs limit wave\n");
+  setFunctionAttribIfNone(F, ATTR_WAVE_LIMITER_HINT, 1);
+
+  NumLimitWave++;
+
+  return true;
+}
+
+void AMDGPUPerfHint::setFunctionAttribIfNone(Function &F, StringRef Name,
+                                             int Value) {
+  if (!F.hasFnAttribute(Name))
+    F.addFnAttr(Name, std::to_string(Value));
+}
+
+bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
+  if (auto PT = dyn_cast<PointerType>(V->getType())) {
+    unsigned As = PT->getAddressSpace();
+    // Flat likely points to global too.
+    return As == AS.GLOBAL_ADDRESS || As == AS.FLAT_ADDRESS;
+  }
+  return false;
+}
+
+bool AMDGPUPerfHint::isLocalAddr(const Value *V) const {
+  if (auto PT = dyn_cast<PointerType>(V->getType()))
+    return PT->getAddressSpace() == AS.LOCAL_ADDRESS;
+  return false;
+}
+
+bool AMDGPUPerfHint::isLargeStride(const Instruction *Inst) {
+  DEBUG(dbgs() << "[isLargeStride] " << *Inst << '\n');
+
+  MemAccessInfo MAI = makeMemAccessInfo(const_cast<Instruction *>(Inst));
+  bool IsLargeStride = MAI.isLargeStride(LastAccess);
+  if (MAI.Base)
+    LastAccess = std::move(MAI);
+
+  return IsLargeStride;
+}
+
+AMDGPUPerfHint::MemAccessInfo
+AMDGPUPerfHint::makeMemAccessInfo(Instruction *Inst) const {
+  MemAccessInfo MAI;
+  const Value *MO = nullptr;
+  if (auto LI = dyn_cast<LoadInst>(Inst))
+    MO = LI->getPointerOperand();
+  else if (auto SI = dyn_cast<StoreInst>(Inst))
+    MO = SI->getPointerOperand();
+
+  DEBUG(dbgs() << "[isLargeStride] MO: " << *MO << '\n');
+  // Do not treat local-addr memory access as large stride.
+  if (isLocalAddr(MO))
+    return MAI;
+
+  MAI.V = MO;
+  MAI.Base = llvm::GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
+  return MAI;
+}
+
+bool AMDGPUPerfHint::isConstantAddr(const Value *V) const {
+  if (auto PT = dyn_cast<PointerType>(V->getType()))
+    return PT->getAddressSpace() == AS.CONSTANT_ADDRESS;
+  return false;
+}
+
+bool AMDGPUPerfHint::MemAccessInfo::isLargeStride(
+    MemAccessInfo &Reference) const {
+  if (!Base || !Reference.Base || Base != Reference.Base)
+    return false;
+
+  uint64_t Diff = Offset > Reference.Offset ? Offset - Reference.Offset
+                                            : Reference.Offset - Offset;
+  bool Result = Diff > LargeStrideThresh;
+  DEBUG(dbgs() << "[isLargeStride compare]\n"
+               << print()
+               << "<=>\n"
+               << Reference.print()
+               << "Result:" << Result << '\n');
+  return Result;
+}
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -181,6 +181,7 @@
   initializeAMDGPUUseNativeCallsPass(*PR);
   initializeAMDGPUSimplifyLibCallsPass(*PR);
   initializeAMDGPUInlinerPass(*PR);
+  initializeAMDGPUPerfHintPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -672,6 +673,7 @@
 
 bool AMDGPUPassConfig::addPreISel() {
   addPass(createFlattenCFGPass());
+  addPass(createAMDGPUPerfHintPass());
   return false;
 }
Index: lib/Target/AMDGPU/CMakeLists.txt
===================================================================
--- lib/Target/AMDGPU/CMakeLists.txt
+++ lib/Target/AMDGPU/CMakeLists.txt
@@ -57,6 +57,7 @@
   AMDGPUUnifyDivergentExitNodes.cpp
   AMDGPUUnifyMetadata.cpp
   AMDGPUInline.cpp
+  AMDGPUPerfHint.cpp
   AMDILCFGStructurizer.cpp
   GCNHazardRecognizer.cpp
   GCNIterativeScheduler.cpp
Index: lib/Target/AMDGPU/GCNSchedStrategy.cpp
===================================================================
--- lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -372,13 +372,23 @@
   // We could not keep current target occupancy because of the just scheduled
   // region. Record new occupancy for next scheduling cycle.
   unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+  // Allow memory bound functions to drop to 4 waves if not limited by an
+  // attribute.
+  unsigned MinMemBoundWaves = std::max(MFI.getMinWavesPerEU(), 4u);
+  if (WavesAfter < WavesBefore && WavesAfter < MinOccupancy &&
+      WavesAfter >= MinMemBoundWaves &&
+      (MFI.isMemoryBound() || MFI.needsWaveLimiter())) {
+    LLVM_DEBUG(dbgs() << "Function is memory bound, allow occupancy drop up to "
+                      << MinMemBoundWaves << " waves\n");
+    NewOccupancy = WavesAfter;
+  }
   if (NewOccupancy < MinOccupancy) {
     MinOccupancy = NewOccupancy;
     LLVM_DEBUG(dbgs() << "Occupancy lowered for the function to "
                       << MinOccupancy << ".\n");
   }
-  if (WavesAfter >= WavesBefore) {
+  if (WavesAfter >= MinOccupancy) {
     Pressure[RegionIdx] = PressureAfter;
     return;
   }
Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -534,6 +534,9 @@
 #define R_SPILLED_SGPRS 0x4
 #define R_SPILLED_VGPRS 0x8
+
+#define ATTR_MEMBOUND_HINT "amdgpu-hint-memory-bound"
+#define ATTR_WAVE_LIMITER_HINT "amdgpu-hint-wave-limiter"
 } // End namespace llvm
 #endif
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -178,6 +178,12 @@
   // user arguments. This is an offset from the KernargSegmentPtr.
   bool ImplicitArgPtr : 1;
 
+  // Function may be memory bound.
+  bool MemoryBound : 1;
+
+  // Kernel may need limited waves per EU for better performance.
+  bool WaveLimiter : 1;
+
   // The hard-wired high half of the address of the global information table
   // for AMDPAL OS type. 0xffffffff represents no hard-wired high half, since
   // current hardware only allows a 16 bit value.
@@ -389,6 +395,14 @@
     return ImplicitBufferPtr;
   }
 
+  bool isMemoryBound() const {
+    return MemoryBound;
+  }
+
+  bool needsWaveLimiter() const {
+    return WaveLimiter;
+  }
+
   AMDGPUFunctionArgInfo &getArgInfo() {
     return ArgInfo;
   }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -47,6 +47,8 @@
     WorkItemIDZ(false),
     ImplicitBufferPtr(false),
     ImplicitArgPtr(false),
+    MemoryBound(false),
+    WaveLimiter(false),
     GITPtrHigh(0xffffffff),
     HighBitsOf32BitAddress(0) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
@@ -170,6 +172,12 @@
   S = A.getValueAsString();
   if (!S.empty())
     S.consumeInteger(0, HighBitsOf32BitAddress);
+
+  if (F.hasFnAttribute(ATTR_MEMBOUND_HINT))
+    MemoryBound = true;
+
+  if (F.hasFnAttribute(ATTR_WAVE_LIMITER_HINT))
+    WaveLimiter = true;
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
Index: test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
===================================================================
--- test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX803 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-misched=0 -filetype=obj -o - < %s | llvm-readobj -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
 
 @var = addrspace(1) global float 0.0
 
@@ -17,9 +17,7 @@
 ; CHECK: KernargSegmentAlign: 8
 ; CHECK: WavefrontSize: 64
 ; CHECK: NumSGPRs: 6
-; GFX700: NumVGPRs: 4
-; GFX803: NumVGPRs: 6
-; GFX900: NumVGPRs: 6
+; CHECK: NumVGPRs: 3
 ; CHECK: MaxFlatWorkGroupSize: 256
 define amdgpu_kernel void @test(
     half addrspace(1)* %r,
Index: test/CodeGen/AMDGPU/perfhint.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/perfhint.ll
@@ -0,0 +1,85 @@
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_membound:
+; GCN: MemoryBound: 1
+; GCN: WaveLimiterHint : 1
+define amdgpu_kernel void @test_membound(<4 x i32> addrspace(1)* nocapture readonly %arg, <4 x i32> addrspace(1)* nocapture %arg1) {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp2 = zext i32 %tmp to i64
+  %tmp3 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp2
+  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 16
+  %tmp5 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp2
+  store <4 x i32> %tmp4, <4 x i32> addrspace(1)* %tmp5, align 16
+  %tmp6 = add nuw nsw i64 %tmp2, 1
+  %tmp7 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp6
+  %tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
+  %tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
+  store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
+  %tmp10 = add nuw nsw i64 %tmp2, 2
+  %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10
+  %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16
+  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10
+  store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16
+  %tmp14 = add nuw nsw i64 %tmp2, 3
+  %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14
+  %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16
+  %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14
+  store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_large_stride:
+; GCN: MemoryBound: 0
+; GCN: WaveLimiterHint : 1
+define amdgpu_kernel void @test_large_stride(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4096
+  %tmp1 = load i32, i32 addrspace(1)* %tmp, align 4
+  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  store i32 %tmp1, i32 addrspace(1)* %tmp2, align 4
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8192
+  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
+  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %tmp4, i32 addrspace(1)* %tmp5, align 4
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 12288
+  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  store i32 %tmp7, i32 addrspace(1)* %tmp8, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_indirect:
+; GCN: MemoryBound: 0
+; GCN: WaveLimiterHint : 1
+define amdgpu_kernel void @test_indirect(i32 addrspace(1)* nocapture %arg) {
+bb:
+  %tmp = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %tmp1 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  %tmp3 = bitcast i32 addrspace(1)* %arg to <4 x i32> addrspace(1)*
+  %tmp4 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp3, align 4
+  %tmp5 = extractelement <4 x i32> %tmp4, i32 0
+  %tmp6 = sext i32 %tmp5 to i64
+  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
+  %tmp8 = load i32, i32 addrspace(1)* %tmp7, align 4
+  store i32 %tmp8, i32 addrspace(1)* %arg, align 4
+  %tmp9 = extractelement <4 x i32> %tmp4, i32 1
+  %tmp10 = sext i32 %tmp9 to i64
+  %tmp11 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp10
+  %tmp12 = load i32, i32 addrspace(1)* %tmp11, align 4
+  store i32 %tmp12, i32 addrspace(1)* %tmp, align 4
+  %tmp13 = extractelement <4 x i32> %tmp4, i32 2
+  %tmp14 = sext i32 %tmp13 to i64
+  %tmp15 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp14
+  %tmp16 = load i32, i32 addrspace(1)* %tmp15, align 4
+  store i32 %tmp16, i32 addrspace(1)* %tmp1, align 4
+  %tmp17 = extractelement <4 x i32> %tmp4, i32 3
+  %tmp18 = sext i32 %tmp17 to i64
+  %tmp19 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp18
+  %tmp20 = load i32, i32 addrspace(1)* %tmp19, align 4
+  store i32 %tmp20, i32 addrspace(1)* %tmp2, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()