Index: llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -37,12 +37,11 @@
   bool needsWaveLimiter(const Function *F) const;
 
   struct FuncInfo {
-    unsigned MemInstCount;
-    unsigned InstCount;
-    unsigned IAMInstCount; // Indirect access memory instruction count
-    unsigned LSMInstCount; // Large stride memory instruction count
-    FuncInfo() : MemInstCount(0), InstCount(0), IAMInstCount(0),
-                 LSMInstCount(0) {}
+    unsigned MemInstCost;
+    unsigned InstCost;
+    unsigned IAMInstCost; // Indirect access memory instruction count
+    unsigned LSMInstCost; // Large stride memory instruction count
+    FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {}
   };
 
   typedef ValueMap<const Function*, FuncInfo> FuncInfoMap;
Index: llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -209,19 +209,22 @@
   for (auto &B : F) {
     LastAccess = MemAccessInfo();
     for (auto &I : B) {
-      if (getMemoryInstrPtr(&I)) {
+      if (const Value *Ptr = getMemoryInstrPtr(&I)) {
+        unsigned Size = divideCeil(
+            Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+            32);
         if (isIndirectAccess(&I))
-          ++FI.IAMInstCount;
+          FI.IAMInstCost += Size;
         if (isLargeStride(&I))
-          ++FI.LSMInstCount;
-        ++FI.MemInstCount;
-        ++FI.InstCount;
+          FI.LSMInstCost += Size;
+        FI.MemInstCost += Size;
+        FI.InstCost += Size;
         continue;
       }
       if (auto *CB = dyn_cast<CallBase>(&I)) {
         Function *Callee = CB->getCalledFunction();
         if (!Callee || Callee->isDeclaration()) {
-          ++FI.InstCount;
+          ++FI.InstCost;
           continue;
         }
         if (&F == Callee) // Handle immediate recursion
@@ -231,10 +234,10 @@
         if (Loc == FIM.end())
           continue;
 
-        FI.MemInstCount += Loc->second.MemInstCount;
-        FI.InstCount += Loc->second.InstCount;
-        FI.IAMInstCount += Loc->second.IAMInstCount;
-        FI.LSMInstCount += Loc->second.LSMInstCount;
+        FI.MemInstCost += Loc->second.MemInstCost;
+        FI.InstCost += Loc->second.InstCost;
+        FI.IAMInstCost += Loc->second.IAMInstCost;
+        FI.LSMInstCost += Loc->second.LSMInstCost;
       } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
         TargetLoweringBase::AddrMode AM;
         auto *Ptr = GetPointerBaseWithConstantOffset(GEP, AM.BaseOffs, *DL);
@@ -244,9 +247,9 @@
                                        GEP->getPointerAddressSpace()))
           // Offset will likely be folded into load or store
           continue;
-        ++FI.InstCount;
+        ++FI.InstCost;
       } else {
-        ++FI.InstCount;
+        ++FI.InstCost;
       }
     }
   }
@@ -264,11 +267,11 @@
 
   const AMDGPUPerfHintAnalysis::FuncInfo *Info = visit(F);
 
-  LLVM_DEBUG(dbgs() << F.getName() << " MemInst: " << Info->MemInstCount
+  LLVM_DEBUG(dbgs() << F.getName() << " MemInst cost: " << Info->MemInstCost
                     << '\n'
-                    << " IAMInst: " << Info->IAMInstCount << '\n'
-                    << " LSMInst: " << Info->LSMInstCount << '\n'
-                    << " TotalInst: " << Info->InstCount << '\n');
+                    << " IAMInst cost: " << Info->IAMInstCost << '\n'
+                    << " LSMInst cost: " << Info->LSMInstCost << '\n'
+                    << " TotalInst cost: " << Info->InstCost << '\n');
 
   if (isMemBound(*Info)) {
     LLVM_DEBUG(dbgs() << F.getName() << " is memory bound\n");
@@ -286,13 +289,12 @@
 }
 
 bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
-  return FI.MemInstCount * 100 / FI.InstCount > MemBoundThresh;
+  return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
 }
 
 bool AMDGPUPerfHint::needLimitWave(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
-  return ((FI.MemInstCount + FI.IAMInstCount * IAWeight +
-           FI.LSMInstCount * LSWeight) *
-          100 / FI.InstCount) > LimitWaveThresh;
+  return ((FI.MemInstCost + FI.IAMInstCost * IAWeight +
+           FI.LSMInstCost * LSWeight) * 100 / FI.InstCost) > LimitWaveThresh;
 }
 
 bool AMDGPUPerfHint::isGlobalAddr(const Value *V) const {
Index: llvm/test/CodeGen/AMDGPU/perfhint.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -16,16 +16,6 @@
   %tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
   %tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
   store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
-  %tmp10 = add nuw nsw i64 %tmp2, 2
-  %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10
-  %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16
-  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10
-  store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16
-  %tmp14 = add nuw nsw i64 %tmp2, 3
-  %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14
-  %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16
-  %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14
-  store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16
  ret void
 }
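
Illustration only, not part of the patch: a minimal standalone C++ sketch of the size-based cost the updated visit() loop charges per memory instruction, mirroring the divideCeil(PrimitiveSizeInBits, 32) expression in the hunk above. The helper name memInstCost and the bit-width inputs in main() are assumptions for the example; in the pass the width comes from the accessed pointer's pointee type.

// memInstCost charges one cost unit per 32-bit word accessed, so a
// <4 x i32> load or store costs 4 where the old counters charged 1.
#include <cstdint>
#include <cstdio>

static unsigned memInstCost(uint64_t PointeeSizeInBits) {
  // Same arithmetic as llvm::divideCeil(PointeeSizeInBits, 32).
  return static_cast<unsigned>((PointeeSizeInBits + 31) / 32);
}

int main() {
  std::printf("i32       -> %u\n", memInstCost(32));  // 1
  std::printf("i64       -> %u\n", memInstCost(64));  // 2
  std::printf("<4 x i32> -> %u\n", memInstCost(128)); // 4
}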