Weight memory instructions by access width in the AMDGPU perf-hint analysis.

For every instruction for which getMemoryInstrPtr() returns a pointer,
FI.MemInstCount and FI.InstCount now advance by
ceil(primitive size in bits of the pointee type / 32) instead of by one.
FI.IAMInstCount and FI.LSMInstCount are still incremented by one per
instruction in this revision.
NOTE(review): getPrimitiveSizeInBits() is taken on the pointee type as-is;
confirm the intended weight for non-primitive pointee types.

The perfhint.ll hunk removes two of the <4 x i32> load/store pairs from the
test function.

Index: llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -209,13 +209,16 @@
   for (auto &B : F) {
     LastAccess = MemAccessInfo();
     for (auto &I : B) {
-      if (getMemoryInstrPtr(&I)) {
+      if (const Value *Ptr = getMemoryInstrPtr(&I)) {
         if (isIndirectAccess(&I))
           ++FI.IAMInstCount;
         if (isLargeStride(&I))
           ++FI.LSMInstCount;
-        ++FI.MemInstCount;
-        ++FI.InstCount;
+        unsigned Size = divideCeil(
+            Ptr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+            32);
+        FI.MemInstCount += Size;
+        FI.InstCount += Size;
         continue;
       }
       if (auto *CB = dyn_cast<CallBase>(&I)) {
Index: llvm/test/CodeGen/AMDGPU/perfhint.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -16,15 +16,5 @@
   %tmp8 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp7, align 16
   %tmp9 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp6
   store <4 x i32> %tmp8, <4 x i32> addrspace(1)* %tmp9, align 16
-  %tmp10 = add nuw nsw i64 %tmp2, 2
-  %tmp11 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp10
-  %tmp12 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp11, align 16
-  %tmp13 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp10
-  store <4 x i32> %tmp12, <4 x i32> addrspace(1)* %tmp13, align 16
-  %tmp14 = add nuw nsw i64 %tmp2, 3
-  %tmp15 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg, i64 %tmp14
-  %tmp16 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp15, align 16
-  %tmp17 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %arg1, i64 %tmp14
-  store <4 x i32> %tmp16, <4 x i32> addrspace(1)* %tmp17, align 16
   ret void
 }