diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -224,6 +224,10 @@
     LastAccess = MemAccessInfo();
     unsigned UsedGlobalLoadsInBB = 0;
     for (auto &I : B) {
+      // Cast instructions (bitcasts and every other opcode for which isCast()
+      // is true) are excluded from the cost calculation.
+      if (I.isCast())
+        continue;
       if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
         unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
         // TODO: Check if the global load and its user are close to each other
diff --git a/llvm/test/CodeGen/AMDGPU/perfhint-instr-cost.ll b/llvm/test/CodeGen/AMDGPU/perfhint-instr-cost.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/perfhint-instr-cost.ll
@@ -0,0 +1,19 @@
+; REQUIRES: asserts
+; RUN: llc -march=amdgcn -debug-only=amdgpu-perf-hint < %s 2>&1 | FileCheck %s
+; RUN: llc -march=amdgcn -debug-only=amdgpu-perf-hint -opaque-pointers=1 < %s 2>&1 | FileCheck %s
+
+; The pointer bitcasts below are skipped by the perf-hint cost calculation;
+; both RUN lines expect the same costs.
+define i32 @perfHintInstrCost(i8 addrspace(4)* %p1, i8 addrspace(4)* %p2, i8 addrspace(4)* %p3) {
+; CHECK: MemInst cost: 3
+; CHECK: TotalInst cost: 6
+  %x.cast = bitcast i8 addrspace(4)* %p1 to i32 addrspace(4)*
+  %y.cast = bitcast i8 addrspace(4)* %p2 to i32 addrspace(4)*
+  %z.cast = bitcast i8 addrspace(4)* %p3 to i32 addrspace(4)*
+  %x = load volatile i32, i32 addrspace(4)* %x.cast
+  %y = load volatile i32, i32 addrspace(4)* %y.cast
+  %z = load volatile i32, i32 addrspace(4)* %z.cast
+  %sum = add i32 %x, %y
+  %sum2 = add i32 %sum, %z
+  ret i32 %sum2
+}
diff --git a/llvm/test/CodeGen/AMDGPU/perfhint.ll b/llvm/test/CodeGen/AMDGPU/perfhint.ll
--- a/llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ b/llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -opaque-pointers=1 < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}test_membound:
 ; GCN: MemoryBound: 1
@@ -21,6 +22,7 @@
 
 ; GCN-LABEL: {{^}}test_membound_1:
 ; GCN: MemoryBound: 1
+; GCN: WaveLimiterHint : 1
 define amdgpu_kernel void @test_membound_1(<2 x double> addrspace(1)* nocapture readonly %ptr.0,
                                            <2 x double> addrspace(1)* nocapture %ptr.1,
                                            <2 x double> %arg.0, i32 %arg.1, <4 x double> %arg.2) {
@@ -112,7 +114,7 @@
 }
 
 ; GCN-LABEL: {{^}}test_indirect:
-; GCN: MemoryBound: 0
+; GCN: MemoryBound: 1
 ; GCN: WaveLimiterHint : 1
 define amdgpu_kernel void @test_indirect(i32 addrspace(1)* nocapture %arg) {
 bb: