diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -333,6 +333,10 @@
   // Do not treat local-addr memory access as large stride.
   if (isLocalAddr(MO))
     return MAI;
+  // Do not treat constant-addr memory access as large stride because K$
+  // behavior is very different from L0$.
+  if (isConstantAddr(MO))
+    return MAI;
 
   MAI.V = MO;
   MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
diff --git a/llvm/test/CodeGen/AMDGPU/perfhint.ll b/llvm/test/CodeGen/AMDGPU/perfhint.ll
--- a/llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ b/llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -49,6 +49,26 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_constant_not_large_stride:
+; GCN: MemoryBound: 0
+; GCN: WaveLimiterHint : 0
+define amdgpu_kernel void @test_constant_not_large_stride(i32 addrspace(4)* nocapture %arg, i32 addrspace(1)* nocapture %arg1) {
+bb:
+  %tmp = getelementptr inbounds i32, i32 addrspace(4)* %arg, i64 4096
+  %tmp1 = load i32, i32 addrspace(4)* %tmp, align 4
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(4)* %arg, i64 8192
+  %tmp4 = load i32, i32 addrspace(4)* %tmp3, align 4
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(4)* %arg, i64 12288
+  %tmp7 = load i32, i32 addrspace(4)* %tmp6, align 4
+  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 1
+  store i32 %tmp1, i32 addrspace(1)* %tmp2, align 4
+  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 2
+  store i32 %tmp4, i32 addrspace(1)* %tmp5, align 4
+  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 3
+  store i32 %tmp7, i32 addrspace(1)* %tmp8, align 4
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_indirect:
 ; GCN: MemoryBound: 0
 ; GCN: WaveLimiterHint : 1