diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -333,6 +333,10 @@
   // Do not treat local-addr memory access as large stride.
   if (isLocalAddr(MO))
     return MAI;
+  // Do not treat constant-addr memory access as large stride because K$
+  // behavior is very different from L0$.
+  if (isConstantAddr(MO))
+    return MAI;
 
   MAI.V = MO;
   MAI.Base = GetPointerBaseWithConstantOffset(MO, MAI.Offset, *DL);
diff --git a/llvm/test/CodeGen/AMDGPU/perfhint.ll b/llvm/test/CodeGen/AMDGPU/perfhint.ll
--- a/llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ b/llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -49,6 +49,26 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_constant_not_large_stride:
+; GCN: MemoryBound: 0
+; GCN: WaveLimiterHint : 0
+define amdgpu_kernel void @test_constant_not_large_stride(i32 addrspace(4)* nocapture %arg, i32 addrspace(1)* nocapture %arg1) {
+bb:
+  %tmp = getelementptr inbounds i32, i32 addrspace(4)* %arg, i64 4096
+  %tmp1 = load i32, i32 addrspace(4)* %tmp, align 4
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(4)* %arg, i64 8192
+  %tmp4 = load i32, i32 addrspace(4)* %tmp3, align 4
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(4)* %arg, i64 12288
+  %tmp7 = load i32, i32 addrspace(4)* %tmp6, align 4
+  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 1
+  store i32 %tmp1, i32 addrspace(1)* %tmp2, align 4
+  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 2
+  store i32 %tmp4, i32 addrspace(1)* %tmp5, align 4
+  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 3
+  store i32 %tmp7, i32 addrspace(1)* %tmp8, align 4
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_indirect:
 ; GCN: MemoryBound: 0
 ; GCN: WaveLimiterHint : 1