diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.h
@@ -41,7 +41,11 @@
   unsigned InstCost;
   unsigned IAMInstCost; // Indirect access memory instruction count
   unsigned LSMInstCost; // Large stride memory instruction count
-  FuncInfo() : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0) {}
+  bool HasDenseGlobalMemAcc; // Set if at least 1 basic block has relatively
+                             // high global memory access
+  FuncInfo()
+      : MemInstCost(0), InstCost(0), IAMInstCost(0), LSMInstCost(0),
+        HasDenseGlobalMemAcc(false) {}
 };
 
 typedef ValueMap<const Function *, FuncInfo> FuncInfoMap;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -116,6 +116,7 @@
   bool isGlobalAddr(const Value *V) const;
   bool isLocalAddr(const Value *V) const;
+  bool isGlobalLoadUsedInBB(const Instruction &) const;
 };
 
 static std::pair<const Value *, const Type *> getMemoryInstrPtrAndType(
@@ -196,6 +197,24 @@
   return false;
 }
 
+// Returns true if the global load `I` is used in its own basic block.
+bool AMDGPUPerfHint::isGlobalLoadUsedInBB(const Instruction &I) const {
+  const auto *Ld = dyn_cast<LoadInst>(&I);
+  if (!Ld)
+    return false;
+  if (!isGlobalAddr(Ld->getPointerOperand()))
+    return false;
+
+  for (const User *Usr : Ld->users()) {
+    if (const Instruction *UsrInst = dyn_cast<Instruction>(Usr)) {
+      if (UsrInst->getParent() == I.getParent())
+        return true;
+    }
+  }
+
+  return false;
+}
+
 AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
   AMDGPUPerfHintAnalysis::FuncInfo &FI = FIM[&F];
 
@@ -203,9 +222,14 @@
   for (auto &B : F) {
     LastAccess = MemAccessInfo();
+    unsigned UsedGlobalLoadsInBB = 0;
     for (auto &I : B) {
       if (const Type *Ty = getMemoryInstrPtrAndType(&I).second) {
         unsigned Size = divideCeil(Ty->getPrimitiveSizeInBits(), 32);
+        // TODO: Check if the global load and its user are close to each other
+        // instead (Or do this analysis in GCNSchedStrategy?).
+        if (isGlobalLoadUsedInBB(I))
+          UsedGlobalLoadsInBB += Size;
         if (isIndirectAccess(&I))
           FI.IAMInstCost += Size;
         if (isLargeStride(&I))
@@ -245,6 +269,16 @@
         ++FI.InstCost;
       }
     }
+
+    if (!FI.HasDenseGlobalMemAcc) {
+      unsigned GlobalMemAccPercentage = UsedGlobalLoadsInBB * 100 / B.size();
+      if (GlobalMemAccPercentage > 50) {
+        LLVM_DEBUG(dbgs() << "[HasDenseGlobalMemAcc] Set to true since "
+                          << B.getName() << " has " << GlobalMemAccPercentage
+                          << "% global memory access\n");
+        FI.HasDenseGlobalMemAcc = true;
+      }
+    }
   }
 
   return &FI;
 }
@@ -286,6 +320,11 @@
 }
 
 bool AMDGPUPerfHint::isMemBound(const AMDGPUPerfHintAnalysis::FuncInfo &FI) {
+  // Reverting optimal scheduling in favour of occupancy with basic block(s)
+  // having dense global memory access can potentially hurt performance.
+  if (FI.HasDenseGlobalMemAcc)
+    return true;
+
   return FI.MemInstCost * 100 / FI.InstCost > MemBoundThresh;
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/perfhint.ll b/llvm/test/CodeGen/AMDGPU/perfhint.ll
--- a/llvm/test/CodeGen/AMDGPU/perfhint.ll
+++ b/llvm/test/CodeGen/AMDGPU/perfhint.ll
@@ -20,7 +20,7 @@
 }
 
 ; GCN-LABEL: {{^}}test_membound_1:
-; GCN: MemoryBound: 0
+; GCN: MemoryBound: 1
 define amdgpu_kernel void @test_membound_1(<2 x double> addrspace(1)* nocapture readonly %ptr.0, <2 x double> addrspace(1)* nocapture %ptr.1, <2 x double> %arg.0, i32 %arg.1, <4 x double> %arg.2) {
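
A minimal sketch of the kind of block the new heuristic flags (the kernel name and body below are hypothetical, not part of perfhint.ll). The <2 x double> global load contributes divideCeil(128, 32) = 4 to UsedGlobalLoadsInBB because its user (the fadd) sits in the same basic block, and the block holds 4 instructions in total, so GlobalMemAccPercentage = 4 * 100 / 4 = 100 > 50: HasDenseGlobalMemAcc is set and isMemBound() now returns true regardless of the MemInstCost/InstCost ratio.

; Hypothetical example, not from the patch's test suite.
define amdgpu_kernel void @dense_global_acc(<2 x double> addrspace(1)* %in,
                                            <2 x double> addrspace(1)* %out) {
  %v = load <2 x double>, <2 x double> addrspace(1)* %in  ; global load, used below
  %s = fadd <2 x double> %v, %v                           ; same-block user of %v
  store <2 x double> %s, <2 x double> addrspace(1)* %out
  ret void
}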