diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -232,6 +232,7 @@
   unsigned getInliningThresholdMultiplier() const { return 11; }
   unsigned adjustInliningThreshold(const CallBase *CB) const;
+  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;
 
   int getInlinerVectorBonusPercent() const { return 0; }
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1206,11 +1206,11 @@
   return adjustThreshold;
 }
 
-unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+static size_t getTotalAllocaSize(const CallBase *CB, const DataLayout &DL) {
   // If we have a pointer to private array passed into a function
   // it will not be optimized out, leaving scratch usage.
   // Increase the inline threshold to allow inlining in this case.
-  unsigned adjustThreshold = 0;
+  // The bigger the private array, the bigger the bonus.
   uint64_t AllocaSize = 0;
   SmallPtrSet<const AllocaInst *, 8> AIVisited;
   for (Value *PtrArg : CB->args()) {
@@ -1224,18 +1224,38 @@
       if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
         continue;
       AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
-      // If the amount of stack memory is excessive we will not be able
-      // to get rid of the scratch anyway, bail out.
-      if (AllocaSize > ArgAllocaCutoff) {
-        AllocaSize = 0;
-        break;
-      }
     }
   }
-  adjustThreshold +=
-      adjustInliningThresholdUsingCallee(CB, TLI, this);
-  adjustThreshold += AllocaSize ? ArgAllocaCost : AllocaSize;
-  return adjustThreshold;
+  return AllocaSize;
+}
+
+unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+  unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
+  size_t AllocaSize = getTotalAllocaSize(CB, DL);
+  if (AllocaSize > 0) {
+    Threshold += ArgAllocaCost;
+  }
+  return Threshold;
+}
+
+unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
+                                         const AllocaInst *AI) const {
+  auto AllocaSize = getTotalAllocaSize(CB, DL);
+  if (AllocaSize <= ArgAllocaCutoff) {
+    // The inline threshold bonus has already been granted by
+    // adjustInliningThreshold.
+    return 0;
+  }
+
+  // Give an inline threshold bonus depending on the size of the alloca that is
+  // being optimized by SROA: the bigger the array, the better the chances of
+  // avoiding scratch memory.
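+  // For illustration (values inferred from the tests below, which run with
+  // the default ArgAllocaCost of 4000 and the threshold multiplier of 11):
+  // a single 65-float (260-byte) alloca receives the full 4000 * 11 = 44000,
+  // while two allocas of 132 and 128 bytes split that total proportionally
+  // into 44000 * 132 / 260 = 22338 and 44000 * 128 / 260 = 21661.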
+  auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
+  unsigned ThresholdBonus =
+      (ArgAllocaCost * getInliningThresholdMultiplier() * ArgAllocaSize) /
+      AllocaSize;
+  // Awkwardly, this bonus then gets multiplied by the vector and single-BB
+  // bonuses.
+  return ThresholdBonus;
 }
 
 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
@@ -30,6 +30,8 @@
 define coldcc void @foo_private_ptr2(ptr addrspace(5) nocapture %p1, ptr addrspace(5) nocapture %p2) {
 entry:
+  call void @forbid_sroa(ptr addrspace(5) %p1)
+  call void @forbid_sroa(ptr addrspace(5) %p2)
   %tmp1 = load float, ptr addrspace(5) %p1, align 4
   %cmp = fcmp ogt float %tmp1, 1.000000e+00
   br i1 %cmp, label %if.then, label %if.end
@@ -171,6 +173,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare float @_Z3sinf(float) #1
+declare void @forbid_sroa(ptr addrspace(5) nocapture %p)
 
 attributes #0 = { noinline }
 attributes #1 = { nounwind readnone }
diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
--- a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
@@ -1,21 +1,94 @@
-; RUN: opt -mtriple=amdgcn--amdhsa -S -passes=inline -inline-threshold=0 -debug-only=inline-cost < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -passes=inline -inline-threshold=0 -debug-only=inline-cost %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 target datalayout = "A5"
 
 ; Verify we are properly adding cost of the -amdgpu-inline-arg-alloca-cost to the threshold.
+; Awkwardly, the adjusted threshold is multiplied by the single-basic-block bonus.
 
+define void @local_access_only(ptr addrspace(5) %p, i32 %idx) {
+  %arrayidx = getelementptr inbounds [64 x float], ptr addrspace(5) %p, i32 0, i32 %idx
+  %value = load float, ptr addrspace(5) %arrayidx
+  store float %value, ptr addrspace(5) %arrayidx, align 4
+  ret void
+}
+
+; Below the cutoff, the alloca cost is 0 and only the cost of the instructions
+; saved by SROA is counted.
+; CHECK: Analyzing call of local_access_only... (caller:test_inliner_sroa_single_below_cutoff)
+; CHECK: NumAllocaArgs: 1
+; CHECK: SROACostSavings: 10
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_single_below_cutoff(ptr addrspace(1) %a, i32 %n) {
+entry:
+  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
+  call void @local_access_only(ptr addrspace(5) %pvt_arr, i32 4)
+  ret void
+}
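+
+; Note: the Threshold of 66000 checked here and below is consistent with the
+; ArgAllocaCost bonus of 4000 scaled by the threshold multiplier of 11 and a
+; 50% single-basic-block bonus: 4000 * 11 * 3 / 2 = 66000 (an inference from
+; the CHECK values under default options, not a documented formula).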
+
+; Above the cutoff, a cost is attributed to the alloca.
+; CHECK: Analyzing call of local_access_only... (caller:test_inliner_sroa_single_above_cutoff)
 ; CHECK: NumAllocaArgs: 1
+; CHECK: SROACostSavings: 44010
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_single_above_cutoff(ptr addrspace(1) %a, i32 %n) {
+entry:
+  %pvt_arr = alloca [65 x float], align 4, addrspace(5)
+  call void @local_access_only(ptr addrspace(5) %pvt_arr, i32 4)
+  ret void
+}
+
+define void @use_first_externally(ptr addrspace(5) %p1, ptr addrspace(5) %p2) {
+  call void @external(ptr addrspace(5) %p1)
+  %arrayidx = getelementptr inbounds [64 x float], ptr addrspace(5) %p2, i32 0, i32 7
+  %value = load float, ptr addrspace(5) %arrayidx
+  store float %value, ptr addrspace(5) %arrayidx, align 4
+  ret void
+}
+
+define void @use_both_externally(ptr addrspace(5) %p1, ptr addrspace(5) %p2) {
+  call void @external(ptr addrspace(5) %p1)
+  call void @external(ptr addrspace(5) %p2)
+  ret void
+}
+
+; CHECK: Analyzing call of use_first_externally... (caller:test_inliner_sroa_double)
+; CHECK: NumAllocaArgs: 2
+; CHECK: SROACostSavings: 21671
+; CHECK: SROACostSavingsLost: 22338
 ; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_double() {
+entry:
+  %pvt_arr1 = alloca [33 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
+  call void @use_first_externally(ptr addrspace(5) %pvt_arr1, ptr addrspace(5) %pvt_arr2)
+  ret void
+}
 
-define void @use_private_ptr_arg(ptr addrspace(5) nocapture %p) {
+; CHECK: Analyzing call of use_both_externally... (caller:test_inliner_no_sroa)
+; CHECK: NumAllocaArgs: 2
+; CHECK: SROACostSavings: 0
+; CHECK: SROACostSavingsLost: 43999
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_no_sroa() {
+entry:
+  %pvt_arr1 = alloca [33 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
+  call void @use_both_externally(ptr addrspace(5) %pvt_arr1, ptr addrspace(5) %pvt_arr2)
   ret void
 }
 
-define amdgpu_kernel void @test_inliner_pvt_ptr(ptr addrspace(1) nocapture %a, i32 %n) {
+; CHECK: Analyzing call of use_both_externally... (caller:test_inliner_no_alloc)
+; CHECK: NumAllocaArgs: 0
+; CHECK: SROACostSavings: 0
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 0
+define amdgpu_kernel void @test_inliner_no_alloc(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 entry:
-  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
-  call void @use_private_ptr_arg(ptr addrspace(5) %pvt_arr)
+  call void @use_both_externally(ptr addrspace(5) %a, ptr addrspace(5) %b)
   ret void
 }
+
+declare void @external(ptr addrspace(5) %p)
\ No newline at end of file
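
As a cross-check of the CHECK values above, here is a minimal standalone sketch of the proportional bonus that getCallerAllocaCost computes. The constants are assumptions inferred from the tests (ArgAllocaCost = 4000, threshold multiplier = 11, ArgAllocaCutoff = 256 bytes), and the helper name callerAllocaBonus is hypothetical; this models the bonus arithmetic only, not the full inline-cost pipeline.

#include <cstdint>
#include <cstdio>

// Assumed constants, inferred from the CHECK lines above rather than taken
// from authoritative LLVM defaults.
static const uint64_t ArgAllocaCost = 4000;
static const uint64_t ThresholdMultiplier = 11; // getInliningThresholdMultiplier()
static const uint64_t ArgAllocaCutoff = 256;    // bytes

// Hypothetical model of the per-alloca bonus: zero at or below the cutoff
// (adjustInliningThreshold already granted the flat bonus), otherwise a share
// of ArgAllocaCost * multiplier proportional to this alloca's size.
static uint64_t callerAllocaBonus(uint64_t ArgAllocaSize,
                                  uint64_t TotalAllocaSize) {
  if (TotalAllocaSize <= ArgAllocaCutoff)
    return 0;
  return ArgAllocaCost * ThresholdMultiplier * ArgAllocaSize / TotalAllocaSize;
}

int main() {
  // test_inliner_sroa_double: [33 x float] = 132 bytes, [32 x float] = 128 bytes.
  const uint64_t Total = 132 + 128; // 260 bytes, above the cutoff
  std::printf("arr1 bonus: %llu\n",
              (unsigned long long)callerAllocaBonus(132, Total)); // 22338
  std::printf("arr2 bonus: %llu\n",
              (unsigned long long)callerAllocaBonus(128, Total)); // 21661
  return 0;
}

This prints 22338 and 21661. Adding the 10 points of instruction savings for the SROA-able second argument gives the SROACostSavings of 21671 and SROACostSavingsLost of 22338 checked in test_inliner_sroa_double, and the sum 21661 + 22338 = 43999 matches the SROACostSavingsLost checked in test_inliner_no_sroa, where both pointers escape.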