diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -39,6 +39,7 @@
 typedef unsigned ID;
 }
 
+class AllocaInst;
 class AssumptionCache;
 class BlockFrequencyInfo;
 class DominatorTree;
@@ -344,6 +345,10 @@
   /// \returns A value to be added to the inlining threshold.
   unsigned adjustInliningThreshold(const CallBase *CB) const;
 
+  /// \returns The cost of having an Alloca in the caller if not inlined, to be
+  /// added to the threshold.
+  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;
+
   /// \returns Vector bonus in percent.
   ///
   /// Vector bonuses: We want to more aggressively inline vector-dense kernels
@@ -1671,9 +1676,11 @@
   getPointersChainCost(ArrayRef<const Value *> Ptrs, const Value *Base,
                        const TTI::PointersChainInfo &Info,
                        TTI::TargetCostKind CostKind) = 0;
-  virtual unsigned getInliningThresholdMultiplier() = 0;
+  virtual unsigned getInliningThresholdMultiplier() const = 0;
   virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
-  virtual int getInlinerVectorBonusPercent() = 0;
+  virtual int getInlinerVectorBonusPercent() const = 0;
+  virtual unsigned getCallerAllocaCost(const CallBase *CB,
+                                       const AllocaInst *AI) const = 0;
   virtual InstructionCost getMemcpyCost(const Instruction *I) = 0;
   virtual unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                                     unsigned &JTSize,
@@ -2033,15 +2040,19 @@
                        TargetCostKind CostKind) override {
     return Impl.getPointersChainCost(Ptrs, Base, Info, CostKind);
   }
-  unsigned getInliningThresholdMultiplier() override {
+  unsigned getInliningThresholdMultiplier() const override {
     return Impl.getInliningThresholdMultiplier();
   }
   unsigned adjustInliningThreshold(const CallBase *CB) override {
     return Impl.adjustInliningThreshold(CB);
-  }
-  int getInlinerVectorBonusPercent() override {
+  }
+  int getInlinerVectorBonusPercent() const override {
     return Impl.getInlinerVectorBonusPercent();
   }
+  unsigned getCallerAllocaCost(const CallBase *CB,
+                               const AllocaInst *AI) const override {
+    return Impl.getCallerAllocaCost(CB, AI);
+  }
   InstructionCost getMemcpyCost(const Instruction *I) override {
     return Impl.getMemcpyCost(I);
   }
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -70,6 +70,9 @@
   unsigned getInliningThresholdMultiplier() const { return 1; }
   unsigned adjustInliningThreshold(const CallBase *CB) const { return 0; }
+  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
+    return 0;
+  }
 
   int getInlinerVectorBonusPercent() const { return 150; }
 
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -532,10 +532,13 @@
     return TargetTransformInfo::TCC_Expensive;
   }
 
-  unsigned getInliningThresholdMultiplier() { return 1; }
+  unsigned getInliningThresholdMultiplier() const { return 1; }
   unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
+  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
+    return 0;
+  }
 
-  int getInlinerVectorBonusPercent() { return 150; }
+  int getInlinerVectorBonusPercent() const { return 150; }
 
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP,
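Note: the new hook follows the usual three-layer TTI pattern: a public wrapper
on TargetTransformInfo, a pure-virtual Concept/Model pair, and zero-returning
defaults in TargetTransformInfoImpl.h and BasicTTIImpl.h, so targets that do
not override it are unaffected. A minimal sketch of how a target could opt in
(MyTTIImpl is a hypothetical target implementation, not part of this patch):

    // Hypothetical override: credit each caller alloca passed to the callee
    // with a cost proportional to its size, so the inliner's SROA savings
    // grow with the alloca. DL is the usual cached DataLayout member.
    unsigned MyTTIImpl::getCallerAllocaCost(const CallBase *CB,
                                            const AllocaInst *AI) const {
      return DL.getTypeAllocSize(AI->getAllocatedType());
    }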
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -717,7 +717,8 @@
   void onInitializeSROAArg(AllocaInst *Arg) override {
     assert(Arg != nullptr &&
            "Should not initialize SROA costs for null value.");
-    SROAArgCosts[Arg] = 0;
+    SROACostSavings += SROAArgCosts[Arg] =
+        TTI.getCallerAllocaCost(&CandidateCall, Arg);
   }
 
   void onAggregateSROAUse(AllocaInst *SROAArg) override {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -217,6 +217,11 @@
   return TTIImpl->adjustInliningThreshold(CB);
 }
 
+unsigned TargetTransformInfo::getCallerAllocaCost(const CallBase *CB,
+                                                  const AllocaInst *AI) const {
+  return TTIImpl->getCallerAllocaCost(CB, AI);
+}
+
 int TargetTransformInfo::getInlinerVectorBonusPercent() const {
   return TTIImpl->getInlinerVectorBonusPercent();
 }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -230,10 +230,11 @@
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
 
-  unsigned getInliningThresholdMultiplier() { return 11; }
+  unsigned getInliningThresholdMultiplier() const { return 11; }
   unsigned adjustInliningThreshold(const CallBase *CB) const;
+  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;
 
-  int getInlinerVectorBonusPercent() { return 0; }
+  int getInlinerVectorBonusPercent() const { return 0; }
 
   InstructionCost getArithmeticReductionCost(
       unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1206,11 +1206,11 @@
   return adjustThreshold;
 }
 
-unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+static size_t getTotalAllocaSize(const CallBase *CB, const DataLayout &DL) {
   // If we have a pointer to private array passed into a function
   // it will not be optimized out, leaving scratch usage.
   // Increase the inline threshold to allow inlining in this case.
-  unsigned adjustThreshold = 0;
+  // The bigger the private array, the bigger the bonus.
   uint64_t AllocaSize = 0;
   SmallPtrSet<const AllocaInst *, 8> AIVisited;
   for (Value *PtrArg : CB->args()) {
@@ -1224,18 +1224,40 @@
       if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
         continue;
       AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
-      // If the amount of stack memory is excessive we will not be able
-      // to get rid of the scratch anyway, bail out.
-      if (AllocaSize > ArgAllocaCutoff) {
-        AllocaSize = 0;
-        break;
-      }
     }
   }
-  adjustThreshold +=
-      adjustInliningThresholdUsingCallee(CB, TLI, this);
-  adjustThreshold += AllocaSize ? ArgAllocaCost : AllocaSize;
-  return adjustThreshold;
+  return AllocaSize;
+}
+
+unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+  unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
+  size_t AllocaSize = getTotalAllocaSize(CB, DL);
+  if (AllocaSize > 0)
+    Threshold += ArgAllocaCost;
+  return Threshold;
+}
+
+unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
+                                         const AllocaInst *AI) const {
+  auto AllocaSize = getTotalAllocaSize(CB, DL);
+  if (AllocaSize <= ArgAllocaCutoff) {
+    // The inline threshold bonus has already been given by
+    // adjustInliningThreshold.
+    return 0;
+  }
+
+  // Give an inline threshold bonus depending on the size of the alloca that
+  // is being optimized by SROA. The bigger the array, the better the chance
+  // we have of avoiding scratch memory.
+  auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
+  unsigned ThresholdBonus =
+      (ArgAllocaCost * getInliningThresholdMultiplier() * ArgAllocaSize) /
+      AllocaSize;
+
+  // Awkwardly, this bonus gets multiplied by the single-BB bonus and the
+  // vector bonus.
+  return ThresholdBonus;
 }
 
 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
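Note: in GCNTTIImpl::getCallerAllocaCost above, each alloca earns its
proportional share of ArgAllocaCost, scaled by the inlining threshold
multiplier. A self-contained worked example, assuming the options' current
defaults (-amdgpu-inline-arg-alloca-cost=4000, GCN multiplier 11); the helper
name is illustrative, not part of the patch:

    #include <cstdint>

    // Bonus for one alloca of ArgAllocaSize bytes when the call passes
    // TotalAllocaSize bytes of static allocas in total (above the cutoff).
    uint64_t callerAllocaBonus(uint64_t ArgAllocaSize,
                               uint64_t TotalAllocaSize) {
      return (4000 * 11 * ArgAllocaSize) / TotalAllocaSize;
    }
    // callerAllocaBonus(132, 260) == 22338 and
    // callerAllocaBonus(128, 260) == 21661, matching the SROACostSavings and
    // SROACostSavingsLost values in the new test below.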
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -92,7 +92,7 @@
 
   // Increase the inlining cost threshold by a factor of 11, reflecting that
   // calls are particularly expensive in NVPTX.
-  unsigned getInliningThresholdMultiplier() { return 11; }
+  unsigned getInliningThresholdMultiplier() const { return 11; }
 
   InstructionCost getArithmeticInstrCost(
       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -36,7 +36,7 @@
   /// \name Scalar TTI Implementations
   /// @{
 
-  unsigned getInliningThresholdMultiplier() { return 3; }
+  unsigned getInliningThresholdMultiplier() const { return 3; }
   unsigned adjustInliningThreshold(const CallBase *CB) const;
 
   InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-inline.ll
@@ -30,6 +30,8 @@
 
 define coldcc void @foo_private_ptr2(ptr addrspace(5) nocapture %p1, ptr addrspace(5) nocapture %p2) {
 entry:
+  call void @forbid_sroa(ptr addrspace(5) %p1)
+  call void @forbid_sroa(ptr addrspace(5) %p2)
   %tmp1 = load float, ptr addrspace(5) %p1, align 4
   %cmp = fcmp ogt float %tmp1, 1.000000e+00
   br i1 %cmp, label %if.then, label %if.end
@@ -171,6 +173,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare float @_Z3sinf(float) #1
+declare void @forbid_sroa(ptr addrspace(5) nocapture %p)
 
 attributes #0 = { noinline }
 attributes #1 = { nounwind readnone }
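Note: the @forbid_sroa calls added to amdgpu-inline.ll preserve that test's
original costs: passing the private pointers to a declared-but-undefined
function makes them escape, so the inline-cost SROA analysis stops treating
the corresponding allocas as promotable and the new per-alloca bonus does not
apply. A simplified sketch of the onDisableSROA bookkeeping this relies on
(member names abbreviated from the actual InlineCost.cpp handler):

    // Once a tracked pointer escapes, its accumulated savings move from
    // SROACostSavings to SROACostSavingsLost and the alloca is dropped.
    void disableSROAForArg(AllocaInst *AI) {
      SROACostSavings -= SROAArgCosts[AI];
      SROACostSavingsLost += SROAArgCosts[AI];
      SROAArgCosts.erase(AI);
    }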
diff --git a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
--- a/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
+++ b/llvm/test/Transforms/Inline/AMDGPU/amdgpu-inline-alloca-argument-cost.ll
@@ -1,21 +1,94 @@
-; RUN: opt -mtriple=amdgcn--amdhsa -S -passes=inline -inline-threshold=0 -debug-only=inline-cost < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=amdgcn--amdhsa -S -passes=inline -inline-threshold=0 -debug-only=inline-cost %s 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 target datalayout = "A5"
 
 ; Verify we are properly adding cost of the -amdgpu-inline-arg-alloca-cost to the threshold.
+; Awkwardly, the adjusted threshold is multiplied by the single-basic-block bonus.
+define void @local_access_only(ptr addrspace(5) %p, i32 %idx) {
+  %arrayidx = getelementptr inbounds [64 x float], ptr addrspace(5) %p, i32 0, i32 %idx
+  %value = load float, ptr addrspace(5) %arrayidx
+  store float %value, ptr addrspace(5) %arrayidx, align 4
+  ret void
+}
+
+; Below the cutoff, the alloca cost is 0 and only the cost of the instructions saved by SROA is counted.
+; CHECK: Analyzing call of local_access_only... (caller:test_inliner_sroa_single_below_cutoff)
+; CHECK: NumAllocaArgs: 1
+; CHECK: SROACostSavings: 10
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_single_below_cutoff(ptr addrspace(1) %a, i32 %n) {
+entry:
+  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
+  call void @local_access_only(ptr addrspace(5) %pvt_arr, i32 4)
+  ret void
+}
+
+; Above the cutoff, a cost is attributed to the alloca.
+; CHECK: Analyzing call of local_access_only... (caller:test_inliner_sroa_single_above_cutoff)
 ; CHECK: NumAllocaArgs: 1
+; CHECK: SROACostSavings: 44010
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_single_above_cutoff(ptr addrspace(1) %a, i32 %n) {
+entry:
+  %pvt_arr = alloca [65 x float], align 4, addrspace(5)
+  call void @local_access_only(ptr addrspace(5) %pvt_arr, i32 4)
+  ret void
+}
+
+define void @use_first_externally(ptr addrspace(5) %p1, ptr addrspace(5) %p2) {
+  call void @external(ptr addrspace(5) %p1)
+  %arrayidx = getelementptr inbounds [64 x float], ptr addrspace(5) %p2, i32 0, i32 7
+  %value = load float, ptr addrspace(5) %arrayidx
+  store float %value, ptr addrspace(5) %arrayidx, align 4
+  ret void
+}
+
+define void @use_both_externally(ptr addrspace(5) %p1, ptr addrspace(5) %p2) {
+  call void @external(ptr addrspace(5) %p1)
+  call void @external(ptr addrspace(5) %p2)
+  ret void
+}
+
+; CHECK: Analyzing call of use_first_externally... (caller:test_inliner_sroa_double)
+; CHECK: NumAllocaArgs: 2
+; CHECK: SROACostSavings: 21671
+; CHECK: SROACostSavingsLost: 22338
 ; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_sroa_double() {
+entry:
+  %pvt_arr1 = alloca [33 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
+  call void @use_first_externally(ptr addrspace(5) %pvt_arr1, ptr addrspace(5) %pvt_arr2)
+  ret void
+}
 
-define void @use_private_ptr_arg(ptr addrspace(5) nocapture %p) {
+; CHECK: Analyzing call of use_both_externally... (caller:test_inliner_no_sroa)
+; CHECK: NumAllocaArgs: 2
+; CHECK: SROACostSavings: 0
+; CHECK: SROACostSavingsLost: 43999
+; CHECK: Threshold: 66000
+define amdgpu_kernel void @test_inliner_no_sroa() {
+entry:
+  %pvt_arr1 = alloca [33 x float], align 4, addrspace(5)
+  %pvt_arr2 = alloca [32 x float], align 4, addrspace(5)
+  call void @use_both_externally(ptr addrspace(5) %pvt_arr1, ptr addrspace(5) %pvt_arr2)
   ret void
 }
 
-define amdgpu_kernel void @test_inliner_pvt_ptr(ptr addrspace(1) nocapture %a, i32 %n) {
+; CHECK: Analyzing call of use_both_externally... (caller:test_inliner_no_alloc)
+; CHECK: NumAllocaArgs: 0
+; CHECK: SROACostSavings: 0
+; CHECK: SROACostSavingsLost: 0
+; CHECK: Threshold: 0
+define amdgpu_kernel void @test_inliner_no_alloc(ptr addrspace(5) %a, ptr addrspace(5) %b) {
 entry:
-  %pvt_arr = alloca [64 x float], align 4, addrspace(5)
-  call void @use_private_ptr_arg(ptr addrspace(5) %pvt_arr)
+  call void @use_both_externally(ptr addrspace(5) %a, ptr addrspace(5) %b)
   ret void
 }
+
+declare void @external(ptr addrspace(5) %p)
\ No newline at end of file
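Note: the CHECK values in this test can be reproduced by hand, assuming the
defaults -amdgpu-inline-arg-alloca-cost=4000 and
-amdgpu-inline-arg-alloca-cutoff=256, the GCN threshold multiplier of 11, the
inliner's 50% single-basic-block bonus, and the default instruction cost of 5
(the constants below restate those assumptions; they are not computed by the
patch in this form):

    // Threshold: (-inline-threshold=0 + ArgAllocaCost) * 11, plus the 50%
    // single-basic-block bonus, as the test comment notes.
    unsigned Threshold = (0 + 4000) * 11 * 3 / 2;              // 66000
    // Above the cutoff, a lone 260-byte alloca ([65 x float]) takes the whole
    // bonus, plus two simplified instructions (load and store) at cost 5 each.
    unsigned SavingsSingle = (4000 * 11 * 260) / 260 + 2 * 5;  // 44010
    // When both arrays (132 and 128 bytes) escape, all savings are lost:
    unsigned LostBoth = (4000 * 11 * 132) / 260 +              // 22338
                        (4000 * 11 * 128) / 260;               // + 21661 = 43999

The no-alloca kernel gets Threshold: 0 because adjustInliningThreshold adds
nothing, and a 50% bonus on zero is still zero.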