Index: llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
+++ llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.h
@@ -20,6 +20,7 @@
 
 namespace llvm {
 
+class GCNTargetMachine;
 class GCNSubtarget;
 class MachineFunction;
 class TargetMachine;
@@ -71,12 +72,16 @@
     return Info->getSecond();
   }
 
+  const SIFunctionResourceInfo &getWorstCaseResourceInfo(const Module &M);
+
 private:
-  SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF,
-                                              const TargetMachine &TM) const;
-  void propagateIndirectCallRegisterUsage();
+  void computeWorstCaseModuleRegisterUsage(const Module &M);
+
+  SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF);
 
+  const GCNTargetMachine *TM = nullptr;
   DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+  Optional<SIFunctionResourceInfo> ModuleWorstCaseInfo;
 };
 } // namespace llvm
 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURESOURCEUSAGEANALYSIS_H
Index: llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -25,6 +25,7 @@
 
 #include "AMDGPUResourceUsageAnalysis.h"
 #include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
 #include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/Analysis/CallGraph.h"
@@ -102,8 +103,7 @@
   if (!TPC)
     return false;
 
-  const TargetMachine &TM = TPC->getTM<TargetMachine>();
-  bool HasIndirectCall = false;
+  TM = static_cast<const GCNTargetMachine *>(&TPC->getTM<TargetMachine>());
 
   for (CallGraphNode *I : SCC) {
     Function *F = I->getFunction();
@@ -118,19 +118,14 @@
         std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
     SIFunctionResourceInfo &Info = CI.first->second;
     assert(CI.second && "should only be called once per function");
-    Info = analyzeResourceUsage(MF, TM);
-    HasIndirectCall |= Info.HasIndirectCall;
+    Info = analyzeResourceUsage(MF);
   }
 
-  if (HasIndirectCall)
-    propagateIndirectCallRegisterUsage();
-
   return false;
 }
 
 AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
-AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
-    const MachineFunction &MF, const TargetMachine &TM) const {
+AMDGPUResourceUsageAnalysis::analyzeResourceUsage(const MachineFunction &MF) {
   SIFunctionResourceInfo Info;
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -476,9 +471,16 @@
             std::max(CalleeFrameSize,
                      static_cast<uint64_t>(AssumedStackSizeForExternalCall));
 
+        const SIFunctionResourceInfo &WorstCase =
+            getWorstCaseResourceInfo(*MF.getFunction().getParent());
+        MaxSGPR = std::max(WorstCase.NumExplicitSGPR - 1, MaxSGPR);
+        MaxVGPR = std::max(WorstCase.NumVGPR - 1, MaxVGPR);
+        MaxAGPR = std::max(WorstCase.NumAGPR - 1, MaxAGPR);
+
         // Register usage of indirect calls gets handled later
         Info.UsesVCC = true;
-        Info.UsesFlatScratch = ST.hasFlatAddressSpace();
+        Info.UsesFlatScratch |=
+            WorstCase.UsesFlatScratch && ST.hasFlatAddressSpace();
         Info.HasDynamicallySizedStack = true;
         Info.HasIndirectCall = true;
       } else {
@@ -507,31 +509,49 @@
   return Info;
 }
 
-void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
-  // Collect the maximum number of registers from non-hardware-entrypoints.
-  // All these functions are potential targets for indirect calls.
-  int32_t NonKernelMaxSGPRs = 0;
-  int32_t NonKernelMaxVGPRs = 0;
-  int32_t NonKernelMaxAGPRs = 0;
-
-  for (const auto &I : CallGraphResourceInfo) {
-    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
-      auto &Info = I.getSecond();
-      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
-      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
-      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
-    }
-  }
+const AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo &
+AMDGPUResourceUsageAnalysis::getWorstCaseResourceInfo(const Module &M) {
+  if (ModuleWorstCaseInfo)
+    return *ModuleWorstCaseInfo;
+
+  computeWorstCaseModuleRegisterUsage(M);
+  return *ModuleWorstCaseInfo;
+}
+
+/// Find the worst case register usage for all callable functions in the
+/// module, assuming all reachable functions are defined in the current module.
+void AMDGPUResourceUsageAnalysis::computeWorstCaseModuleRegisterUsage(
+    const Module &M) {
+  assert(!ModuleWorstCaseInfo);
+  ModuleWorstCaseInfo = SIFunctionResourceInfo();
+  ModuleWorstCaseInfo->UsesVCC = true;
+  ModuleWorstCaseInfo->HasDynamicallySizedStack = true;
+  ModuleWorstCaseInfo->HasRecursion = true;
+  ModuleWorstCaseInfo->HasIndirectCall = true;
+
+  for (const Function &F : M) {
+    if (F.isIntrinsic())
+      continue;
 
-  // Add register usage for functions with indirect calls.
-  // For calls to unknown functions, we assume the maximum register usage of
-  // all non-hardware-entrypoints in the current module.
-  for (auto &I : CallGraphResourceInfo) {
-    auto &Info = I.getSecond();
-    if (Info.HasIndirectCall) {
-      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
-      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
-      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
+    if (AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+      continue;
+
+    const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+    const int32_t MaxVGPR = ST.getMaxNumVGPRs(F);
+    const int32_t MaxSGPR = ST.getMaxNumSGPRs(F);
+
+    ModuleWorstCaseInfo->NumVGPR =
+        std::max(ModuleWorstCaseInfo->NumVGPR, MaxVGPR);
+
+    if (ST.hasMAIInsts()) {
+      const int32_t MaxAGPR = ST.getMaxNumAGPRs(F);
+      ModuleWorstCaseInfo->NumAGPR =
+          std::max(ModuleWorstCaseInfo->NumAGPR, MaxAGPR);
     }
+
+    ModuleWorstCaseInfo->NumExplicitSGPR =
+        std::max(ModuleWorstCaseInfo->NumExplicitSGPR, MaxSGPR);
+
+    ModuleWorstCaseInfo->UsesFlatScratch |= ST.hasFlatAddressSpace();
   }
 }
Index: llvm/lib/Target/AMDGPU/GCNSubtarget.h
===================================================================
--- llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1110,6 +1110,10 @@
   /// unit requirement.
   unsigned getMaxNumVGPRs(const Function &F) const;
 
+  unsigned getMaxNumAGPRs(const Function &F) const {
+    return getMaxNumVGPRs(F);
+  }
+
  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
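The C++ changes above replace the old two-pass propagation (collect per-function maxima, then patch up callers with indirect calls) with an up-front, per-module worst case built from each callable function's register budget. The standalone sketch below is not part of the patch; it is a toy model with made-up struct fields and numbers that only illustrates the folding order: first take the maximum budget over all non-entry functions, then let every function containing an indirect call assume that worst case.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Toy stand-in for SIFunctionResourceInfo; fields and values are illustrative
// only and do not come from any real subtarget.
struct ToyResourceInfo {
  int32_t NumVGPR = 0;          // registers the function actually uses
  int32_t NumExplicitSGPR = 0;
  int32_t MaxVGPRBudget = 0;    // registers the subtarget would let it use
  int32_t MaxSGPRBudget = 0;
  bool HasIndirectCall = false;
  bool IsEntryFunction = false;
};

int main() {
  // One kernel with an indirect call plus two ordinary callable functions.
  std::vector<ToyResourceInfo> Funcs = {
      {8, 16, 0, 0, /*HasIndirectCall=*/true, /*IsEntryFunction=*/true},
      {24, 32, 128, 102, false, false},
      {40, 48, 256, 102, false, false},
  };

  // Step 1: module-wide worst case over non-entry functions, analogous to
  // computeWorstCaseModuleRegisterUsage(). It folds in the register *budget*,
  // not the registers a function happens to use today.
  ToyResourceInfo WorstCase;
  for (const ToyResourceInfo &F : Funcs) {
    if (F.IsEntryFunction)
      continue;
    WorstCase.NumVGPR = std::max(WorstCase.NumVGPR, F.MaxVGPRBudget);
    WorstCase.NumExplicitSGPR =
        std::max(WorstCase.NumExplicitSGPR, F.MaxSGPRBudget);
  }

  // Step 2: any function containing an indirect call assumes the worst case,
  // mirroring the new code path in analyzeResourceUsage().
  for (ToyResourceInfo &F : Funcs) {
    if (!F.HasIndirectCall)
      continue;
    F.NumVGPR = std::max(F.NumVGPR, WorstCase.NumVGPR);
    F.NumExplicitSGPR = std::max(F.NumExplicitSGPR, WorstCase.NumExplicitSGPR);
  }

  // Prints "kernel reports VGPRs=256 SGPRs=102".
  std::cout << "kernel reports VGPRs=" << Funcs[0].NumVGPR
            << " SGPRs=" << Funcs[0].NumExplicitSGPR << "\n";
  return 0;
}

Against that model, the updated FileCheck expectations below track the callees' register budgets (as constrained by attributes such as "amdgpu-waves-per-eu") rather than the registers they happen to clobber.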
Index: llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
+++ llvm/test/CodeGen/AMDGPU/agpr-register-count.ll
@@ -154,22 +154,23 @@
 declare void @undef_func()
 
 ; GCN-LABEL: {{^}}kernel_call_undef_func:
-; GFX908: .amdhsa_next_free_vgpr 32
-; GFX90A: .amdhsa_next_free_vgpr 64
-; GFX90A: .amdhsa_accum_offset 32
+; GFX908: .amdhsa_next_free_vgpr 128
+; GFX90A: .amdhsa_next_free_vgpr 512
+; GFX90A: .amdhsa_accum_offset 256
 ; GCN908: NumVgprs: 128
+; GCN908: NumAgprs: 128
 ; GCN90A: NumVgprs: 256
-; GCN: NumAgprs: 32
-; GFX908: TotalNumVgprs: 32
-; GFX90A: TotalNumVgprs: 64
-; GFX908: VGPRBlocks: 7
-; GFX90A: VGPRBlocks: 7
-; GFX908: NumVGPRsForWavesPerEU: 32
-; GFX90A: NumVGPRsForWavesPerEU: 64
-; GFX90A: AccumOffset: 32
-; GFX908: Occupancy: 8
-; GFX90A: Occupancy: 8
-; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 7
+; GCN90A: NumAgprs: 256
+; GFX908: TotalNumVgprs: 128
+; GFX90A: TotalNumVgprs: 512
+; GFX908: VGPRBlocks: 31
+; GFX90A: VGPRBlocks: 63
+; GFX908: NumVGPRsForWavesPerEU: 128
+; GFX90A: NumVGPRsForWavesPerEU: 512
+; GFX90A: AccumOffset: 256
+; GFX908: Occupancy: 2
+; GFX90A: Occupancy: 1
+; GFX90A: COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 63
 define amdgpu_kernel void @kernel_call_undef_func() #0 {
 bb:
   call void @undef_func()
Index: llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -144,8 +144,8 @@
 
 ; GCN: amdpal.pipelines:
 ; GCN-NEXT: - .registers:
-; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
-; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ce{{$}}
+; SDAG-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf03cf{{$}}
+; GISEL-NEXT: 0x2e12 (COMPUTE_PGM_RSRC1): 0xaf03cf{{$}}
 ; GCN-NEXT: 0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
 ; GCN-NEXT: .shader_functions:
 ; GCN-NEXT: dynamic_stack:
@@ -178,24 +178,24 @@
 ; GCN-NEXT: .vgpr_count: 0x2{{$}}
 ; GCN-NEXT: no_stack_extern_call:
 ; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x28{{$}}
-; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
+; GFX8-NEXT: .sgpr_count: 0x68{{$}}
+; GFX9-NEXT: .sgpr_count: 0x6c{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
-; GCN-NEXT: .vgpr_count: 0x29{{$}}
+; GCN-NEXT: .vgpr_count: 0x40{{$}}
 ; GCN-NEXT: no_stack_extern_call_many_args:
 ; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x28{{$}}
-; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
+; GFX8-NEXT: .sgpr_count: 0x68{{$}}
+; GFX9-NEXT: .sgpr_count: 0x6c{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}}
-; SDAG-NEXT: .vgpr_count: 0x2a{{$}}
-; GISEL-NEXT: .vgpr_count: 0x34{{$}}
+; SDAG-NEXT: .vgpr_count: 0x40{{$}}
+; GISEL-NEXT: .vgpr_count: 0x40{{$}}
 ; GCN-NEXT: no_stack_indirect_call:
 ; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x28{{$}}
-; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
+; GFX8-NEXT: .sgpr_count: 0x68{{$}}
+; GFX9-NEXT: .sgpr_count: 0x6c{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}}
-; SDAG-NEXT: .vgpr_count: 0x2a{{$}}
-; GISEL-NEXT: .vgpr_count: 0x34{{$}}
+; SDAG-NEXT: .vgpr_count: 0x40{{$}}
+; GISEL-NEXT: .vgpr_count: 0x40{{$}}
 ; GCN-NEXT: simple_lds:
 ; GCN-NEXT: .lds_size: 0x100{{$}}
 ; GCN-NEXT: .sgpr_count: 0x20{{$}}
@@ -218,17 +218,17 @@
 ; GCN-NEXT: .vgpr_count: 0x3{{$}}
 ; GCN-NEXT: simple_stack_extern_call:
 ; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x28{{$}}
-; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
+; GFX8-NEXT: .sgpr_count: 0x68{{$}}
+; GFX9-NEXT: .sgpr_count: 0x6c{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
-; GCN-NEXT: .vgpr_count: 0x2a{{$}}
+; GCN-NEXT: .vgpr_count: 0x40{{$}}
 ; GCN-NEXT: simple_stack_indirect_call:
 ; GCN-NEXT: .lds_size: 0{{$}}
-; GFX8-NEXT: .sgpr_count: 0x28{{$}}
-; GFX9-NEXT: .sgpr_count: 0x2c{{$}}
+; GFX8-NEXT: .sgpr_count: 0x68{{$}}
+; GFX9-NEXT: .sgpr_count: 0x6c{{$}}
 ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}}
-; SDAG-NEXT: .vgpr_count: 0x2b{{$}}
-; GISEL-NEXT: .vgpr_count: 0x34{{$}}
+; SDAG-NEXT: .vgpr_count: 0x40{{$}}
+; GISEL-NEXT: .vgpr_count: 0x40{{$}}
 ; GCN-NEXT: simple_stack_recurse:
 ; GCN-NEXT: .lds_size: 0{{$}}
 ; GCN-NEXT: .sgpr_count: 0x26{{$}}
Index: llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
+++ llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size-vgpr-limit.ll
@@ -556,9 +556,9 @@
 
 ; GCN-LABEL: {{^}}f1024:
 ; GFX9: NumVgprs: 64
-; GFX90A: NumVgprs: 64
-; GFX90A: NumAgprs: 64
-; GFX90A: TotalNumVgprs: 128
+; GFX90A: NumVgprs: 128
+; GFX90A: NumAgprs: 128
+; GFX90A: TotalNumVgprs: 256
 ; GFX10WGP-WAVE32: NumVgprs: 128
 ; GFX10WGP-WAVE64: NumVgprs: 128
 ; GFX10CU-WAVE32: NumVgprs: 64
Index: llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll
@@ -0,0 +1,31 @@
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=ALL,GFX908 %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=ALL,GFX90A %s
+
+; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
+; through aliases. If GlobalOpt is never run, we do not see direct
+; calls.
+
+@alias = hidden alias void (), void ()* @aliasee_default
+
+; ALL-LABEL: {{^}}kernel:
+; GFX908: .amdhsa_next_free_vgpr 64
+; GFX908-NEXT: .amdhsa_next_free_sgpr 102
+
+; GFX90A: .amdhsa_next_free_vgpr 256
+; GFX90A-NEXT: .amdhsa_next_free_sgpr 102
+; GFX90A-NEXT: .amdhsa_accum_offset 128
+define amdgpu_kernel void @kernel() #0 {
+bb:
+  call void @alias() #2
+  ret void
+}
+
+define internal void @aliasee_default() #1 {
+bb:
+  call void asm sideeffect "; clobber a26 ", "~{a26}"()
+  ret void
+}
+
+attributes #0 = { noinline norecurse nounwind optnone }
+attributes #1 = { noinline norecurse nounwind readnone willreturn }
+attributes #2 = { nounwind readnone willreturn }
Index: llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll
@@ -0,0 +1,26 @@
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
+; through aliases. If GlobalOpt is never run, we do not see direct
+; calls.
+
+@alias0 = hidden alias void (), void ()* @aliasee_default_vgpr64_sgpr102
+
+; CHECK-LABEL: {{^}}kernel0:
+; CHECK: .amdhsa_next_free_vgpr 64
+; CHECK-NEXT: .amdhsa_next_free_sgpr 102
+define amdgpu_kernel void @kernel0() #0 {
+bb:
+  call void @alias0() #2
+  ret void
+}
+
+define internal void @aliasee_default_vgpr64_sgpr102() #1 {
+bb:
+  call void asm sideeffect "; clobber v52 ", "~{v52}"()
+  ret void
+}
+
+attributes #0 = { noinline norecurse nounwind optnone }
+attributes #1 = { noinline norecurse nounwind readnone willreturn }
+attributes #2 = { nounwind readnone willreturn }
Index: llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll
@@ -0,0 +1,29 @@
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
+; through aliases. If GlobalOpt is never run, we do not see direct
+; calls.
+
+@alias1 = hidden alias void (), void ()* @aliasee_vgpr32_sgpr76
+
+; The parent kernel has a higher VGPR usage than the possible callees.
+
+; CHECK-LABEL: {{^}}kernel1:
+; CHECK: .amdhsa_next_free_vgpr 42
+; CHECK-NEXT: .amdhsa_next_free_sgpr 74
+define amdgpu_kernel void @kernel1() #0 {
+bb:
+  call void asm sideeffect "; clobber v40 ", "~{v40}"()
+  call void @alias1() #2
+  ret void
+}
+
+define internal void @aliasee_vgpr32_sgpr76() #1 {
+bb:
+  call void asm sideeffect "; clobber v26 ", "~{v26}"()
+  ret void
+}
+
+attributes #0 = { noinline norecurse nounwind optnone }
+attributes #1 = { noinline norecurse nounwind readnone willreturn "amdgpu-waves-per-eu"="8,10" }
+attributes #2 = { nounwind readnone willreturn }
Index: llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll
@@ -0,0 +1,26 @@
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
+; through aliases. If GlobalOpt is never run, we do not see direct
+; calls.
+
+@alias2 = hidden alias void (), void ()* @aliasee_vgpr64_sgpr102
+
+; CHECK-LABEL: {{^}}kernel2:
+; CHECK: .amdhsa_next_free_vgpr 64
+; CHECK-NEXT: .amdhsa_next_free_sgpr 102
+define amdgpu_kernel void @kernel2() #0 {
+bb:
+  call void @alias2() #2
+  ret void
+}
+
+define internal void @aliasee_vgpr64_sgpr102() #1 {
+bb:
+  call void asm sideeffect "; clobber v52 ", "~{v52}"()
+  ret void
+}
+
+attributes #0 = { noinline norecurse nounwind optnone }
+attributes #1 = { noinline norecurse nounwind readnone willreturn "amdgpu-waves-per-eu"="4,10" }
+attributes #2 = { nounwind readnone willreturn }
Index: llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll
@@ -0,0 +1,26 @@
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+
+; CallGraphAnalysis, which CodeGenSCC order depends on, does not look
+; through aliases. If GlobalOpt is never run, we do not see direct
+; calls.
+
+@alias3 = hidden alias void (), void ()* @aliasee_vgpr256_sgpr102
+
+; CHECK-LABEL: {{^}}kernel3:
+; CHECK: .amdhsa_next_free_vgpr 256
+; CHECK-NEXT: .amdhsa_next_free_sgpr 102
+define amdgpu_kernel void @kernel3() #0 {
+bb:
+  call void @alias3() #2
+  ret void
+}
+
+define internal void @aliasee_vgpr256_sgpr102() #1 {
+bb:
+  call void asm sideeffect "; clobber v252 ", "~{v252}"()
+  ret void
+}
+
+attributes #0 = { noinline norecurse nounwind optnone }
+attributes #1 = { noinline norecurse nounwind readnone willreturn "amdgpu-flat-work-group-size"="1,256" "amdgpu-waves-per-eu"="1,1" }
+attributes #2 = { nounwind readnone willreturn }
Index: llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
+++ llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll
@@ -227,10 +227,10 @@
 ; Make sure there's no assert when a sgpr96 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr96_external_call
 ; GCN: ; sgpr96 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; CI: NumSgprs: 104
+; VI-NOBUG: NumSgprs: 108
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: 64
 define amdgpu_kernel void @count_use_sgpr96_external_call() {
 entry:
   tail call void asm sideeffect "; sgpr96 $0", "s"(<3 x i32> ) #1
@@ -241,10 +241,10 @@
 ; Make sure there's no assert when a sgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_sgpr160_external_call
 ; GCN: ; sgpr160 s[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; CI: NumSgprs: 104
+; VI-NOBUG: NumSgprs: 108
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: 64
 define amdgpu_kernel void @count_use_sgpr160_external_call() {
 entry:
   tail call void asm sideeffect "; sgpr160 $0", "s"(<5 x i32> ) #1
@@ -255,10 +255,10 @@
 ; Make sure there's no assert when a vgpr160 is used.
 ; GCN-LABEL: {{^}}count_use_vgpr160_external_call
 ; GCN: ; vgpr160 v[{{[0-9]+}}:{{[0-9]+}}]
-; CI: NumSgprs: 84
-; VI-NOBUG: NumSgprs: 86
+; CI: NumSgprs: 104
+; VI-NOBUG: NumSgprs: 108
 ; VI-BUG: NumSgprs: 96
-; GCN: NumVgprs: 50
+; GCN: NumVgprs: 64
 define amdgpu_kernel void @count_use_vgpr160_external_call() {
 entry:
   tail call void asm sideeffect "; vgpr160 $0", "v"(<5 x i32> ) #1
Index: llvm/test/CodeGen/AMDGPU/indirect-call.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -16,8 +16,8 @@
 ; GCN-NEXT: amd_machine_version_stepping = 0
 ; GCN-NEXT: kernel_code_entry_byte_offset = 256
 ; GCN-NEXT: kernel_code_prefetch_byte_size = 0
-; GCN-NEXT: granulated_workitem_vgpr_count = 7
-; GCN-NEXT: granulated_wavefront_sgpr_count = 4
+; GCN-NEXT: granulated_workitem_vgpr_count = 15
+; GCN-NEXT: granulated_wavefront_sgpr_count = 12
 ; GCN-NEXT: priority = 0
 ; GCN-NEXT: float_mode = 240
 ; GCN-NEXT: priv = 0
@@ -60,8 +60,8 @@
 ; GCN-NEXT: gds_segment_byte_size = 0
 ; GCN-NEXT: kernarg_segment_byte_size = 64
 ; GCN-NEXT: workgroup_fbarrier_count = 0
-; GCN-NEXT: wavefront_sgpr_count = 37
-; GCN-NEXT: workitem_vgpr_count = 32
+; GCN-NEXT: wavefront_sgpr_count = 104
+; GCN-NEXT: workitem_vgpr_count = 64
 ; GCN-NEXT: reserved_vgpr_first = 0
 ; GCN-NEXT: reserved_vgpr_count = 0
 ; GCN-NEXT: reserved_sgpr_first = 0
@@ -109,8 +109,8 @@
 ; GISEL-NEXT: amd_machine_version_stepping = 0
 ; GISEL-NEXT: kernel_code_entry_byte_offset = 256
 ; GISEL-NEXT: kernel_code_prefetch_byte_size = 0
-; GISEL-NEXT: granulated_workitem_vgpr_count = 7
-; GISEL-NEXT: granulated_wavefront_sgpr_count = 4
+; GISEL-NEXT: granulated_workitem_vgpr_count = 15
+; GISEL-NEXT: granulated_wavefront_sgpr_count = 12
 ; GISEL-NEXT: priority = 0
 ; GISEL-NEXT: float_mode = 240
 ; GISEL-NEXT: priv = 0
@@ -153,8 +153,8 @@
 ; GISEL-NEXT: gds_segment_byte_size = 0
 ; GISEL-NEXT: kernarg_segment_byte_size = 64
 ; GISEL-NEXT: workgroup_fbarrier_count = 0
-; GISEL-NEXT: wavefront_sgpr_count = 37
-; GISEL-NEXT: workitem_vgpr_count = 32
+; GISEL-NEXT: wavefront_sgpr_count = 104
+; GISEL-NEXT: workitem_vgpr_count = 64
 ; GISEL-NEXT: reserved_vgpr_first = 0
 ; GISEL-NEXT: reserved_vgpr_count = 0
 ; GISEL-NEXT: reserved_sgpr_first = 0
@@ -207,8 +207,8 @@
 ; GCN-NEXT: amd_machine_version_stepping = 0
 ; GCN-NEXT: kernel_code_entry_byte_offset = 256
 ; GCN-NEXT: kernel_code_prefetch_byte_size = 0
-; GCN-NEXT: granulated_workitem_vgpr_count = 7
-; GCN-NEXT: granulated_wavefront_sgpr_count = 4
+; GCN-NEXT: granulated_workitem_vgpr_count = 15
+; GCN-NEXT: granulated_wavefront_sgpr_count = 12
 ; GCN-NEXT: priority = 0
 ; GCN-NEXT: float_mode = 240
 ; GCN-NEXT: priv = 0
@@ -251,8 +251,8 @@
 ; GCN-NEXT: gds_segment_byte_size = 0
 ; GCN-NEXT: kernarg_segment_byte_size = 64
 ; GCN-NEXT: workgroup_fbarrier_count = 0
-; GCN-NEXT: wavefront_sgpr_count = 37
-; GCN-NEXT: workitem_vgpr_count = 32
+; GCN-NEXT: wavefront_sgpr_count = 104
+; GCN-NEXT: workitem_vgpr_count = 64
 ; GCN-NEXT: reserved_vgpr_first = 0
 ; GCN-NEXT: reserved_vgpr_count = 0
 ; GCN-NEXT: reserved_sgpr_first = 0
@@ -301,8 +301,8 @@
 ; GISEL-NEXT: amd_machine_version_stepping = 0
 ; GISEL-NEXT: kernel_code_entry_byte_offset = 256
 ; GISEL-NEXT: kernel_code_prefetch_byte_size = 0
-; GISEL-NEXT: granulated_workitem_vgpr_count = 7
-; GISEL-NEXT: granulated_wavefront_sgpr_count = 4
+; GISEL-NEXT: granulated_workitem_vgpr_count = 15
+; GISEL-NEXT: granulated_wavefront_sgpr_count = 12
 ; GISEL-NEXT: priority = 0
 ; GISEL-NEXT: float_mode = 240
 ; GISEL-NEXT: priv = 0
@@ -345,8 +345,8 @@
 ; GISEL-NEXT: gds_segment_byte_size = 0
 ; GISEL-NEXT: kernarg_segment_byte_size = 64
 ; GISEL-NEXT: workgroup_fbarrier_count = 0
-; GISEL-NEXT: wavefront_sgpr_count = 37
-; GISEL-NEXT: workitem_vgpr_count = 32
+; GISEL-NEXT: wavefront_sgpr_count = 104
+; GISEL-NEXT: workitem_vgpr_count = 64
 ; GISEL-NEXT: reserved_vgpr_first = 0
 ; GISEL-NEXT: reserved_vgpr_count = 0
 ; GISEL-NEXT: reserved_sgpr_first = 0