diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -446,7 +446,7 @@
     OutStreamer->SwitchSection(ConfigSection);
   }
 
-  if (MFI->isEntryFunction()) {
+  if (MFI->isModuleEntryFunction()) {
     getSIProgramInfo(CurrentProgramInfo, MF);
   } else {
     auto I = CallGraphResourceInfo.insert(
@@ -459,7 +459,7 @@
   if (STM.isAmdPalOS()) {
     if (MFI->isEntryFunction())
       EmitPALMetadata(MF, CurrentProgramInfo);
-    else
+    else if (MFI->isModuleEntryFunction())
       emitPALFunctionMetadata(MF);
   } else if (!STM.isAmdHsaOS()) {
     EmitProgramInfoSI(MF, CurrentProgramInfo);
@@ -922,7 +922,22 @@
           = TII->getNamedOperand(MI, AMDGPU::OpName::callee);
 
         const Function *Callee = getCalleeFunction(*CalleeOp);
-        if (!Callee || Callee->isDeclaration()) {
+        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
+            CallGraphResourceInfo.end();
+        bool IsExternal = !Callee || Callee->isDeclaration();
+        if (!IsExternal)
+          I = CallGraphResourceInfo.find(Callee);
+
+        if (IsExternal || I == CallGraphResourceInfo.end()) {
+          // Avoid crashing on undefined behavior with an illegal call to a
+          // kernel. If a callsite's calling convention doesn't match the
+          // function's, it's undefined behavior. If the callsite calling
+          // convention does match, that would have errored earlier.
+          // FIXME: The verifier shouldn't allow this.
+          if (!IsExternal &&
+              AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
+            report_fatal_error("invalid call to entry function");
+
           // If this is a call to an external function, we can't do much. Make
           // conservative guesses.
 
@@ -943,19 +958,6 @@
           // We force CodeGen to run in SCC order, so the callee's register
           // usage etc. should be the cumulative usage of all callees.
 
-          auto I = CallGraphResourceInfo.find(Callee);
-          if (I == CallGraphResourceInfo.end()) {
-            // Avoid crashing on undefined behavior with an illegal call to a
-            // kernel. If a callsite's calling convention doesn't match the
-            // function's, it's undefined behavior. If the callsite calling
-            // convention does match, that would have errored earlier.
-            // FIXME: The verifier shouldn't allow this.
-            if (AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
-              report_fatal_error("invalid call to entry function");
-
-            llvm_unreachable("callee should have been handled before caller");
-          }
-
           MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
           MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
           MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
@@ -1266,7 +1268,11 @@
 void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
   auto *MD = getTargetStreamer()->getPALMetadata();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-  MD->setStackFrameSize(MF, MFI.getStackSize());
+  MD->setFunctionScratchSize(MF, MFI.getStackSize());
+  // Set compute registers
+  MD->setRsrc1(CallingConv::AMDGPU_CS,
+               CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
+  MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2);
 }
 
 // This is supposed to be log2(Size)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1301,7 +1301,7 @@
 
   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
-    if (!MFI->isEntryFunction()) {
+    if (!MFI->isModuleEntryFunction()) {
       SDLoc DL(Op);
       const Function &Fn = DAG.getMachineFunction().getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2260,7 +2260,7 @@
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
-    if (!MFI->isEntryFunction()) {
+    if (!MFI->isModuleEntryFunction()) {
       const Function &Fn = MF.getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
         Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -48,6 +48,9 @@
   // by other functions.
   bool IsEntryFunction = false;
 
+  // Entry points called by other functions instead of the driver.
+  bool IsModuleEntryFunction = false;
+
   bool NoSignedZerosFPMath = false;
 
   // Function may be memory bound.
@@ -77,6 +80,8 @@
     return IsEntryFunction;
   }
 
+  bool isModuleEntryFunction() const { return IsModuleEntryFunction; }
+
   bool hasNoSignedZerosFPMath() const {
     return NoSignedZerosFPMath;
   }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -13,11 +13,13 @@
 
 using namespace llvm;
 
-AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
-  MachineFunctionInfo(),
-  Mode(MF.getFunction()),
-  IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
-  NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
+AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF)
+    : MachineFunctionInfo(), Mode(MF.getFunction()),
+      IsEntryFunction(
+          AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
+      IsModuleEntryFunction(
+          AMDGPU::isModuleEntryFunctionCC(MF.getFunction().getCallingConv())),
+      NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
 
   // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -576,6 +576,15 @@
 LLVM_READNONE
 bool isEntryFunctionCC(CallingConv::ID CC);
 
+// These functions are considered entrypoints into the current module, i.e. they
+// are allowed to be called from outside the current module. This is different
+// from isEntryFunctionCC, which is only true for functions that are entered by
+// the hardware. Module entry points include all entry functions but also
+// include functions that can be called from other functions inside or outside
+// the current module. Module entry functions are allowed to allocate LDS.
+LLVM_READNONE
+bool isModuleEntryFunctionCC(CallingConv::ID CC);
+
 // FIXME: Remove this when calling conventions cleaned up
 LLVM_READNONE
 inline bool isKernel(CallingConv::ID CC) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1068,6 +1068,15 @@
   }
 }
 
+bool isModuleEntryFunctionCC(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::AMDGPU_Gfx:
+    return true;
+  default:
+    return isEntryFunctionCC(CC);
+  }
+}
+
 bool hasXNACK(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureXNACK];
 }
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -79,7 +79,7 @@
   void setScratchSize(unsigned CC, unsigned Val);
 
   // Set the stack frame size of a function in the metadata.
-  void setStackFrameSize(const MachineFunction &MF, unsigned Val);
+  void setFunctionScratchSize(const MachineFunction &MF, unsigned Val);
 
   // Set the hardware register bit in PAL metadata to enable wave32 on the
   // shader of the given calling convention.
@@ -130,6 +130,9 @@
   // Get (create if necessary) the shader functions map.
   msgpack::MapDocNode getShaderFunctions();
 
+  // Get (create if necessary) a function in the shader functions map.
+  msgpack::MapDocNode getShaderFunction(StringRef Name);
+
   // Get (create if necessary) the .hardware_stages entry for the given calling
   // convention.
   msgpack::MapDocNode getHwStage(unsigned CC);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -238,12 +238,11 @@
   getHwStage(CC)[".scratch_memory_size"] = MsgPackDoc.getNode(Val);
 }
 
-// Set the scratch size in the metadata.
-void AMDGPUPALMetadata::setStackFrameSize(const MachineFunction &MF,
-                                          unsigned Val) {
-  auto Node = MsgPackDoc.getMapNode();
+// Set the stack frame size of a function in the metadata.
+void AMDGPUPALMetadata::setFunctionScratchSize(const MachineFunction &MF,
+                                               unsigned Val) {
+  auto Node = getShaderFunction(MF.getFunction().getName());
   Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
-  getShaderFunctions()[MF.getFunction().getName()] = Node;
 }
 
 // Set the hardware register bit in PAL metadata to enable wave32 on the
@@ -747,6 +746,12 @@
   return ShaderFunctions.getMap();
 }
 
+// Get (create if necessary) a function in the shader functions map.
+msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunction(StringRef Name) {
+  auto Functions = getShaderFunctions();
+  return Functions[Name].getMap(/*Convert=*/true);
+}
+
 // Return the PAL metadata hardware shader stage name.
 static const char *getStageName(CallingConv::ID CC) {
   switch (CC) {
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
--- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll
@@ -1,5 +1,4 @@
 ; RUN: llc -mtriple=amdgcn--amdpal -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s
 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SDAG -enable-var-scope %s
 ; RUN: llc -global-isel -mtriple=amdgcn--amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GISEL -enable-var-scope %s
 
@@ -126,10 +125,29 @@
   ret float %add
 }
 
+@lds = internal addrspace(3) global [64 x float] undef
+
+define amdgpu_gfx float @simple_lds(float %arg0) #0 {
+  %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
+  %val = load float, float addrspace(3)* %lds_ptr
+  ret float %val
+}
+
+define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
+  %lds_ptr = getelementptr [64 x float], [64 x float] addrspace(3)* @lds, i32 0, i32 0
+  %val = load float, float addrspace(3)* %lds_ptr
+  %res = call amdgpu_gfx float @simple_lds_recurse(float %val)
+  ret float %res
+}
+
 attributes #0 = { nounwind }
 
 ; GCN: amdpal.pipelines:
-; GCN-NEXT:   - .registers:      {}
+; GCN-NEXT:  - .registers:
+; SDAG-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01ca{{$}}
+; SDAG-NEXT:      0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
+; GISEL-NEXT:      0x2e12 (COMPUTE_PGM_RSRC1): 0xaf01cf{{$}}
+; GISEL-NEXT:      0x2e13 (COMPUTE_PGM_RSRC2): 0x8001{{$}}
 ; GCN-NEXT:    .shader_functions:
 ; GCN-NEXT:      dynamic_stack:
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
@@ -148,6 +166,10 @@
 ; GISEL-NEXT:        .stack_frame_size_in_bytes: 0xd0{{$}}
 ; GCN-NEXT:      no_stack_indirect_call:
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
+; GCN-NEXT:      simple_lds:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0{{$}}
+; GCN-NEXT:      simple_lds_recurse:
+; GCN-NEXT:        .stack_frame_size_in_bytes: 0x10{{$}}
 ; GCN-NEXT:      simple_stack:
 ; GCN-NEXT:        .stack_frame_size_in_bytes: 0x14{{$}}
 ; GCN-NEXT:      simple_stack_call: