diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -93,6 +93,9 @@
       const MachineFunction &MF,
       const SIProgramInfo &PI) const;
 
+  // Determine whether we want to pad the end of .text with s_code_end.
+  bool wantFinalGuard(Module &M);
+
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -317,12 +317,8 @@
   CallGraphResourceInfo.clear();
 
   // Pad with s_code_end to help tools and guard against instruction prefetch
-  // causing stale data in caches. Arguably this should be done by the linker,
-  // which is why this isn't done for Mesa.
-  const MCSubtargetInfo &STI = *getGlobalSTI();
-  if (AMDGPU::isGFX10(STI) &&
-      (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
-       STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
+  // causing stale data in caches.
+  if (wantFinalGuard(M)) {
     OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
     getTargetStreamer()->EmitCodeEnd();
   }
@@ -330,6 +326,43 @@
   return AsmPrinter::doFinalization(M);
 }
 
+// Determine whether we want to pad the end of .text with s_code_end. We do
+// that on GFX10 to help tools and guard against instruction prefetch causing
+// stale data in caches.  Arguably this should be done by the linker, so:
+// - It is not done on Mesa.
+// - On PAL, it is only done if doing a full pipeline compile, as then there is
+//   no later link step.
+bool AMDGPUAsmPrinter::wantFinalGuard(Module &M) {
+  const MCSubtargetInfo &STI = *getGlobalSTI();
+  if (!AMDGPU::isGFX10(STI))
+    return false;
+  if (STI.getTargetTriple().getOS() == Triple::AMDHSA)
+    return true;
+  if (STI.getTargetTriple().getOS() != Triple::AMDPAL)
+    return false;
+  // On PAL, we add the guard only if doing a full pipeline compile.  We can
+  // spot a full pipeline compile by there being a PS and at least one of GS or
+  // VS, or instead a CS.
+  bool GotPs = false, GotGsOrVs = false;
+  for (const Function &F : M) {
+    if (!F.isDeclaration()) {
+      switch (F.getCallingConv()) {
+      case CallingConv::AMDGPU_CS:
+        // A defined CS alone identifies a full pipeline, so stop scanning.
+        return true;
+      case CallingConv::AMDGPU_PS:
+        GotPs = true;
+        break;
+      case CallingConv::AMDGPU_GS:
+      case CallingConv::AMDGPU_VS:
+        GotGsOrVs = true;
+        break;
+      }
+    }
+  }
+  return GotPs && GotGsOrVs;
+}
+
 // Print comments that apply to both callable functions and entry points.
 void AMDGPUAsmPrinter::emitCommonFunctionComments(
   uint32_t NumVGPR,
diff --git a/llvm/test/CodeGen/AMDGPU/s_code_end.ll b/llvm/test/CodeGen/AMDGPU/s_code_end.ll
--- a/llvm/test/CodeGen/AMDGPU/s_code_end.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_code_end.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10END,GFX10END-OBJ %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND,GFX10NOEND-ASM %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND,GFX10NOEND-ASM %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10NOEND,GFX10NOEND-OBJ %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/s_code_end.ll b/llvm/test/CodeGen/AMDGPU/s_code_end_pal_pipeline.ll
copy from llvm/test/CodeGen/AMDGPU/s_code_end.ll
copy to llvm/test/CodeGen/AMDGPU/s_code_end_pal_pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/s_code_end.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_code_end_pal_pipeline.ll
@@ -1,8 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10END,GFX10END-OBJ %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND,GFX10NOEND-ASM %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10NOEND,GFX10NOEND-OBJ %s
 
 ; GCN:            a_kernel1{{>?}}:
 ; GCN:                    s_endpgm
@@ -11,7 +7,7 @@
 
 ; GCN-OBJ-NEXT:           s_nop 0
 
-define amdgpu_kernel void @a_kernel1() {
+define amdgpu_vs void @a_kernel1() {
   ret void
 }
 
@@ -22,7 +18,7 @@
 
 ; GCN-OBJ:   {{^$}}
 
-define amdgpu_kernel void @a_kernel2() {
+define amdgpu_ps void @a_kernel2() {
   ret void
 }