diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -320,11 +320,35 @@
   // causing stale data in caches. Arguably this should be done by the linker,
   // which is why this isn't done for Mesa.
   const MCSubtargetInfo &STI = *getGlobalSTI();
-  if (AMDGPU::isGFX10(STI) &&
-      (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
-       STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
-    OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
-    getTargetStreamer()->EmitCodeEnd();
+  if (AMDGPU::isGFX10(STI)) {
+    bool AddGuard = STI.getTargetTriple().getOS() == Triple::AMDHSA;
+    if (!AddGuard && STI.getTargetTriple().getOS() == Triple::AMDPAL) {
+      // On PAL, we add the guard only if doing a full pipeline compile, as
+      // there is no later link step. A module is treated as a full pipeline
+      // if it defines a PS plus at least one of GS or VS, or defines a CS.
+      bool GotPs = false, GotGsOrVs = false, GotCs = false;
+      for (Function &F : M) {
+        if (!F.isDeclaration()) {
+          switch (F.getCallingConv()) {
+          case CallingConv::AMDGPU_CS:
+            GotCs = true;
+            break;
+          case CallingConv::AMDGPU_PS:
+            GotPs = true;
+            break;
+          case CallingConv::AMDGPU_GS:
+          case CallingConv::AMDGPU_VS:
+            GotGsOrVs = true;
+            break;
+          }
+        }
+      }
+      AddGuard = (GotPs && GotGsOrVs) || GotCs;
+    }
+    if (AddGuard) {
+      OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+      getTargetStreamer()->EmitCodeEnd();
+    }
   }
 
   return AsmPrinter::doFinalization(M);
diff --git a/llvm/test/CodeGen/AMDGPU/s_code_end.ll b/llvm/test/CodeGen/AMDGPU/s_code_end.ll
--- a/llvm/test/CodeGen/AMDGPU/s_code_end.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_code_end.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10END,GFX10END-OBJ %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND,GFX10NOEND-ASM %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND,GFX10NOEND-ASM %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10NOEND,GFX10NOEND-OBJ %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/s_code_end.ll b/llvm/test/CodeGen/AMDGPU/s_code_end_pal_pipeline.ll
copy from llvm/test/CodeGen/AMDGPU/s_code_end.ll
copy to llvm/test/CodeGen/AMDGPU/s_code_end_pal_pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/s_code_end.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_code_end_pal_pipeline.ll
@@ -1,8 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10END,GFX10END-OBJ %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND,GFX10NOEND-ASM %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10NOEND,GFX10NOEND-OBJ %s
 
 ; GCN:            a_kernel1{{>?}}:
 ; GCN:                    s_endpgm
@@ -11,7 +7,7 @@
 
 ; GCN-OBJ-NEXT:           s_nop 0
 
-define amdgpu_kernel void @a_kernel1() {
+define amdgpu_vs void @a_kernel1() {
   ret void
 }
 
@@ -22,7 +18,7 @@
 
 ; GCN-OBJ:   {{^$}}
 
-define amdgpu_kernel void @a_kernel2() {
+define amdgpu_ps void @a_kernel2() {
   ret void
 }