diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -320,11 +320,35 @@ // causing stale data in caches. Arguably this should be done by the linker, // which is why this isn't done for Mesa. const MCSubtargetInfo &STI = *getGlobalSTI(); - if (AMDGPU::isGFX10(STI) && - (STI.getTargetTriple().getOS() == Triple::AMDHSA || - STI.getTargetTriple().getOS() == Triple::AMDPAL)) { - OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); - getTargetStreamer()->EmitCodeEnd(); + if (AMDGPU::isGFX10(STI)) { + bool AddGuard = STI.getTargetTriple().getOS() == Triple::AMDHSA; + if (!AddGuard && STI.getTargetTriple().getOS() == Triple::AMDPAL) { + // On PAL, add the guard only for a full pipeline compile, because only + // then is there no later link step. A full pipeline compile is one that + // defines a PS and at least one of GS or VS, or else defines a CS. 
+ bool GotPs = false, GotGsOrVs = false, GotCs = false; + for (Function &F : M) { + if (!F.isDeclaration()) { + switch (F.getCallingConv()) { + case CallingConv::AMDGPU_CS: + GotCs = true; + break; + case CallingConv::AMDGPU_PS: + GotPs = true; + break; + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_VS: + GotGsOrVs = true; + break; + } + } + } + AddGuard = (GotPs && GotGsOrVs) || GotCs; + } + if (AddGuard) { + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + getTargetStreamer()->EmitCodeEnd(); + } } return AsmPrinter::doFinalization(M); diff --git a/llvm/test/CodeGen/AMDGPU/s_code_end.ll b/llvm/test/CodeGen/AMDGPU/s_code_end.ll --- a/llvm/test/CodeGen/AMDGPU/s_code_end.ll +++ b/llvm/test/CodeGen/AMDGPU/s_code_end.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10END,GFX10END-OBJ %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND,GFX10NOEND-ASM %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND,GFX10NOEND-ASM %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10NOEND,GFX10NOEND-OBJ %s diff --git a/llvm/test/CodeGen/AMDGPU/s_code_end.ll b/llvm/test/CodeGen/AMDGPU/s_code_end_pal_pipeline.ll copy from llvm/test/CodeGen/AMDGPU/s_code_end.ll copy to llvm/test/CodeGen/AMDGPU/s_code_end_pal_pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/s_code_end.ll +++ 
b/llvm/test/CodeGen/AMDGPU/s_code_end_pal_pipeline.ll @@ -1,8 +1,4 @@ -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10END,GFX10END-OBJ %s ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND,GFX10NOEND-ASM %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10NOEND,GFX10NOEND-OBJ %s ; GCN: a_kernel1{{>?}}: ; GCN: s_endpgm @@ -11,7 +7,7 @@ ; GCN-OBJ-NEXT: s_nop 0 -define amdgpu_kernel void @a_kernel1() { +define amdgpu_vs void @a_kernel1() { ret void } @@ -22,7 +18,7 @@ ; GCN-OBJ: {{^$}} -define amdgpu_kernel void @a_kernel2() { +define amdgpu_ps void @a_kernel2() { ret void }