diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -93,6 +93,9 @@
       const MachineFunction &MF,
       const SIProgramInfo &PI) const;
 
+  // Determine whether we want to pad the end of .text with s_code_end.
+  bool wantFinalGuard(Module &M);
+
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -317,12 +317,8 @@
   CallGraphResourceInfo.clear();
 
   // Pad with s_code_end to help tools and guard against instruction prefetch
-  // causing stale data in caches. Arguably this should be done by the linker,
-  // which is why this isn't done for Mesa.
-  const MCSubtargetInfo &STI = *getGlobalSTI();
-  if (AMDGPU::isGFX10(STI) &&
-      (STI.getTargetTriple().getOS() == Triple::AMDHSA ||
-       STI.getTargetTriple().getOS() == Triple::AMDPAL)) {
+  // causing stale data in caches.
+  if (wantFinalGuard(M)) {
     OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
     getTargetStreamer()->EmitCodeEnd();
   }
@@ -330,6 +326,47 @@
   return AsmPrinter::doFinalization(M);
 }
 
+// Determine whether we want to pad the end of .text with s_code_end. We do
+// that on GFX10 to help tools and guard against instruction prefetch causing
+// stale data in caches. Arguably this should be done by the linker, so:
+// - It is not done on Mesa.
+// - On PAL, it is only done if doing a full pipeline compile, as then there is
+//   no later link step.
+bool AMDGPUAsmPrinter::wantFinalGuard(Module &M) {
+  const MCSubtargetInfo &STI = *getGlobalSTI();
+  if (!AMDGPU::isGFX10(STI))
+    return false;
+  const Triple::OSType OS = STI.getTargetTriple().getOS();
+  if (OS == Triple::AMDHSA)
+    return true;
+  if (OS != Triple::AMDPAL)
+    return false;
+  // On PAL, we add the guard only if doing a full pipeline compile. We can
+  // spot a full pipeline compile by there being a PS and at least one of GS or
+  // VS, or instead a CS.
+  bool GotPs = false, GotGsOrVs = false;
+  for (const Function &F : M) {
+    // Only functions defined in this module are considered.
+    if (F.isDeclaration())
+      continue;
+    switch (F.getCallingConv()) {
+    case CallingConv::AMDGPU_CS:
+      // A compute shader alone counts as a full pipeline.
+      return true;
+    case CallingConv::AMDGPU_PS:
+      GotPs = true;
+      break;
+    case CallingConv::AMDGPU_GS:
+    case CallingConv::AMDGPU_VS:
+      GotGsOrVs = true;
+      break;
+    default:
+      break;
+    }
+  }
+  return GotPs && GotGsOrVs;
+}
+
 // Print comments that apply to both callable functions and entry points.
 void AMDGPUAsmPrinter::emitCommonFunctionComments(
   uint32_t NumVGPR,
diff --git a/llvm/test/CodeGen/AMDGPU/s_code_end.ll b/llvm/test/CodeGen/AMDGPU/s_code_end.ll
--- a/llvm/test/CodeGen/AMDGPU/s_code_end.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_code_end.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10END,GFX10END-OBJ %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND,GFX10NOEND-ASM %s
 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND,GFX10NOEND-ASM %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10NOEND,GFX10NOEND-OBJ %s
 
diff --git a/llvm/test/CodeGen/AMDGPU/s_code_end.ll b/llvm/test/CodeGen/AMDGPU/s_code_end_pal_pipeline.ll
copy from llvm/test/CodeGen/AMDGPU/s_code_end.ll
copy to llvm/test/CodeGen/AMDGPU/s_code_end_pal_pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/s_code_end.ll
+++ b/llvm/test/CodeGen/AMDGPU/s_code_end_pal_pipeline.ll
@@ -1,8 +1,4 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10END,GFX10END-OBJ %s
 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10END,GFX10END-ASM %s
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10NOEND,GFX10NOEND-ASM %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx1010 -d - | FileCheck --check-prefixes=GCN,GCN-OBJ,GFX10NOEND,GFX10NOEND-OBJ %s
 
 ; GCN: a_kernel1{{>?}}:
 ; GCN: s_endpgm
@@ -11,7 +7,7 @@
 
 ; GCN-OBJ-NEXT: s_nop 0
 
-define amdgpu_kernel void @a_kernel1() {
+define amdgpu_vs void @a_kernel1() {
   ret void
 }
 
@@ -22,7 +18,7 @@
 
 ; GCN-OBJ: {{^$}}
 
-define amdgpu_kernel void @a_kernel2() {
+define amdgpu_ps void @a_kernel2() {
   ret void
 }
 