Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -295,6 +295,12 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) { CallGraphResourceInfo.clear(); + + if (AMDGPU::isGFX10(*getGlobalSTI())) { + OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); + getTargetStreamer()->EmitCodeEnd(); + } + return AsmPrinter::doFinalization(M); } @@ -928,6 +934,11 @@ 1ULL << ScratchAlignShift) >> ScratchAlignShift; + if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) { + ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1; + ProgInfo.MemOrdered = 1; + } + ProgInfo.ComputePGMRSrc1 = S_00B848_VGPRS(ProgInfo.VGPRBlocks) | S_00B848_SGPRS(ProgInfo.SGPRBlocks) | @@ -936,7 +947,9 @@ S_00B848_PRIV(ProgInfo.Priv) | S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp) | S_00B848_DEBUG_MODE(ProgInfo.DebugMode) | - S_00B848_IEEE_MODE(ProgInfo.IEEEMode); + S_00B848_IEEE_MODE(ProgInfo.IEEEMode) | + S_00B848_WGP_MODE(ProgInfo.WgpMode) | + S_00B848_MEM_ORDERED(ProgInfo.MemOrdered); // 0 = X, 1 = XY, 2 = XYZ unsigned TIDIGCompCnt = 0; @@ -1077,7 +1090,7 @@ Out.compute_pgm_resource_registers = CurrentProgramInfo.ComputePGMRSrc1 | (CurrentProgramInfo.ComputePGMRSrc2 << 32); - Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64; if (CurrentProgramInfo.DynamicCallStack) Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK; Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -74,6 +74,9 @@ /// \returns True on success, false on failure. virtual bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) = 0; + /// \returns True on success, false on failure. + virtual bool EmitCodeEnd() = 0; + virtual void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, @@ -113,6 +116,9 @@ /// \returns True on success, false on failure. bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; + /// \returns True on success, false on failure. + bool EmitCodeEnd() override; + void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, @@ -155,6 +161,9 @@ /// \returns True on success, false on failure. bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; + /// \returns True on success, false on failure. + bool EmitCodeEnd() override; + void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, Index: lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp =================================================================== --- lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -235,6 +235,13 @@ return true; } +bool AMDGPUTargetAsmStreamer::EmitCodeEnd() { + const uint32_t Encoded_s_code_end = 0xbf9f0000; + OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n'; + OS << "\t.fill 32, 4, " << Encoded_s_code_end << '\n'; + return true; +} + void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR, @@ -552,6 +559,18 @@ return true; } +bool AMDGPUTargetELFStreamer::EmitCodeEnd() { + const uint32_t Encoded_s_code_end = 0xbf9f0000; + + MCStreamer &OS = getStreamer(); + OS.PushSection(); + OS.EmitValueToAlignment(64, Encoded_s_code_end, 4); + for (unsigned I = 0; I < 32; ++I) + OS.EmitIntValue(Encoded_s_code_end, 4); + OS.PopSection(); + return true; +} + void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, Index: lib/Target/AMDGPU/SIProgramInfo.h =================================================================== --- lib/Target/AMDGPU/SIProgramInfo.h +++ lib/Target/AMDGPU/SIProgramInfo.h @@ -28,6 +28,8 @@ uint32_t DX10Clamp = 0; uint32_t DebugMode = 0; uint32_t IEEEMode = 0; + uint32_t WgpMode = 0; // GFX10+ + uint32_t MemOrdered = 0; // GFX10+ uint64_t ScratchSize = 0; uint64_t ComputePGMRSrc1 = 0; Index: test/CodeGen/AMDGPU/s_code_end.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/s_code_end.ll @@ -0,0 +1,80 @@ +; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,GCN-ASM,GFX10,GFX10-ASM %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -filetype=obj < %s | llvm-objdump -arch=amdgcn -mcpu=gfx1010 -disassemble - | FileCheck -check-prefixes=GCN,GCN-OBJ,GFX10,GFX10-OBJ %s + +; GCN: a_kernel1: +; GCN-NEXT: s_endpgm +; GCN-ASM-NEXT: [[END_LABEL1:\.Lfunc_end.*]]: +; GCN-ASM-NEXT: .size a_kernel1, [[END_LABEL1]]-a_kernel1 +; GCN-ASM: .section .AMDGPU.config + +; GCN-OBJ-NEXT: s_nop 0 + +define amdgpu_kernel void @a_kernel1() { + ret void +} + +; GCN: a_kernel2: +; GCN-NEXT: s_endpgm +; GCN-ASM-NEXT: [[END_LABEL2:\.Lfunc_end.*]]: +; GCN-ASM-NEXT: .size a_kernel2, [[END_LABEL2]]-a_kernel2 +; GCN-ASM: .section .AMDGPU.config + +; GCN-OBJ-NEXT: {{^$}} + +define amdgpu_kernel void @a_kernel2() { + ret void +} + +; GCN-ASM: .text +; GCN-ASM-NEXT: .globl a_function +; GCN-ASM-NEXT: .p2align 2 +; GCN-ASM-NEXT: .type a_function,@function + +; GCN-NEXT: a_function: +; GCN: s_setpc_b64 +; GCN-ASM-NEXT: [[END_LABEL3:\.Lfunc_end.*]]: +; GCN-ASM-NEXT: .size a_function, [[END_LABEL3]]-a_function +; GFX10-ASM: .p2alignl 6, 3214868480 +; GFX10-ASM-NEXT: .fill 32, 4, 3214868480 + +; GFX10-OBJ-NEXT: s_code_end + +; GFX10-OBJ: s_code_end // 000000000140: +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end + +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end + +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end + +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end +; GFX10-OBJ-NEXT: s_code_end + +define void @a_function() { + ret void +}