Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPU.td +++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.td @@ -329,6 +329,13 @@ "Reserve registers for debugger usage" >; +def FeatureDebuggerEmitPrologue : SubtargetFeature< + "amdgpu-debugger-emit-prologue", + "DebuggerEmitPrologue", + "true", + "Emit debugger prologue" +>; + //===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -42,6 +42,8 @@ FlatUsed(false), ReservedVGPRFirst(0), ReservedVGPRCount(0), + DebuggerWavefrontPrivateSegmentOffsetSGPR((uint16_t)-1), + DebuggerPrivateSegmentBufferSGPR((uint16_t)-1), VCCUsed(false), CodeLen(0) {} @@ -75,6 +77,14 @@ // The number of consecutive VGPRs reserved. uint16_t ReservedVGPRCount; + // Fixed SGPR number used to hold wave scratch offset for entire kernel + // execution, or uint16_t(-1) if the register is not used or not known. + uint16_t DebuggerWavefrontPrivateSegmentOffsetSGPR; + // Fixed SGPR number of the first 4 SGPRs used to hold scratch V# for entire + // kernel execution, or uint16_t(-1) if the register is not used or not + // known. + uint16_t DebuggerPrivateSegmentBufferSGPR; + // Bonus information for debugging. 
bool VCCUsed; uint64_t CodeLen; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -200,6 +200,13 @@ OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount), false); + if (MF.getSubtarget().debuggerEmitPrologue()) { + OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + + Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); + OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" + + Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false); + } + OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)), false); @@ -444,6 +451,16 @@ MaxVGPR += MFI->getDebuggerReservedVGPRCount(); } + // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and + // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue" + // attribute was specified. + if (STM.debuggerEmitPrologue()) { + ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR = + RI->getHWRegIndex(MFI->getScratchWaveOffsetReg()); + ProgInfo.DebuggerPrivateSegmentBufferSGPR = + RI->getHWRegIndex(MFI->getScratchRSrcReg()); + } + // We found the maximum register index. They start at 0, so add one to get the // number of registers. 
ProgInfo.NumVGPR = MaxVGPR + 1; @@ -670,6 +687,9 @@ if (MFI->hasDispatchPtr()) header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR; + if (STM.debuggerSupported()) + header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED; + if (STM.isXNACKEnabled()) header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED; @@ -681,6 +701,13 @@ header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst; header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; + if (STM.debuggerEmitPrologue()) { + header.debug_wavefront_private_segment_offset_sgpr = + KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; + header.debug_private_segment_buffer_sgpr = + KernelInfo.DebuggerPrivateSegmentBufferSGPR; + } + AMDGPUTargetStreamer *TS = static_cast(OutStreamer->getTargetStreamer()); Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -77,6 +77,7 @@ bool EnableXNACK; bool DebuggerInsertNops; bool DebuggerReserveRegs; + bool DebuggerEmitPrologue; // Used as options. 
bool EnableVGPRSpilling; @@ -402,6 +403,11 @@ return EnableSIScheduler; } + bool debuggerSupported() const { + return debuggerInsertNops() && debuggerReserveRegs() && + debuggerEmitPrologue(); + } + bool debuggerInsertNops() const { return DebuggerInsertNops; } @@ -410,6 +416,10 @@ return DebuggerReserveRegs; } + bool debuggerEmitPrologue() const { + return DebuggerEmitPrologue; + } + bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; } Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -101,6 +101,7 @@ EnableXNACK(false), DebuggerInsertNops(false), DebuggerReserveRegs(false), + DebuggerEmitPrologue(false), EnableVGPRSpilling(false), EnablePromoteAlloca(false), Index: llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.h +++ llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.h @@ -29,6 +29,10 @@ void processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS = nullptr) const override; + +private: + /// \brief Emits debugger prologue. + void emitDebuggerPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const; }; } Index: llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -39,6 +39,12 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { + // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was + // specified. 
+ const SISubtarget &ST = MF.getSubtarget(); + if (ST.debuggerEmitPrologue()) + emitDebuggerPrologue(MF, MBB); + if (!MF.getFrameInfo()->hasStackObjects()) return; @@ -54,7 +60,6 @@ if (hasOnlySGPRSpills(MFI, MF.getFrameInfo())) return; - const SISubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = &TII->getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -87,6 +92,8 @@ // pointer. Because we only detect if flat instructions are used at all, // this will be used more often than necessary on VI. + // Debug location must be unknown since the first debug location is used to + // determine the end of the prologue. DebugLoc DL; unsigned FlatScratchInitReg @@ -289,3 +296,44 @@ RS->addScavengingFrameIndex(ScavengeFI); } } + +void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF, + MachineBasicBlock &MBB) const { + const SISubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo *TRI = &TII->getRegisterInfo(); + const SIMachineFunctionInfo *MFI = MF.getInfo(); + + MachineBasicBlock::iterator I = MBB.begin(); + DebugLoc DL; + + // For each dimension: + for (unsigned i = 0; i < 3; ++i) { + // Get work group ID SGPR, and make it live-in again. + unsigned WorkGroupIDSGPR = MFI->getWorkGroupIDSGPR(i); + MF.getRegInfo().addLiveIn(WorkGroupIDSGPR); + MBB.addLiveIn(WorkGroupIDSGPR); + + // Since SGPRs are spilled into VGPRs, copy work group ID SGPR to VGPR in + // order to spill it to scratch. + unsigned WorkGroupIDVGPR = + MF.getRegInfo().createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_e32), WorkGroupIDVGPR) + .addReg(WorkGroupIDSGPR); + + // Spill work group ID. + int WorkGroupIDObjectIdx = MFI->getDebuggerWorkGroupIDStackObjectIndex(i); + TII->storeRegToStackSlot(MBB, I, WorkGroupIDVGPR, false, + WorkGroupIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI); + + // Get work item ID VGPR, and make it live-in again. 
+ unsigned WorkItemIDVGPR = MFI->getWorkItemIDVGPR(i); + MF.getRegInfo().addLiveIn(WorkItemIDVGPR); + MBB.addLiveIn(WorkItemIDVGPR); + + // Spill work item ID. + int WorkItemIDObjectIdx = MFI->getDebuggerWorkItemIDStackObjectIndex(i); + TII->storeRegToStackSlot(MBB, I, WorkItemIDVGPR, false, + WorkItemIDObjectIdx, &AMDGPU::VGPR_32RegClass, TRI); + } +} Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.h @@ -70,6 +70,8 @@ bool isLegalMUBUFAddressingMode(const AddrMode &AM) const; bool isCFIntrinsic(const SDNode *Intr) const; + + void createDebuggerPrologueStackObjects(MachineFunction &MF) const; public: SITargetLowering(const TargetMachine &tm, const SISubtarget &STI); Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp @@ -596,6 +596,11 @@ return DAG.getEntryNode(); } + // Create stack objects that are used for emitting debugger prologue if + // "amdgpu-debugger-emit-prologue" attribute was specified. + if (ST.debuggerEmitPrologue()) + createDebuggerPrologueStackObjects(MF); + SmallVector Splits; BitVector Skipped(Ins.size()); @@ -1258,6 +1263,32 @@ } } +void SITargetLowering::createDebuggerPrologueStackObjects( + MachineFunction &MF) const { + // Create stack objects that are used for emitting debugger prologue. 
+ // + // Debugger prologue writes work group IDs and work item IDs to scratch memory + // at fixed location in the following format: + // offset 0: work group ID x + // offset 4: work group ID y + // offset 8: work group ID z + // offset 16: work item ID x + // offset 20: work item ID y + // offset 24: work item ID z + SIMachineFunctionInfo *Info = MF.getInfo(); + int ObjectIdx = 0; + + // For each dimension: + for (unsigned i = 0; i < 3; ++i) { + // Create fixed stack object for work group ID. + ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4, true); + Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx); + // Create fixed stack object for work item ID. + ObjectIdx = MF.getFrameInfo()->CreateFixedObject(4, i * 4 + 16, true); + Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx); + } +} + /// This transforms the control flow intrinsics to get the branch destination as /// last parameter, also switches branch target with BR if the need arise SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, Index: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -64,6 +64,10 @@ // Number of reserved VGPRs for debugger usage. unsigned DebuggerReservedVGPRCount; + // Stack object indices for work group IDs. + int DebuggerWorkGroupIDStackObjectIndices[3]; + // Stack object indices for work item IDs. + int DebuggerWorkItemIDStackObjectIndices[3]; public: // FIXME: Make private @@ -334,6 +338,62 @@ return DebuggerReservedVGPRCount; } + /// \returns Stack object index for \p Dim's work group ID. + int getDebuggerWorkGroupIDStackObjectIndex(unsigned Dim) const { + assert(Dim < 3); + return DebuggerWorkGroupIDStackObjectIndices[Dim]; + } + + /// \brief Sets stack object index for \p Dim's work group ID to \p ObjectIdx. 
+ void setDebuggerWorkGroupIDStackObjectIndex(unsigned Dim, int ObjectIdx) { + assert(Dim < 3); + DebuggerWorkGroupIDStackObjectIndices[Dim] = ObjectIdx; + } + + /// \returns Stack object index for \p Dim's work item ID. + int getDebuggerWorkItemIDStackObjectIndex(unsigned Dim) const { + assert(Dim < 3); + return DebuggerWorkItemIDStackObjectIndices[Dim]; + } + + /// \brief Sets stack object index for \p Dim's work item ID to \p ObjectIdx. + void setDebuggerWorkItemIDStackObjectIndex(unsigned Dim, int ObjectIdx) { + assert(Dim < 3); + DebuggerWorkItemIDStackObjectIndices[Dim] = ObjectIdx; + } + + /// \returns SGPR used for \p Dim's work group ID. + unsigned getWorkGroupIDSGPR(unsigned Dim) const { + switch (Dim) { + case 0: + assert(hasWorkGroupIDX()); + return WorkGroupIDXSystemSGPR; + case 1: + assert(hasWorkGroupIDY()); + return WorkGroupIDYSystemSGPR; + case 2: + assert(hasWorkGroupIDZ()); + return WorkGroupIDZSystemSGPR; + } + llvm_unreachable("unexpected dimension"); + } + + /// \returns VGPR used for \p Dim's work item ID. 
+ unsigned getWorkItemIDVGPR(unsigned Dim) const { + switch (Dim) { + case 0: + assert(hasWorkItemIDX()); + return AMDGPU::VGPR0; + case 1: + assert(hasWorkItemIDY()); + return AMDGPU::VGPR1; + case 2: + assert(hasWorkItemIDZ()); + return AMDGPU::VGPR2; + } + llvm_unreachable("unexpected dimension"); + } + unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; }; Index: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -54,6 +54,8 @@ ReturnsVoid(true), MaximumWorkGroupSize(0), DebuggerReservedVGPRCount(0), + DebuggerWorkGroupIDStackObjectIndices{0, 0, 0}, + DebuggerWorkItemIDStackObjectIndices{0, 0, 0}, LDSWaveSpillSize(0), PSInputEna(0), NumUserSGPRs(0), @@ -92,16 +94,16 @@ WorkItemIDX = true; } - if (F->hasFnAttribute("amdgpu-work-group-id-y")) + if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue()) WorkGroupIDY = true; - if (F->hasFnAttribute("amdgpu-work-group-id-z")) + if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue()) WorkGroupIDZ = true; - if (F->hasFnAttribute("amdgpu-work-item-id-y")) + if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue()) WorkItemIDY = true; - if (F->hasFnAttribute("amdgpu-work-item-id-z")) + if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue()) WorkItemIDZ = true; // X, XY, and XYZ are the only supported combinations, so make sure Y is Index: llvm/trunk/test/CodeGen/AMDGPU/debugger-emit-prologue.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/debugger-emit-prologue.ll +++ llvm/trunk/test/CodeGen/AMDGPU/debugger-emit-prologue.ll @@ -0,0 +1,80 @@ +; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-emit-prologue -verify-machineinstrs < %s | FileCheck %s +; RUN: 
llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s --check-prefix=NOATTR + +; CHECK: debug_wavefront_private_segment_offset_sgpr = [[SOFF:[0-9]+]] +; CHECK: debug_private_segment_buffer_sgpr = [[SREG:[0-9]+]] + +; CHECK: v_mov_b32_e32 [[WGIDX:v[0-9]+]], s{{[0-9]+}} +; CHECK: buffer_store_dword [[WGIDX]], off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] +; CHECK: buffer_store_dword v0, off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:16 + +; CHECK: v_mov_b32_e32 [[WGIDY:v[0-9]+]], s{{[0-9]+}} +; CHECK: buffer_store_dword [[WGIDY]], off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:4 +; CHECK: buffer_store_dword v1, off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:20 + +; CHECK: v_mov_b32_e32 [[WGIDZ:v[0-9]+]], s{{[0-9]+}} +; CHECK: buffer_store_dword [[WGIDZ]], off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:8 +; CHECK: buffer_store_dword v2, off, s[{{[0-9]+:[0-9]+}}], s[[SOFF]] offset:24 + +; CHECK: DebuggerWavefrontPrivateSegmentOffsetSGPR: s[[SOFF]] +; CHECK: DebuggerPrivateSegmentBufferSGPR: s[[SREG]] + +; NOATTR-NOT: DebuggerWavefrontPrivateSegmentOffsetSGPR +; NOATTR-NOT: DebuggerPrivateSegmentBufferSGPR + +; Function Attrs: nounwind +define void @test(i32 addrspace(1)* %A) #0 !dbg !12 { +entry: + %A.addr = alloca i32 addrspace(1)*, align 4 + store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19 + %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20 + store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21 + %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22 + store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23 + %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !24 + %arrayidx2 = getelementptr inbounds i32, 
i32 addrspace(1)* %2, i32 2, !dbg !24 + store i32 3, i32 addrspace(1)* %arrayidx2, align 4, !dbg !25 + ret void, !dbg !26 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="fiji" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!3} +!llvm.module.flags = !{!9, !10} +!llvm.ident = !{!11} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 269772)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "test01.cl", directory: "/home/kzhuravl/Lightning/testing") +!2 = !{} +!3 = !{void (i32 addrspace(1)*)* @test, !4, !5, !6, !7, !8} +!4 = !{!"kernel_arg_addr_space", i32 1} +!5 = !{!"kernel_arg_access_qual", !"none"} +!6 = !{!"kernel_arg_type", !"int*"} +!7 = !{!"kernel_arg_base_type", !"int*"} +!8 = !{!"kernel_arg_type_qual", !""} +!9 = !{i32 2, !"Dwarf Version", i32 2} +!10 = !{i32 2, !"Debug Info Version", i32 3} +!11 = !{!"clang version 3.9.0 (trunk 269772)"} +!12 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !13, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!13 = !DISubroutineType(types: !14) +!14 = !{null, !15} +!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 32) +!16 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!17 = !DILocalVariable(name: "A", arg: 1, scope: !12, file: !1, line: 1, type: !15) +!18 = !DIExpression() +!19 = !DILocation(line: 1, column: 30, scope: !12) +!20 = !DILocation(line: 2, 
column: 3, scope: !12) +!21 = !DILocation(line: 2, column: 8, scope: !12) +!22 = !DILocation(line: 3, column: 3, scope: !12) +!23 = !DILocation(line: 3, column: 8, scope: !12) +!24 = !DILocation(line: 4, column: 3, scope: !12) +!25 = !DILocation(line: 4, column: 8, scope: !12) +!26 = !DILocation(line: 5, column: 1, scope: !12)