Index: llvm/trunk/lib/Target/AMDGPU/AMDGPU.td =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPU.td +++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.td @@ -318,6 +318,17 @@ >; //===----------------------------------------------------------------------===// +// Debugger related subtarget features. +//===----------------------------------------------------------------------===// + +def FeatureDebuggerInsertNops : SubtargetFeature< + "amdgpu-debugger-insert-nops", + "DebuggerInsertNops", + "true", + "Insert two nop instructions for each high level source statement" +>; + +//===----------------------------------------------------------------------===// def AMDGPUInstrInfo : InstrInfo { let guessInstructionProperties = 1; Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -95,6 +95,7 @@ int LDSBankCount; unsigned IsaVersion; bool EnableSIScheduler; + bool DebuggerInsertNops; std::unique_ptr FrameLowering; std::unique_ptr TLInfo; @@ -304,6 +305,10 @@ return EnableSIScheduler; } + bool debuggerInsertNops() const { + return DebuggerInsertNops; + } + bool dumpCode() const { return DumpCode; } Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -97,7 +97,9 @@ HasSMemRealTime(false), Has16BitInsts(false), LDSBankCount(0), IsaVersion(ISAVersion0_0_0), - EnableSIScheduler(false), FrameLowering(nullptr), + EnableSIScheduler(false), + DebuggerInsertNops(false), + FrameLowering(nullptr), GISel(), InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) { Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== 
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -31,7 +31,6 @@ #include "llvm/IR/Verifier.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/IR/LegacyPassManager.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_os_ostream.h" #include "llvm/Transforms/IPO.h" @@ -149,11 +148,6 @@ namespace { -cl::opt<bool> InsertNops( - "amdgpu-insert-nops", - cl::desc("Insert two nop instructions for each high level source statement"), - cl::init(false)); - class AMDGPUPassConfig : public TargetPassConfig { public: AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM) @@ -397,7 +391,9 @@ void GCNPassConfig::addPreEmitPass() { addPass(createSIInsertWaitsPass(), false); addPass(createSILowerControlFlowPass(), false); - if (InsertNops) { + + const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl(); + if (ST.debuggerInsertNops()) { addPass(createSIInsertNopsPass(), false); } } Index: llvm/trunk/lib/Target/AMDGPU/SIInsertNopsPass.cpp =================================================================== --- llvm/trunk/lib/Target/AMDGPU/SIInsertNopsPass.cpp +++ llvm/trunk/lib/Target/AMDGPU/SIInsertNopsPass.cpp @@ -8,14 +8,14 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief Insert two S_NOP instructions for every high level source statement. +/// \brief Insert two nop instructions for each high level source statement. /// /// Tools, such as debugger, need to pause execution based on user input (i.e. -/// breakpoint). In order to do this, two S_NOP instructions are inserted for -/// each high level source statement: one before first isa instruction of high -/// level source statement, and one after last isa instruction of high level -/// source statement. Further, debugger may replace S_NOP instructions with -/// S_TRAP instructions based on user input. +/// breakpoint). 
In order to do this, two nop instructions are inserted for each +/// high level source statement: one before first isa instruction of high level +/// source statement, and one after last isa instruction of high level source +/// statement. Further, debugger may replace nop instructions with trap +/// instructions based on user input. // //===----------------------------------------------------------------------===// @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" using namespace llvm; #define DEBUG_TYPE "si-insert-nops" @@ -53,10 +54,21 @@ } bool SIInsertNops::runOnMachineFunction(MachineFunction &MF) { + // Skip machine functions without debug info. + if (!MF.getMMI().hasDebugInfo()) { + return false; + } + + // Target instruction info. const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo()); + // Mapping from high level source statement line number to last corresponding + // isa instruction. DenseMap<unsigned, MachineBasicBlock::iterator> LineToInst; + // Insert nop instruction before first isa instruction of each high level + // source statement and collect last isa instruction for each high level + // source statement. for (auto MBB = MF.begin(); MBB != MF.end(); ++MBB) { for (auto MI = MBB->begin(); MI != MBB->end(); ++MI) { if (MI->isDebugValue() || !MI->getDebugLoc()) { @@ -74,6 +86,8 @@ } } } + // Insert nop instruction after last isa instruction of each high level source + // statement. for (auto LineToInstEntry = LineToInst.begin(); LineToInstEntry != LineToInst.end(); ++LineToInstEntry) { auto MBB = LineToInstEntry->second->getParent(); @@ -85,6 +99,7 @@ .addImm(0); } } + // Insert nop instruction before prologue. 
MachineBasicBlock &MBB = MF.front(); MachineInstr &MI = MBB.front(); BuildMI(MBB, MI, DebugLoc(), TII->get(AMDGPU::S_NOP)) Index: llvm/trunk/test/CodeGen/AMDGPU/debugger_insert_nops.ll =================================================================== --- llvm/trunk/test/CodeGen/AMDGPU/debugger_insert_nops.ll +++ llvm/trunk/test/CodeGen/AMDGPU/debugger_insert_nops.ll @@ -0,0 +1,75 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s + +; CHECK: debugger_insert_nops.cl:2:3 +; CHECK-NEXT: s_nop 0 +; CHECK: debugger_insert_nops.cl:3:3 +; CHECK-NEXT: s_nop 0 +; CHECK: debugger_insert_nops.cl:4:3 +; CHECK-NEXT: s_nop 0 +; CHECK: debugger_insert_nops.cl:5:3 +; CHECK-NEXT: s_nop 0 +; CHECK: debugger_insert_nops.cl:6:1 +; CHECK-NEXT: s_nop 0 +; CHECK-NEXT: s_endpgm + +; Function Attrs: nounwind +define void @debugger_insert_nops(i32 addrspace(1)* %A) #0 !dbg !12 { +entry: + %A.addr = alloca i32 addrspace(1)*, align 4 + store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4 + call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19 + %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20 + %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20 + store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21 + %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22 + %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22 + store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23 + %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !24 + %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %2, i32 2, !dbg !24 + store i32 3, i32 addrspace(1)* %arrayidx2, align 4, !dbg !25 + %3 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !26 + %arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %3, i32 4, !dbg !26 + store 
i32 4, i32 addrspace(1)* %arrayidx3, align 4, !dbg !27 + ret void, !dbg !28 +} + +; Function Attrs: nounwind readnone +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="fiji" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } + +!llvm.dbg.cu = !{!0} +!opencl.kernels = !{!3} +!llvm.module.flags = !{!9, !10} +!llvm.ident = !{!11} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 266620)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "debugger_insert_nops.cl", directory: "/home/kzhuravl/Sandbox") +!2 = !{} +!3 = !{void (i32 addrspace(1)*)* @debugger_insert_nops, !4, !5, !6, !7, !8} +!4 = !{!"kernel_arg_addr_space", i32 1} +!5 = !{!"kernel_arg_access_qual", !"none"} +!6 = !{!"kernel_arg_type", !"int*"} +!7 = !{!"kernel_arg_base_type", !"int*"} +!8 = !{!"kernel_arg_type_qual", !""} +!9 = !{i32 2, !"Dwarf Version", i32 4} +!10 = !{i32 2, !"Debug Info Version", i32 3} +!11 = !{!"clang version 3.9.0 (trunk 266620)"} +!12 = distinct !DISubprogram(name: "debugger_insert_nops", scope: !1, file: !1, line: 1, type: !13, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2) +!13 = !DISubroutineType(types: !14) +!14 = !{null, !15} +!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 64, align: 32) +!16 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!17 = !DILocalVariable(name: "A", arg: 1, scope: !12, file: !1, line: 1, type: !15) +!18 = !DIExpression() +!19 = !DILocation(line: 1, column: 46, scope: !12) +!20 = !DILocation(line: 2, column: 3, 
scope: !12) +!21 = !DILocation(line: 2, column: 8, scope: !12) +!22 = !DILocation(line: 3, column: 3, scope: !12) +!23 = !DILocation(line: 3, column: 8, scope: !12) +!24 = !DILocation(line: 4, column: 3, scope: !12) +!25 = !DILocation(line: 4, column: 8, scope: !12) +!26 = !DILocation(line: 5, column: 3, scope: !12) +!27 = !DILocation(line: 5, column: 8, scope: !12) +!28 = !DILocation(line: 6, column: 1, scope: !12)