AMDGPU: add the "amdgpu-debugger-insert-nops" subtarget feature.

Replaces the "amdgpu-insert-nops" command line option with a subtarget
feature that gates the SIInsertNops pass, makes the pass skip machine
functions without debug info, and adds a codegen test for the feature.

Index: lib/Target/AMDGPU/AMDGPU.td
===================================================================
--- lib/Target/AMDGPU/AMDGPU.td
+++ lib/Target/AMDGPU/AMDGPU.td
@@ -318,6 +318,17 @@
 >;
 
 //===----------------------------------------------------------------------===//
+// Debugger related subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureDebuggerInsertNops : SubtargetFeature<
+  "amdgpu-debugger-insert-nops",
+  "DebuggerInsertNops",
+  "true",
+  "Insert two nop instructions for each high level source statement"
+>;
+
+//===----------------------------------------------------------------------===//
 
 def AMDGPUInstrInfo : InstrInfo {
   let guessInstructionProperties = 1;
Index: lib/Target/AMDGPU/AMDGPUSubtarget.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -94,6 +94,7 @@
   int LDSBankCount;
   unsigned IsaVersion;
   bool EnableSIScheduler;
+  bool DebuggerInsertNops;
 
   std::unique_ptr<AMDGPUFrameLowering> FrameLowering;
   std::unique_ptr<AMDGPUTargetLowering> TLInfo;
@@ -296,6 +297,10 @@
     return EnableSIScheduler;
   }
 
+  bool debuggerInsertNops() const {
+    return DebuggerInsertNops;
+  }
+
   bool dumpCode() const {
     return DumpCode;
   }
Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -85,7 +85,9 @@
       HasSMemRealTime(false), Has16BitInsts(false),
       LDSBankCount(0),
       IsaVersion(ISAVersion0_0_0),
-      EnableSIScheduler(false), FrameLowering(nullptr),
+      EnableSIScheduler(false),
+      DebuggerInsertNops(false),
+      FrameLowering(nullptr),
       InstrItins(getInstrItineraryForCPU(GPU)), TargetTriple(TT) {
 
   initializeSubtargetDependencies(TT, GPU, FS);
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -30,7 +30,6 @@
 #include "llvm/IR/Verifier.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/IR/LegacyPassManager.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_os_ostream.h"
 #include "llvm/Transforms/IPO.h"
@@ -149,11 +148,6 @@
 
 namespace {
 
-cl::opt<bool> InsertNops(
-  "amdgpu-insert-nops",
-  cl::desc("Insert two nop instructions for each high level source statement"),
-  cl::init(false));
-
 class AMDGPUPassConfig : public TargetPassConfig {
 public:
   AMDGPUPassConfig(TargetMachine *TM, PassManagerBase &PM)
@@ -374,7 +368,9 @@
 void GCNPassConfig::addPreEmitPass() {
   addPass(createSIInsertWaitsPass(), false);
   addPass(createSILowerControlFlowPass(), false);
-  if (InsertNops) {
+
+  const AMDGPUSubtarget &ST = *getAMDGPUTargetMachine().getSubtargetImpl();
+  if (ST.debuggerInsertNops()) {
     addPass(createSIInsertNopsPass(), false);
   }
 }
Index: lib/Target/AMDGPU/SIInsertNopsPass.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertNopsPass.cpp
+++ lib/Target/AMDGPU/SIInsertNopsPass.cpp
@@ -8,14 +8,14 @@
 //===----------------------------------------------------------------------===//
 //
 /// \file
-/// \brief Insert two S_NOP instructions for every high level source statement.
+/// \brief Insert two nop instructions for each high level source statement.
 ///
 /// Tools, such as debugger, need to pause execution based on user input (i.e.
-/// breakpoint). In order to do this, two S_NOP instructions are inserted for
-/// each high level source statement: one before first isa instruction of high
-/// level source statement, and one after last isa instruction of high level
-/// source statement. Further, debugger may replace S_NOP instructions with
-/// S_TRAP instructions based on user input.
+/// breakpoint). In order to do this, two nop instructions are inserted for each
+/// high level source statement: one before first isa instruction of high level
+/// source statement, and one after last isa instruction of high level source
+/// statement. Further, debugger may replace nop instructions with trap
+/// instructions based on user input.
 //
 //===----------------------------------------------------------------------===//
 
@@ -24,6 +24,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "si-insert-nops"
@@ -53,10 +54,21 @@
 }
 
 bool SIInsertNops::runOnMachineFunction(MachineFunction &MF) {
+  // Skip machine functions without debug info.
+  if (!MF.getMMI().hasDebugInfo()) {
+    return false;
+  }
+
+  // Target instruction info.
   const SIInstrInfo *TII =
     static_cast<const SIInstrInfo*>(MF.getSubtarget().getInstrInfo());
 
+  // Mapping from high level source statement line number to last corresponding
+  // isa instruction.
   DenseMap<unsigned, MachineBasicBlock::iterator> LineToInst;
+  // Insert nop instruction before first isa instruction of each high level
+  // source statement and collect last isa instruction for each high level
+  // source statement.
   for (auto MBB = MF.begin(); MBB != MF.end(); ++MBB) {
     for (auto MI = MBB->begin(); MI != MBB->end(); ++MI) {
       if (MI->isDebugValue() || !MI->getDebugLoc()) {
@@ -74,6 +86,8 @@
       }
     }
   }
+  // Insert nop instruction after last isa instruction of each high level source
+  // statement.
   for (auto LineToInstEntry = LineToInst.begin();
        LineToInstEntry != LineToInst.end(); ++LineToInstEntry) {
     auto MBB = LineToInstEntry->second->getParent();
@@ -85,6 +99,7 @@
         .addImm(0);
     }
   }
+  // Insert nop instruction before prologue.
   MachineBasicBlock &MBB = MF.front();
   MachineInstr &MI = MBB.front();
   BuildMI(MBB, MI, DebugLoc(), TII->get(AMDGPU::S_NOP))
Index: test/CodeGen/AMDGPU/debugger_insert_nops.ll
===================================================================
--- test/CodeGen/AMDGPU/debugger_insert_nops.ll
+++ test/CodeGen/AMDGPU/debugger_insert_nops.ll
@@ -0,0 +1,66 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK: debugger_insert_nops.cl:2:3
+; CHECK-NEXT: s_nop 0
+; CHECK: debugger_insert_nops.cl:3:3
+; CHECK-NEXT: s_nop 0
+; CHECK: debugger_insert_nops.cl:4:3
+; CHECK-NEXT: s_nop 0
+
+; Function Attrs: nounwind
+define void @debugger_insert_nops(i32 addrspace(1)* %A) #0 !dbg !4 {
+entry:
+  %A.addr = alloca i32 addrspace(1)*, align 4
+  store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !18, metadata !19), !dbg !20
+  %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !21
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !21
+  store i32 777, i32 addrspace(1)* %arrayidx, align 4, !dbg !22
+  %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !23
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !23
+  store i32 888, i32 addrspace(1)* %arrayidx1, align 4, !dbg !24
+  %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !25
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %2, i32 2, !dbg !25
+  store i32 999, i32 addrspace(1)* %arrayidx2, align 4, !dbg !26
+  ret void, !dbg !27
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="fiji" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!opencl.kernels = !{!9}
+!llvm.module.flags = !{!15, !16}
+!llvm.ident = !{!17}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 266217)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, subprograms: !3)
+!1 = !DIFile(filename: "debugger_insert_nops.cl", directory: "/home/kzhuravl/Sandbox/llvm/build/bin")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "debugger_insert_nops", scope: !1, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, variables: !2)
+!5 = !DISubroutineType(types: !6)
+!6 = !{null, !7}
+!7 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64, align: 32)
+!8 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!9 = !{void (i32 addrspace(1)*)* @debugger_insert_nops, !10, !11, !12, !13, !14}
+!10 = !{!"kernel_arg_addr_space", i32 1}
+!11 = !{!"kernel_arg_access_qual", !"none"}
+!12 = !{!"kernel_arg_type", !"int*"}
+!13 = !{!"kernel_arg_base_type", !"int*"}
+!14 = !{!"kernel_arg_type_qual", !""}
+!15 = !{i32 2, !"Dwarf Version", i32 4}
+!16 = !{i32 2, !"Debug Info Version", i32 3}
+!17 = !{!"clang version 3.9.0 (trunk 266217)"}
+!18 = !DILocalVariable(name: "A", arg: 1, scope: !4, file: !1, line: 1, type: !7)
+!19 = !DIExpression()
+!20 = !DILocation(line: 1, column: 50, scope: !4)
+!21 = !DILocation(line: 2, column: 3, scope: !4)
+!22 = !DILocation(line: 2, column: 8, scope: !4)
+!23 = !DILocation(line: 3, column: 3, scope: !4)
+!24 = !DILocation(line: 3, column: 8, scope: !4)
+!25 = !DILocation(line: 4, column: 3, scope: !4)
+!26 = !DILocation(line: 4, column: 8, scope: !4)
+!27 = !DILocation(line: 5, column: 1, scope: !4)