diff --git a/llvm/include/llvm/CodeGen/FastISel.h b/llvm/include/llvm/CodeGen/FastISel.h --- a/llvm/include/llvm/CodeGen/FastISel.h +++ b/llvm/include/llvm/CodeGen/FastISel.h @@ -565,8 +565,7 @@ MachineInstr *FirstTerminator = nullptr; unsigned FirstTerminatorOrder = std::numeric_limits::max(); - void initialize(MachineBasicBlock *MBB, - MachineBasicBlock::iterator LastFlushPoint); + void initialize(MachineBasicBlock *MBB); }; /// Sinks the local value materialization instruction LocalMI to its first use diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -242,8 +242,7 @@ /// Build a map of instruction orders. Return the first terminator and its /// order. Consider EH_LABEL instructions to be terminators as well, since local /// values for phis after invokes must be materialized before the call. -void FastISel::InstOrderMap::initialize( - MachineBasicBlock *MBB, MachineBasicBlock::iterator LastFlushPoint) { +void FastISel::InstOrderMap::initialize(MachineBasicBlock *MBB) { unsigned Order = 0; for (MachineInstr &I : *MBB) { if (!FirstTerminator && @@ -252,28 +251,37 @@ FirstTerminatorOrder = Order; } Orders[&I] = Order++; - - // We don't need to order instructions past the last flush point. - if (I.getIterator() == LastFlushPoint) - break; } } void FastISel::sinkLocalValueMaterialization(MachineInstr &LocalMI, Register DefReg, InstOrderMap &OrderMap) { - // If this register is used by a register fixup, MRI will not contain all - // the uses until after register fixups, so don't attempt to sink or DCE - // this instruction. Register fixups typically come from no-op cast - // instructions, which replace the cast instruction vreg with the local + // If DefReg is used by a register fixup, MRI will not directly describe + // all the uses until after register fixups. Chase down what those fixups + // would be and look for all of them when determining whether we can sink + // this instruction. Register fixups typically come from no-op cast or + // GEP instructions, which replace the instruction vreg with the local // value vreg. - if (FuncInfo.RegsWithFixups.count(DefReg)) - return; + SmallVector Regs; + Regs.push_back(DefReg); + + // It's sad we have to iterate the entire map for each fixup. + // But it's probably no cheaper to maintain an inverse map, especially + // as we can have DefReg show up more than once as a fixup target. + // Note: Checking size() each iteration as we're adding entries. + for (size_t RegsIndex = 0; RegsIndex < Regs.size(); ++RegsIndex) { + if (FuncInfo.RegsWithFixups.count(Regs[RegsIndex])) + for (auto &Fixup : FuncInfo.RegFixups) + if (Fixup.getSecond() == Regs[RegsIndex]) + Regs.push_back(Fixup.getFirst()); + } - // We can DCE this instruction if there are no uses and it wasn't a - // materialized for a successor PHI node. + // We can DCE this instruction if there are no uses, within or outside of + // this block. If it has fixups, it has uses; without fixups, it might be + // used locally, or by a PHI in a successor. bool UsedByPHI = isRegUsedByPhiNodes(DefReg, FuncInfo); - if (!UsedByPHI && MRI.use_nodbg_empty(DefReg)) { + if (Regs.size() == 1 && !UsedByPHI && MRI.use_nodbg_empty(DefReg)) { if (EmitStartPt == &LocalMI) EmitStartPt = EmitStartPt->getPrevNode(); LLVM_DEBUG(dbgs() << "removing dead local value materialization " @@ -286,19 +294,27 @@ // Number the instructions if we haven't yet so we can efficiently find the // earliest use. if (OrderMap.Orders.empty()) - OrderMap.initialize(FuncInfo.MBB, LastFlushPoint); + OrderMap.initialize(FuncInfo.MBB); // Find the first user in the BB. MachineInstr *FirstUser = nullptr; unsigned FirstOrder = std::numeric_limits::max(); - for (MachineInstr &UseInst : MRI.use_nodbg_instructions(DefReg)) { - auto I = OrderMap.Orders.find(&UseInst); - assert(I != OrderMap.Orders.end() && - "local value used by instruction outside local region"); - unsigned UseOrder = I->second; - if (UseOrder < FirstOrder) { - FirstOrder = UseOrder; - FirstUser = &UseInst; + for (Register R : Regs) { + for (MachineInstr &UseInst : MRI.use_nodbg_instructions(R)) { + auto I = OrderMap.Orders.find(&UseInst); + // Ignore uses outside this BB; they will be other uses of the + // value produced by the IR instruction, and this BB necessarily + // dominates all those uses. + if (I == OrderMap.Orders.end()) { + assert(Regs.size() > 1 && + "local value used by instruction outside local region"); + continue; + } + unsigned UseOrder = I->second; + if (UseOrder < FirstOrder) { + FirstOrder = UseOrder; + FirstUser = &UseInst; + } } } @@ -306,25 +322,29 @@ // whichever came first. If there was no terminator, this must be a // fallthrough block and the insertion point is the end of the block. MachineBasicBlock::instr_iterator SinkPos; - if (UsedByPHI && OrderMap.FirstTerminatorOrder < FirstOrder) { + if ((UsedByPHI || Regs.size() > 1) && + OrderMap.FirstTerminatorOrder < FirstOrder) { FirstOrder = OrderMap.FirstTerminatorOrder; SinkPos = OrderMap.FirstTerminator->getIterator(); } else if (FirstUser) { SinkPos = FirstUser->getIterator(); } else { - assert(UsedByPHI && "must be users if not used by a phi"); + assert((UsedByPHI || Regs.size() > 1) && + "must be users if not used outside the block"); SinkPos = FuncInfo.MBB->instr_end(); } // Collect all DBG_VALUEs before the new insertion position so that we can // sink them. SmallVector DbgValues; - for (MachineInstr &DbgVal : MRI.use_instructions(DefReg)) { - if (!DbgVal.isDebugValue()) - continue; - unsigned UseOrder = OrderMap.Orders[&DbgVal]; - if (UseOrder < FirstOrder) - DbgValues.push_back(&DbgVal); + for (Register R : Regs) { + for (MachineInstr &DbgVal : MRI.use_instructions(R)) { + if (!DbgVal.isDebugValue()) + continue; + unsigned UseOrder = OrderMap.Orders[&DbgVal]; + if (UseOrder < FirstOrder) + DbgValues.push_back(&DbgVal); + } } // Sink LocalMI before SinkPos and assign it the same DebugLoc. diff --git a/llvm/test/CodeGen/X86/sink-instr-value.ll b/llvm/test/CodeGen/X86/sink-instr-value.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/sink-instr-value.ll @@ -0,0 +1,224 @@ +; RUN: %llc_dwarf -O0 < %s | FileCheck %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +;; Show that sinking an instruction value to its first use can improve +;; line tables. This kind of instruction is handled as a "local value" +;; and so we lose the original source location; moving it to the first +;; use can give it back a reasonable location. + +;; Test functions derived from the following, using clang -g1. +;; +;; int other(char *arr, int i); +;; int another(char *arr); +;; +;; int test1(int a, int b) { +;; char nums[10]; +;; int c = a + b; +;; return other(nums, c); +;; } +;; +;; int test2(int a, int b) { +;; char nums[10]; +;; int c = 10; +;; if (a) +;; c = a + b; +;; return other(nums, c); +;; } +;; +;; int test3() { +;; char a[10]; +;; char b[10]; +;; return other(a, another(b)); +;; } +;; +;; struct C { char *d; char *e; }; +;; C test4(char *b, char *p) { +;; return { another(b) ? p : b, +;; p}; +;; } + +;; Show leaq being moved out of the prologue. +; +; CHECK-LABEL: _Z5test1ii: +; CHECK-NOT: leaq +; CHECK: .loc 1 7 10 +; CHECK-NEXT: leaq +; CHECK-NEXT: callq +; +define dso_local i32 @_Z5test1ii(i32 %a, i32 %b) !dbg !7 { +entry: + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %nums = alloca [10 x i8], align 1 + %c = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 %b, i32* %b.addr, align 4 + %0 = load i32, i32* %a.addr, align 4, !dbg !9 + %1 = load i32, i32* %b.addr, align 4, !dbg !10 + %add = add nsw i32 %0, %1, !dbg !11 + store i32 %add, i32* %c, align 4, !dbg !12 + %arraydecay = getelementptr inbounds [10 x i8], [10 x i8]* %nums, i64 0, i64 0, !dbg !13 + %2 = load i32, i32* %c, align 4, !dbg !14 + %call = call i32 @_Z5otherPci(i8* %arraydecay, i32 %2), !dbg !15 + ret i32 %call, !dbg !16 +} + + +;; Show an leaq being moved across a call (past the previous flush point). +;; This means we need to consider uses in the entire block. +; +; CHECK-LABEL: _Z5test2ii: +; CHECK: .loc 1 15 22 +; CHECK-NEXT: movl +; CHECK-NEXT: .loc 1 15 10 +; CHECK-NEXT: leaq +; CHECK-NEXT: callq +; +define dso_local i32 @_Z5test2ii(i32 %a, i32 %b) #0 !dbg !17 { +entry: + %a.addr = alloca i32, align 4 + %b.addr = alloca i32, align 4 + %nums = alloca [10 x i8], align 1 + %c = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + store i32 %b, i32* %b.addr, align 4 + store i32 10, i32* %c, align 4, !dbg !18 + %0 = load i32, i32* %a.addr, align 4, !dbg !19 + %tobool = icmp ne i32 %0, 0, !dbg !19 + br i1 %tobool, label %if.then, label %if.end, !dbg !19 + +if.then: ; preds = %entry + %1 = load i32, i32* %a.addr, align 4, !dbg !20 + %2 = load i32, i32* %b.addr, align 4, !dbg !21 + %add = add nsw i32 %1, %2, !dbg !22 + store i32 %add, i32* %c, align 4, !dbg !23 + br label %if.end, !dbg !24 + +if.end: ; preds = %if.then, %entry + %arraydecay = getelementptr inbounds [10 x i8], [10 x i8]* %nums, i64 0, i64 0, !dbg !25 + %3 = load i32, i32* %c, align 4, !dbg !26 + %call = call i32 @_Z5otherPci(i8* %arraydecay, i32 %3), !dbg !27 + ret i32 %call, !dbg !28 +} + +;; Show an leaq being moved across a call (past the previous flush point). +;; This means we need to consider uses in the entire block. +; +; CHECK-LABEL: _Z5test3v: +; CHECK-NOT: leaq +; CHECK: .loc 1 21 19 +; CHECK-NEXT: leaq +; CHECK-NEXT: callq _Z7anotherPc +; CHECK: .loc 1 21 10 +; CHECK-NEXT: leaq +; CHECK-NEXT: callq _Z5otherPci +; +define dso_local i32 @_Z5test3v() #0 !dbg !29 { +entry: + %a = alloca [10 x i8], align 1 + %b = alloca [10 x i8], align 1 + %arraydecay = getelementptr inbounds [10 x i8], [10 x i8]* %a, i64 0, i64 0, !dbg !30 + %arraydecay1 = getelementptr inbounds [10 x i8], [10 x i8]* %b, i64 0, i64 0, !dbg !31 + %call = call i32 @_Z7anotherPc(i8* %arraydecay1), !dbg !32 + %call2 = call i32 @_Z5otherPci(i8* %arraydecay, i32 %call), !dbg !33 + ret i32 %call2, !dbg !34 +} + + +;; Show a case where the original instruction has no uses at all in its +;; block, so we move it to the end. This can save spill/restore cost. +; +; CHECK-LABEL: _Z5test4PcS_: +; CHECK-NOT: leaq +; CHECK: .loc 1 26 13 +; CHECK-NEXT: callq _Z7anotherPc +; CHECK-NEXT: cmpl +; CHECK-NEXT: leaq +; +%struct.C = type { i8*, i8* } + +define dso_local { i8*, i8* } @_Z5test4PcS_(i8* %b, i8* %p) #0 !dbg !35 { +entry: + %retval = alloca %struct.C, align 8 + %b.addr = alloca i8*, align 8 + %p.addr = alloca i8*, align 8 + store i8* %b, i8** %b.addr, align 8 + store i8* %p, i8** %p.addr, align 8 + %d = getelementptr inbounds %struct.C, %struct.C* %retval, i32 0, i32 0, !dbg !36 + %0 = load i8*, i8** %b.addr, align 8, !dbg !37 + %call = call i32 @_Z7anotherPc(i8* %0), !dbg !38 + %tobool = icmp ne i32 %call, 0, !dbg !38 + br i1 %tobool, label %cond.true, label %cond.false, !dbg !38 + +cond.true: ; preds = %entry + %1 = load i8*, i8** %p.addr, align 8, !dbg !39 + br label %cond.end, !dbg !38 + +cond.false: ; preds = %entry + %2 = load i8*, i8** %b.addr, align 8, !dbg !40 + br label %cond.end, !dbg !38 + +cond.end: ; preds = %cond.false, %cond.true + %cond = phi i8* [ %1, %cond.true ], [ %2, %cond.false ], !dbg !38 + store i8* %cond, i8** %d, align 8, !dbg !36 + %e = getelementptr inbounds %struct.C, %struct.C* %retval, i32 0, i32 1, !dbg !36 + %3 = load i8*, i8** %p.addr, align 8, !dbg !41 + store i8* %3, i8** %e, align 8, !dbg !36 + %4 = bitcast %struct.C* %retval to { i8*, i8* }*, !dbg !42 + %5 = load { i8*, i8* }, { i8*, i8* }* %4, align 8, !dbg !42 + ret { i8*, i8* } %5, !dbg !42 +} + +declare dso_local i32 @_Z5otherPci(i8*, i32) #1 +declare dso_local i32 @_Z7anotherPc(i8*) #1 + + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 12.0.0 (ALL clang version 99.99.0.57224 b9d61c39 debug)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "test4.cpp", directory: "/home/probinson/projects/scratch") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 12.0.0 (ALL clang version 99.99.0.57224 b9d61c39 debug)"} +!7 = distinct !DISubprogram(name: "test1", scope: !1, file: !1, line: 4, type: !8, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !2) +!9 = !DILocation(line: 6, column: 11, scope: !7) +!10 = !DILocation(line: 6, column: 15, scope: !7) +!11 = !DILocation(line: 6, column: 13, scope: !7) +!12 = !DILocation(line: 6, column: 7, scope: !7) +!13 = !DILocation(line: 7, column: 16, scope: !7) +!14 = !DILocation(line: 7, column: 22, scope: !7) +!15 = !DILocation(line: 7, column: 10, scope: !7) +!16 = !DILocation(line: 7, column: 3, scope: !7) +!17 = distinct !DISubprogram(name: "test2", scope: !1, file: !1, line: 10, type: !8, scopeLine: 10, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!18 = !DILocation(line: 12, column: 7, scope: !17) +!19 = !DILocation(line: 13, column: 7, scope: !17) +!20 = !DILocation(line: 14, column: 9, scope: !17) +!21 = !DILocation(line: 14, column: 13, scope: !17) +!22 = !DILocation(line: 14, column: 11, scope: !17) +!23 = !DILocation(line: 14, column: 7, scope: !17) +!24 = !DILocation(line: 14, column: 5, scope: !17) +!25 = !DILocation(line: 15, column: 16, scope: !17) +!26 = !DILocation(line: 15, column: 22, scope: !17) +!27 = !DILocation(line: 15, column: 10, scope: !17) +!28 = !DILocation(line: 15, column: 3, scope: !17) +!29 = distinct !DISubprogram(name: "test3", scope: !1, file: !1, line: 18, type: !8, scopeLine: 18, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!30 = !DILocation(line: 21, column: 16, scope: !29) +!31 = !DILocation(line: 21, column: 27, scope: !29) +!32 = !DILocation(line: 21, column: 19, scope: !29) +!33 = !DILocation(line: 21, column: 10, scope: !29) +!34 = !DILocation(line: 21, column: 3, scope: !29) +!35 = distinct !DISubprogram(name: "test4", scope: !1, file: !1, line: 25, type: !8, scopeLine: 25, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!36 = !DILocation(line: 26, column: 11, scope: !35) +!37 = !DILocation(line: 26, column: 21, scope: !35) +!38 = !DILocation(line: 26, column: 13, scope: !35) +!39 = !DILocation(line: 26, column: 26, scope: !35) +!40 = !DILocation(line: 26, column: 30, scope: !35) +!41 = !DILocation(line: 27, column: 13, scope: !35) +!42 = !DILocation(line: 26, column: 3, scope: !35)