diff --git a/bolt/include/bolt/Passes/FrameAnalysis.h b/bolt/include/bolt/Passes/FrameAnalysis.h --- a/bolt/include/bolt/Passes/FrameAnalysis.h +++ b/bolt/include/bolt/Passes/FrameAnalysis.h @@ -121,6 +121,10 @@ /// Set of functions that require the stack to be 16B aligned DenseSet FunctionsRequireAlignment; + /// Set of functions that performs computations with stack addresses and + /// complicates our understanding of aliasing of stack spaces. + DenseSet FunctionsWithStackArithmetic; + /// Owns ArgAccesses for all instructions. References to elements are /// attached to instructions as indexes to this vector, in MCAnnotations. std::vector ArgAccessesVector; @@ -183,6 +187,12 @@ return FunctionsRequireAlignment.count(&Func); } + /// Return true if \p Func does computation with the address of any stack + /// position, meaning we have limited alias analysis on this function. + bool hasStackArithmetic(const BinaryFunction &Func) const { + return FunctionsWithStackArithmetic.count(&Func); + } + /// Functions for retrieving our specific MCAnnotation data from instructions ErrorOr getArgAccessesFor(const MCInst &Inst); diff --git a/bolt/lib/Passes/FrameAnalysis.cpp b/bolt/lib/Passes/FrameAnalysis.cpp --- a/bolt/lib/Passes/FrameAnalysis.cpp +++ b/bolt/lib/Passes/FrameAnalysis.cpp @@ -38,6 +38,11 @@ static cl::opt TimeFA("time-fa", cl::desc("time frame analysis steps"), cl::ReallyHidden, cl::cat(BoltOptCategory)); +static cl::opt + ExperimentalSW("experimental-shrink-wrapping", + cl::desc("process functions with stack pointer arithmetic"), + cl::ReallyHidden, cl::ZeroOrMore, cl::cat(BoltOptCategory)); + bool shouldFrameOptimize(const llvm::bolt::BinaryFunction &Function) { if (Function.hasUnknownControlFlow()) return false; @@ -111,6 +116,7 @@ const MCInst *Prev{nullptr}; /// Info about the last frame access bool IsValidAccess{false}; + bool EscapesStackAddress{false}; FrameIndexEntry FIE; bool decodeFrameAccess(const MCInst &Inst) { @@ -124,14 +130,15 @@ return true; } - if (IsIndexed || FIE.Size == 0) { + if (IsIndexed || (!FIE.Size && (FIE.IsLoad || FIE.IsStore))) { LLVM_DEBUG(dbgs() << "Giving up on indexed memory access/unknown size\n"); LLVM_DEBUG(dbgs() << "Blame insn: "); + LLVM_DEBUG(BC.printInstruction(outs(), Inst, 0, &BF, true, false, false)); LLVM_DEBUG(Inst.dump()); return false; } - assert(FIE.Size != 0); + assert(FIE.Size != 0 || (!FIE.IsLoad && !FIE.IsStore)); FIE.RegOrImm = SrcImm; if (FIE.IsLoad || FIE.IsStoreFromReg) @@ -171,9 +178,11 @@ const FrameIndexEntry &getFIE() const { return FIE; } int getSPOffset() const { return SPOffset; } bool isValidAccess() const { return IsValidAccess; } + bool doesEscapeStackAddress() const { return EscapesStackAddress; } bool doNext(const BinaryBasicBlock &BB, const MCInst &Inst) { IsValidAccess = false; + EscapesStackAddress = false; std::tie(SPOffset, FPOffset) = Prev ? *SPT.getStateAt(*Prev) : *SPT.getStateAt(BB); Prev = &Inst; @@ -214,11 +223,14 @@ } if (BC.MIB->escapesVariable(Inst, SPT.HasFramePointer)) { - LLVM_DEBUG( - dbgs() << "Leaked stack address, giving up on this function.\n"); - LLVM_DEBUG(dbgs() << "Blame insn: "); - LLVM_DEBUG(Inst.dump()); - return false; + EscapesStackAddress = true; + if (!opts::ExperimentalSW) { + LLVM_DEBUG( + dbgs() << "Leaked stack address, giving up on this function.\n"); + LLVM_DEBUG(dbgs() << "Blame insn: "); + LLVM_DEBUG(Inst.dump()); + return false; + } } return decodeFrameAccess(Inst); @@ -405,7 +417,7 @@ FAA.enterNewBB(); for (MCInst &Inst : *BB) { - if (!FAA.doNext(*BB, Inst)) { + if (!FAA.doNext(*BB, Inst) || FAA.doesEscapeStackAddress()) { ArgsTouchedMap[&BF].emplace(std::make_pair(-1, 0)); NoInfo = true; break; @@ -475,6 +487,12 @@ dbgs() << "\t\t\tSP offset is " << FAA.getSPOffset() << "\n"; }); + if (FAA.doesEscapeStackAddress()) { + if (!FunctionsWithStackArithmetic.count(&BF)) + FunctionsWithStackArithmetic.insert(&BF); + continue; + } + if (!FAA.isValidAccess()) continue; diff --git a/bolt/lib/Passes/FrameOptimizer.cpp b/bolt/lib/Passes/FrameOptimizer.cpp --- a/bolt/lib/Passes/FrameOptimizer.cpp +++ b/bolt/lib/Passes/FrameOptimizer.cpp @@ -268,13 +268,15 @@ { NamedRegionTimer T1("removeloads", "remove loads", "FOP", "FOP breakdown", opts::TimeOpts); - removeUnnecessaryLoads(*RA, *FA, I.second); + if (!FA->hasStackArithmetic(I.second)) + removeUnnecessaryLoads(*RA, *FA, I.second); } if (opts::RemoveStores) { NamedRegionTimer T1("removestores", "remove stores", "FOP", "FOP breakdown", opts::TimeOpts); - removeUnusedStores(*FA, I.second); + if (!FA->hasStackArithmetic(I.second)) + removeUnusedStores(*FA, I.second); } // Don't even start shrink wrapping if no profiling info is available if (I.second.getKnownExecutionCount() == 0) diff --git a/bolt/lib/Passes/ShrinkWrapping.cpp b/bolt/lib/Passes/ShrinkWrapping.cpp --- a/bolt/lib/Passes/ShrinkWrapping.cpp +++ b/bolt/lib/Passes/ShrinkWrapping.cpp @@ -1066,6 +1066,14 @@ }); return false; } + if (FA.hasStackArithmetic(BF)) { + LLVM_DEBUG({ + dbgs() << "Reg " << CSR + << " is not using push/pops due to function " + "taking the address of a stack position.\n"; + }); + return false; + } for (MCInst *Save : CSA.getSavesByReg(CSR)) { if (!SLM.canCollapseRegion(Save)) { LLVM_DEBUG(dbgs() << "Reg " << CSR << " cannot collapse region.\n"); diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -1091,8 +1091,10 @@ bool IsLoad = MCII.mayLoad(); bool IsStore = MCII.mayStore(); // Is it LEA? (deals with memory but is not loading nor storing) - if (!IsLoad && !IsStore) - return false; + if (!IsLoad && !IsStore) { + I = {0, IsLoad, IsStore, false, false}; + break; + } uint8_t Sz = getMemDataSize(Inst, MemOpNo); I = {Sz, IsLoad, IsStore, false, false}; break; diff --git a/bolt/test/X86/frame-opt-lea.s b/bolt/test/X86/frame-opt-lea.s new file mode 100644 --- /dev/null +++ b/bolt/test/X86/frame-opt-lea.s @@ -0,0 +1,60 @@ +# This checks that frame optimizer does not try to optimize away caller-saved +# regs when we do not have complete aliasing info (when there is an LEA +# instruction and the function does arithmetic with stack addresses). + + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \ +# RUN: %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib +# RUN: llvm-bolt %t.exe -relocs -o %t.out -data %t.fdata \ +# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \ +# RUN: -lite=0 -eliminate-unreachable=false | FileCheck %s +# RUN: llvm-objdump -d %t.out --print-imm-hex | \ +# RUN: FileCheck --check-prefix CHECK-OBJDUMP %s + + .globl foo + .type foo, %function +foo: + .cfi_startproc + movq $0, (%rsi) + ret + .cfi_endproc + .size foo, .-foo + + + .globl _start + .type _start, %function +_start: + .cfi_startproc +# FDATA: 0 [unknown] 0 1 _start 0 0 1 + push %rbp + mov %rsp, %rbp + subq $0x20, %rsp + je b +c: + addq $0x20, %rsp + pop %rbp + ret +b: + movq %rdi, %r13 + movq %r13, -0x08(%rbp) + leaq -0x08(%rbp), %rsi + callq foo + movq -0x08(%rbp), %r13 + jmp c + .cfi_endproc + .size _start, .-_start + + +# CHECK: BOLT-INFO: FOP deleted 0 load(s) (dyn count: 0) and 0 store(s) + +# CHECK-OBJDUMP: <_start>: +# CHECK-OBJDUMP: movq %rdi, %r13 +# CHECK-OBJDUMP-NEXT: movq %r13, -0x8(%rbp) +# CHECK-OBJDUMP-NEXT: leaq +# CHECK-OBJDUMP-NEXT: callq +# CHECK-OBJDUMP-NEXT: movq -0x8(%rbp), %r13 diff --git a/bolt/test/X86/shrinkwrapping-lea.s b/bolt/test/X86/shrinkwrapping-lea.s new file mode 100644 --- /dev/null +++ b/bolt/test/X86/shrinkwrapping-lea.s @@ -0,0 +1,65 @@ +# This checks that shrink wrapping correctly drops moving push/pops when +# there is an LEA instruction. + + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \ +# RUN: %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: ld.lld %t.o -o %t.exe -q +# RUN: llvm-bolt %t.exe -relocs -o %t.out -data %t.fdata \ +# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \ +# RUN: -experimental-shrink-wrapping \ +# RUN: -eliminate-unreachable=false | FileCheck %s +# RUN: llvm-objdump -d %t.out --print-imm-hex | \ +# RUN: FileCheck --check-prefix CHECK-OBJDUMP %s + + .globl _start + .type _start, %function +_start: + .cfi_startproc +# FDATA: 0 [unknown] 0 1 _start 0 0 1 + push %rbp + mov %rsp, %rbp + push %rbx + push %r14 + subq $0x20, %rsp + je b +c: + addq $0x20, %rsp + pop %r14 + pop %rbx + pop %rbp + ret +b: + je f + jmp *JT(,%rdi,8) +d: + mov %r14, %rdi + mov %rbx, %rdi + leaq -0x20(%rbp), %r14 + movq -0x20(%rbp), %rdi +f: + addq $0x20, %rsp + pop %r14 + pop %rbx + pop %rbp + ret + .cfi_endproc + .size _start, .-_start + .data +JT: + .quad c + .quad d + .quad f + + +# CHECK: BOLT-INFO: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops + +# Checks that offsets of instructions accessing the stack were not changed +# CHECK-OBJDUMP: <_start>: +# CHECK-OBJDUMP: movq %rbx, %rdi +# CHECK-OBJDUMP-NEXT: leaq -0x20(%rbp), %r14 +# CHECK-OBJDUMP: movq -0x20(%rbp), %rdi diff --git a/bolt/test/X86/shrinkwrapping-mov.s b/bolt/test/X86/shrinkwrapping-mov.s new file mode 100644 --- /dev/null +++ b/bolt/test/X86/shrinkwrapping-mov.s @@ -0,0 +1,58 @@ +# This checks that shrink wrapping correctly drops moving push/pops when +# there is a MOV instruction loading the value of the stack pointer in +# order to do pointer arithmetic with a stack address. + + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \ +# RUN: %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: ld.lld %t.o -o %t.exe -q +# RUN: llvm-bolt %t.exe -relocs -o %t.out -data %t.fdata \ +# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \ +# RUN: -experimental-shrink-wrapping \ +# RUN: -eliminate-unreachable=false | FileCheck %s + + .globl _start + .type _start, %function +_start: + .cfi_startproc +# FDATA: 0 [unknown] 0 1 _start 0 0 1 + push %rbp + mov %rsp, %rbp + push %rbx + push %r14 + subq $0x20, %rsp + je b +c: + addq $0x20, %rsp + pop %r14 + pop %rbx + pop %rbp + ret +b: + je f + jmp *JT(,%rdi,8) +d: + mov %r14, %rdi + mov %rbx, %rdi + mov %rbp, %rdi + sub 0x20, %rdi +f: + addq $0x20, %rsp + pop %r14 + pop %rbx + pop %rbp + ret + .cfi_endproc + .size _start, .-_start + .data +JT: + .quad c + .quad d + .quad f + + +# CHECK: BOLT-INFO: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops diff --git a/bolt/test/X86/shrinkwrapping-popf.s b/bolt/test/X86/shrinkwrapping-popf.s --- a/bolt/test/X86/shrinkwrapping-popf.s +++ b/bolt/test/X86/shrinkwrapping-popf.s @@ -21,6 +21,7 @@ pushf push %rbx push %rbp + addq $0x50, -0x30(%rbp) pop %rbp pop %rbx popf diff --git a/bolt/test/X86/shrinkwrapping-restore-position.s b/bolt/test/X86/shrinkwrapping-restore-position.s new file mode 100644 --- /dev/null +++ b/bolt/test/X86/shrinkwrapping-restore-position.s @@ -0,0 +1,65 @@ +# This checks that shrink wrapping uses the red zone defined in the X86 ABI by +# placing restores that access elements already deallocated by the stack. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple x86_64-unknown-unknown \ +# RUN: %s -o %t.o +# RUN: link_fdata %s %t.o %t.fdata +# RUN: llvm-strip --strip-unneeded %t.o +# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib +# RUN: llvm-bolt -relocs %t.exe -o %t.out -data %t.fdata \ +# RUN: -frame-opt=all -simplify-conditional-tail-calls=false \ +# RUN: -experimental-shrink-wrapping \ +# RUN: -eliminate-unreachable=false | FileCheck %s +# RUN: llvm-objdump -d %t.out --print-imm-hex | \ +# RUN: FileCheck --check-prefix CHECK-OBJDUMP %s + + +# Here we create a CFG where the restore position matches the previous (deleted) +# restore position. Shrink wrapping then will put a stack access to an element +# that was deallocated at the previously deleted POP, which falls in the red +# zone and should be safe for X86 Linux ABI. + .globl _start + .type _start, %function +_start: + .cfi_startproc +# FDATA: 0 [unknown] 0 1 _start 0 0 1 + push %rbp + mov %rsp, %rbp + push %rbx + push %r14 + subq $0x20, %rsp +b: je hot_path +# FDATA: 1 _start #b# 1 _start #hot_path# 0 1 +cold_path: + mov %r14, %rdi + mov %rbx, %rdi + movq rel(%rip), %rdi # Add this to create a relocation and run bolt w/ relocs + leaq -0x20(%rbp), %r14 + movq -0x20(%rbp), %rdi + leaq -0x10(%rbp), %rsp + pop %r14 + pop %rbx + pop %rbp + ret +hot_path: + addq $0x20, %rsp + pop %r14 + pop %rbx + pop %rbp + ret + .cfi_endproc + .size _start, .-_start + + .data +rel: .quad cold_path + +# CHECK: BOLT-INFO: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops + +# CHECK-OBJDUMP: <_start>: +# CHECK-OBJDUMP: leaq (%rbp), %rsp +# CHECK-OBJDUMP-NEXT: popq %rbp +# CHECK-OBJDUMP-NEXT: movq -0x10(%rsp), %rbx +# CHECK-OBJDUMP-NEXT: movq -0x18(%rsp), %r14 +# CHECK-OBJDUMP-NEXT: retq