Index: llvm/trunk/include/llvm/CodeGen/MachineModuleInfo.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/MachineModuleInfo.h
+++ llvm/trunk/include/llvm/CodeGen/MachineModuleInfo.h
@@ -245,6 +245,11 @@
   bool hasDebugInfo() const { return DbgInfoAvailable; }
   void setDebugInfoAvailability(bool avail) { DbgInfoAvailable = avail; }
 
+  /// Returns true if we need to generate precise CFI. Currently
+  /// this is equivalent to hasDebugInfo(), but if we ever implement
+  /// async EH, it will require precise CFI as well.
+  bool usePreciseUnwindInfo() const { return hasDebugInfo(); }
+
   bool callsEHReturn() const { return CallsEHReturn; }
   void setCallsEHReturn(bool b) { CallsEHReturn = b; }
 
Index: llvm/trunk/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ llvm/trunk/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -216,6 +216,9 @@
   case MCCFIInstruction::OpDefCfaOffset:
     OutStreamer->EmitCFIDefCfaOffset(Inst.getOffset());
     break;
+  case MCCFIInstruction::OpAdjustCfaOffset:
+    OutStreamer->EmitCFIAdjustCfaOffset(Inst.getOffset());
+    break;
   case MCCFIInstruction::OpDefCfa:
     OutStreamer->EmitCFIDefCfa(Inst.getRegister(), Inst.getOffset());
     break;
Index: llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
+++ llvm/trunk/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -103,7 +103,8 @@
   const char *getPassName() const override { return "X86 Optimize Call Frame"; }
 
   const TargetInstrInfo *TII;
-  const TargetFrameLowering *TFL;
+  const X86FrameLowering *TFL;
+  const X86Subtarget *STI;
   const MachineRegisterInfo *MRI;
   static char ID;
 };
@@ -127,13 +128,15 @@
   // No point in running this in 64-bit mode, since some arguments are
   // passed in-register in all common calling conventions, so the pattern
   // we're looking for will never match.
-  const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
-  if (STI.is64Bit())
+  if (STI->is64Bit())
     return false;
 
-  // We can't encode multiple DW_CFA_GNU_args_size in the compact
-  // unwind encoding that Darwin uses.
-  if (STI.isTargetDarwin() && !MF.getMMI().getLandingPads().empty())
+  // We can't encode multiple DW_CFA_GNU_args_size or DW_CFA_def_cfa_offset
+  // in the compact unwind encoding that Darwin uses. So, bail if there
+  // is a danger of that being generated.
+  if (STI->isTargetDarwin() && 
+     (!MF.getMMI().getLandingPads().empty() || 
+       (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF))))
     return false;
 
   // You would expect straight-line code between call-frame setup and
@@ -216,8 +219,9 @@
 }
 
 bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
-  TII = MF.getSubtarget().getInstrInfo();
-  TFL = MF.getSubtarget().getFrameLowering();
+  STI = &MF.getSubtarget<X86Subtarget>();
+  TII = STI->getInstrInfo();
+  TFL = STI->getFrameLowering();
   MRI = &MF.getRegInfo();
 
   if (!isLegal(MF))
@@ -312,7 +316,7 @@
   // Check that this particular call sequence is amenable to the
   // transformation.
   const X86RegisterInfo &RegInfo = *static_cast<const X86RegisterInfo *>(
-                                       MF.getSubtarget().getRegisterInfo());
+                                       STI->getRegisterInfo());
   unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
 
   // We expect to enter this at the beginning of a call sequence
@@ -455,6 +459,7 @@
   for (int Idx = (Context.ExpectedDist / 4) - 1; Idx >= 0; --Idx) {
     MachineBasicBlock::iterator MOV = *Context.MovVector[Idx];
     MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+    MachineBasicBlock::iterator Push = nullptr;
     if (MOV->getOpcode() == X86::MOV32mi) {
       unsigned PushOpcode = X86::PUSHi32;
       // If the operand is a small (8-bit) immediate, we can use a
@@ -466,21 +471,20 @@
         if (isInt<8>(Val))
           PushOpcode = X86::PUSH32i8;
       }
-      BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).addOperand(PushOp);
+      Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
+          .addOperand(PushOp);
     } else {
       unsigned int Reg = PushOp.getReg();
 
       // If PUSHrmm is not slow on this target, try to fold the source of the
       // push into the instruction.
-      const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
-      bool SlowPUSHrmm = ST.isAtom() || ST.isSLM();
+      bool SlowPUSHrmm = STI->isAtom() || STI->isSLM();
 
       // Check that this is legal to fold. Right now, we're extremely
       // conservative about that.
       MachineInstr *DefMov = nullptr;
       if (!SlowPUSHrmm && (DefMov = canFoldIntoRegPush(FrameSetup, Reg))) {
-        MachineInstr *Push =
-            BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm));
+        Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32rmm));
 
         unsigned NumOps = DefMov->getDesc().getNumOperands();
         for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
@@ -488,12 +492,18 @@
 
         DefMov->eraseFromParent();
       } else {
-        BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r))
+        Push = BuildMI(MBB, Context.Call, DL, TII->get(X86::PUSH32r))
             .addReg(Reg)
             .getInstr();
       }
     }
 
+    // For debugging, when using SP-based CFA, we need to adjust the CFA
+    // offset after each push.
+    if (!TFL->hasFP(MF) && MF.getMMI().usePreciseUnwindInfo())
+      TFL->BuildCFI(MBB, std::next(Push), DL, 
+                    MCCFIInstruction::createAdjustCfaOffset(nullptr, 4));
+
     MBB.erase(MOV);
   }
 
Index: llvm/trunk/lib/Target/X86/X86FrameLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86FrameLowering.h
+++ llvm/trunk/lib/Target/X86/X86FrameLowering.h
@@ -125,13 +125,13 @@
   /// \p MBB will be correctly handled by the target.
   bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
 
-private:
-  uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
-
   /// Wraps up getting a CFI index and building a MachineInstr for it.
   void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                 DebugLoc DL, MCCFIInstruction CFIInst) const;
 
+private:
+  uint64_t calculateMaxStackAlign(const MachineFunction &MF) const;
+
   /// Aligns the stack pointer by ANDing it with -MaxAlign.
   void BuildStackAlignAND(MachineBasicBlock &MBB,
                           MachineBasicBlock::iterator MBBI, DebugLoc DL,
Index: llvm/trunk/lib/Target/X86/X86FrameLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86FrameLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86FrameLowering.cpp
@@ -2105,18 +2105,23 @@
     unsigned StackAlign = getStackAlignment();
     Amount = RoundUpToAlignment(Amount, StackAlign);
 
+    MachineModuleInfo &MMI = MF.getMMI();
+    const Function *Fn = MF.getFunction();
+    bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+    bool DwarfCFI = !WindowsCFI && 
+                    (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
+
     // If we have any exception handlers in this function, and we adjust
-    // the SP before calls, we may need to indicate this to the unwinder,
-    // using GNU_ARGS_SIZE. Note that this may be necessary
-    // even when Amount == 0, because the preceding function may have
-    // set a non-0 GNU_ARGS_SIZE.
+    // the SP before calls, we may need to indicate this to the unwinder
+    // using GNU_ARGS_SIZE. Note that this may be necessary even when
+    // Amount == 0, because the preceding function may have set a non-0
+    // GNU_ARGS_SIZE.
     // TODO: We don't need to reset this between subsequent functions,
     // if it didn't change.
-    bool HasDwarfEHHandlers =
-      !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
-      !MF.getMMI().getLandingPads().empty();
+    bool HasDwarfEHHandlers = !WindowsCFI &&
+                              !MF.getMMI().getLandingPads().empty();
 
-    if (HasDwarfEHHandlers && !isDestroy && 
+    if (HasDwarfEHHandlers && !isDestroy &&
         MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences())
       BuildCFI(MBB, I, DL,
                MCCFIInstruction::createGnuArgsSize(nullptr, Amount));
@@ -2128,15 +2133,37 @@
     // (Pushes of argument for frame setup, callee pops for frame destroy)
     Amount -= InternalAmt;
 
+    // If this is a callee-pop calling convention, and we're emitting precise
+    // SP-based CFI, emit a CFA adjust for the amount the callee popped.
+    if (isDestroy && InternalAmt && DwarfCFI && !hasFP(MF) && 
+        MMI.usePreciseUnwindInfo())
+      BuildCFI(MBB, I, DL, 
+               MCCFIInstruction::createAdjustCfaOffset(nullptr, -InternalAmt));
+
     if (Amount) {
       // Add Amount to SP to destroy a frame, and subtract to setup.
       int Offset = isDestroy ? Amount : -Amount;
 
-      if (!(MF.getFunction()->optForMinSize() && 
+      if (!(Fn->optForMinSize() && 
             adjustStackWithPops(MBB, I, DL, Offset)))
         BuildStackAdjustment(MBB, I, DL, Offset, /*InEpilogue=*/false);
     }
 
+    if (DwarfCFI && !hasFP(MF)) {
+      // If we don't have FP, but need to generate unwind information,
+      // we need to set the correct CFA offset after the stack adjustment.
+      // How much we adjust the CFA offset depends on whether we're emitting
+      // CFI only for EH purposes or for debugging. EH only requires the CFA
+      // offset to be correct at each call site, while for debugging we want
+      // it to be more precise.
+      int CFAOffset = Amount;
+      if (!MMI.usePreciseUnwindInfo())
+        CFAOffset += InternalAmt;
+      CFAOffset = isDestroy ? -CFAOffset : CFAOffset;
+      BuildCFI(MBB, I, DL, 
+               MCCFIInstruction::createAdjustCfaOffset(nullptr, CFAOffset));
+    }
+
     return;
   }
 
Index: llvm/trunk/test/CodeGen/X86/debugloc-argsize.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/debugloc-argsize.ll
+++ llvm/trunk/test/CodeGen/X86/debugloc-argsize.ll
@@ -30,7 +30,7 @@
 
 declare void @__cxa_end_catch()
 
-attributes #0 = { optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { optsize "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { optsize }
 attributes #2 = { nounwind }
 
Index: llvm/trunk/test/CodeGen/X86/fold-push.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/fold-push.ll
+++ llvm/trunk/test/CodeGen/X86/fold-push.ll
@@ -3,7 +3,7 @@
 
 declare void @foo(i32 %r)
 
-define void @test(i32 %a, i32 %b) optsize {
+define void @test(i32 %a, i32 %b) optsize nounwind {
 ; CHECK-LABEL: test:
 ; CHECK: movl [[EAX:%e..]], (%esp)
 ; CHECK-NEXT: pushl [[EAX]]
@@ -22,7 +22,7 @@
   ret void
 }
 
-define void @test_min(i32 %a, i32 %b) minsize {
+define void @test_min(i32 %a, i32 %b) minsize nounwind {
 ; CHECK-LABEL: test_min:
 ; CHECK: movl [[EAX:%e..]], (%esp)
 ; CHECK-NEXT: pushl [[EAX]]
Index: llvm/trunk/test/CodeGen/X86/pop-stack-cleanup.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/pop-stack-cleanup.ll
+++ llvm/trunk/test/CodeGen/X86/pop-stack-cleanup.ll
@@ -9,7 +9,7 @@
 declare void @param8(i64, i64, i64, i64, i64, i64, i64, i64)
 
 
-define void @test() minsize {
+define void @test() minsize nounwind {
 ; CHECK-LABEL: test:
 ; CHECK: calll _param1
 ; CHECK-NEXT: popl %eax
@@ -48,7 +48,7 @@
   ret void
 }
 
-define void @spill(i32 inreg %a, i32 inreg %b, i32 inreg %c) minsize {
+define void @spill(i32 inreg %a, i32 inreg %b, i32 inreg %c) minsize nounwind {
 ; CHECK-LABEL: spill:
 ; CHECK-DAG: movl %ecx,
 ; CHECK-DAG: movl %edx,
@@ -63,7 +63,7 @@
   ret void
 }
 
-define void @test_linux64(i32 %size) minsize {
+define void @test_linux64(i32 %size) minsize nounwind {
 ; LINUX64-LABEL: test_linux64:
 ; LINUX64: pushq %rbp
 ; LINUX64: callq param8
Index: llvm/trunk/test/CodeGen/X86/push-cfi-debug.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/push-cfi-debug.ll
+++ llvm/trunk/test/CodeGen/X86/push-cfi-debug.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -mtriple=i686-pc-linux | FileCheck %s
+
+
+; Function Attrs: optsize
+declare void @foo(i32, i32) #0
+declare x86_stdcallcc void @stdfoo(i32, i32) #0
+
+; CHECK-LABEL: test1:
+; CHECK: subl $8, %esp
+; CHECK: .cfi_adjust_cfa_offset 8
+; CHECK: pushl $2
+; CHECK: .cfi_adjust_cfa_offset 4
+; CHECK: pushl $1
+; CHECK: .cfi_adjust_cfa_offset 4
+; CHECK: calll foo
+; CHECK: addl $16, %esp
+; CHECK: .cfi_adjust_cfa_offset -16
+; CHECK: subl $8, %esp
+; CHECK: .cfi_adjust_cfa_offset 8
+; CHECK: pushl $4
+; CHECK: .cfi_adjust_cfa_offset 4
+; CHECK: pushl $3
+; CHECK: .cfi_adjust_cfa_offset 4
+; CHECK: calll stdfoo
+; CHECK: .cfi_adjust_cfa_offset -8
+; CHECK: addl $8, %esp
+; CHECK: .cfi_adjust_cfa_offset -8
+define void @test1() #0 {
+entry:
+  tail call void @foo(i32 1, i32 2) #1, !dbg !10
+  tail call x86_stdcallcc void @stdfoo(i32 3, i32 4) #1, !dbg !11
+  ret void, !dbg !12
+}
+
+attributes #0 = { nounwind optsize }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.8.0 (trunk 250289)", isOptimized: true, runtimeVersion: 0, emissionKind: 1, enums: !2, subprograms: !3)
+!1 = !DIFile(filename: "foo.c", directory: "foo")
+!2 = !{}
+!3 = !{!4}
+!4 = distinct !DISubprogram(name: "test1", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, function: void ()* @test1, variables: !2)
+!5 = !DISubroutineType(types: !6)
+!6 = !{null}
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{!"clang version 3.8.0 (trunk 250289)"}
+!10 = !DILocation(line: 4, column: 3, scope: !4)
+!11 = !DILocation(line: 5, column: 3, scope: !4)
+!12 = !DILocation(line: 6, column: 1, scope: !4)
Index: llvm/trunk/test/CodeGen/X86/push-cfi-obj.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/push-cfi-obj.ll
+++ llvm/trunk/test/CodeGen/X86/push-cfi-obj.ll
@@ -1,36 +1,36 @@
-; RUN: llc < %s -mtriple=i686-pc-linux -filetype=obj | llvm-readobj -s -sr -sd | FileCheck %s
+; RUN: llc < %s -mtriple=i686-pc-linux -filetype=obj | llvm-readobj -s -sr -sd | FileCheck %s -check-prefix=LINUX
 ; RUN: llc < %s -mtriple=i686-darwin-macosx10.7 -filetype=obj | llvm-readobj -sections | FileCheck -check-prefix=DARWIN %s
 
 ; On darwin, check that we manage to generate the compact unwind section
 ; DARWIN: Name: __compact_unwind
 ; DARWIN: Segment: __LD
 
-; CHECK:         Index: 8
-; CHECK-NEXT:    Name: .eh_frame (41)
-; CHECK-NEXT:    Type: SHT_PROGBITS (0x1)
-; CHECK-NEXT:    Flags [ (0x2)
-; CHECK-NEXT:      SHF_ALLOC (0x2)
-; CHECK-NEXT:    ]
-; CHECK-NEXT:    Address: 0x0
-; CHECK-NEXT:    Offset: 0x64
-; CHECK-NEXT:    Size: 60
-; CHECK-NEXT:    Link: 0
-; CHECK-NEXT:    Info: 0
-; CHECK-NEXT:    AddressAlignment: 4
-; CHECK-NEXT:    EntrySize: 0
-; CHECK-NEXT:    Relocations [
-; CHECK-NEXT:    ]
-; CHECK-NEXT:    SectionData (
-; CHECK-NEXT:      0000: 1C000000 00000000 017A504C 5200017C  |.........zPLR..||
-; CHECK-NEXT:      0010: 08070000 00000000 1B0C0404 88010000  |................|
-; CHECK-NEXT:      0020: 18000000 24000000 00000000 19000000  |....$...........|
-; CHECK-NEXT:      0030: 04000000 00430E10 2E100000           |.....C......|
-; CHECK-NEXT:    )
+; LINUX:         Index: 8
+; LINUX-NEXT:    Name: .eh_frame (41)
+; LINUX-NEXT:    Type: SHT_PROGBITS (0x1)
+; LINUX-NEXT:    Flags [ (0x2)
+; LINUX-NEXT:      SHF_ALLOC (0x2)
+; LINUX-NEXT:    ]
+; LINUX-NEXT:    Address: 0x0
+; LINUX-NEXT:    Offset: 0x68
+; LINUX-NEXT:    Size: 64
+; LINUX-NEXT:    Link: 0
+; LINUX-NEXT:    Info: 0
+; LINUX-NEXT:    AddressAlignment: 4
+; LINUX-NEXT:    EntrySize: 0
+; LINUX-NEXT:    Relocations [
+; LINUX-NEXT:    ]
+; LINUX-NEXT:    SectionData (
+; LINUX-NEXT:      0000: 1C000000 00000000 017A504C 5200017C  |.........zPLR..||
+; LINUX-NEXT:      0010: 08070000 00000000 1B0C0404 88010000  |................|
+; LINUX-NEXT:      0020: 1C000000 24000000 00000000 1D000000  |....$...........|
+; LINUX-NEXT:      0030: 04000000 00410E08 8502420D 05432E10  |.....A....B..C..|
+; LINUX-NEXT:    )
 
 declare i32 @__gxx_personality_v0(...)
 declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
 
-define void @test() optsize personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+define void @test() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
 entry:
   invoke void @good(i32 1, i32 2, i32 3, i32 4)
           to label %continue unwind label %cleanup
@@ -41,3 +41,5 @@
      cleanup
   ret void
 }
+
+attributes #0 = { optsize "no-frame-pointer-elim"="true" }
Index: llvm/trunk/test/CodeGen/X86/push-cfi.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/push-cfi.ll
+++ llvm/trunk/test/CodeGen/X86/push-cfi.ll
@@ -1,21 +1,51 @@
-; RUN: llc < %s -mtriple=i686-pc-linux | FileCheck %s
+; RUN: llc < %s -mtriple=i686-pc-linux | FileCheck %s -check-prefix=LINUX -check-prefix=CHECK
+; RUN: llc < %s -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=DARWIN -check-prefix=CHECK
 
 declare i32 @__gxx_personality_v0(...)
 declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
 declare void @large(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e, i32 %f)
 declare void @empty()
 
-; We use an invoke, and expect a .cfi_escape GNU_ARGS_SIZE with size 16
-; before the invocation
-; CHECK-LABEL: test1:
-; CHECK: .cfi_escape 0x2e, 0x10
-; CHECK-NEXT: pushl   $4
-; CHECK-NEXT: pushl   $3
-; CHECK-NEXT: pushl   $2
-; CHECK-NEXT: pushl   $1
-; CHECK-NEXT: call
-; CHECK-NEXT: addl $16, %esp
-define void @test1() optsize personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; When we use an invoke, and have FP, we expect a .cfi_escape GNU_ARGS_SIZE
+; with size 16 before the invocation. Without FP, we expect .cfi_adjust_cfa_offset
+; before and after.
+; Darwin should not generate pushes in either circumstance.
+; CHECK-LABEL: test1_nofp:
+; LINUX: .cfi_escape 0x2e, 0x10
+; LINUX: .cfi_adjust_cfa_offset 16
+; LINUX-NEXT: pushl   $4
+; LINUX-NEXT: pushl   $3
+; LINUX-NEXT: pushl   $2
+; LINUX-NEXT: pushl   $1
+; LINUX-NEXT: call
+; LINUX-NEXT: addl $16, %esp
+; LINUX: .cfi_adjust_cfa_offset -16
+; DARWIN-NOT: .cfi_escape
+; DARWIN-NOT: pushl
+define void @test1_nofp() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  invoke void @good(i32 1, i32 2, i32 3, i32 4)
+          to label %continue unwind label %cleanup
+continue:
+  ret void
+cleanup:  
+  landingpad { i8*, i32 }
+     cleanup
+  ret void
+}
+
+; CHECK-LABEL: test1_fp:
+; LINUX: .cfi_escape 0x2e, 0x10
+; LINUX-NEXT: pushl   $4
+; LINUX-NEXT: pushl   $3
+; LINUX-NEXT: pushl   $2
+; LINUX-NEXT: pushl   $1
+; LINUX-NEXT: call
+; LINUX-NEXT: addl $16, %esp
+; DARWIN: pushl %ebp
+; DARWIN-NOT: .cfi_escape
+; DARWIN-NOT: pushl
+define void @test1_fp() #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
 entry:
   invoke void @good(i32 1, i32 2, i32 3, i32 4)
           to label %continue unwind label %cleanup
@@ -28,27 +58,69 @@
 }
 
 ; If the function has no handlers, we don't need to generate GNU_ARGS_SIZE,
-; even if it has an unwind table.
-; CHECK-LABEL: test2:
+; even if it has an unwind table. Without FP, we still need cfi_adjust_cfa_offset,
+; so darwin should not generate pushes.
+; CHECK-LABEL: test2_nofp:
+; LINUX-NOT: .cfi_escape
+; LINUX: .cfi_adjust_cfa_offset 16
+; LINUX-NEXT: pushl   $4
+; LINUX-NEXT: pushl   $3
+; LINUX-NEXT: pushl   $2
+; LINUX-NEXT: pushl   $1
+; LINUX-NEXT: call
+; LINUX-NEXT: addl $16, %esp
+; LINUX: .cfi_adjust_cfa_offset -16
+; DARWIN-NOT: .cfi_escape
+; DARWIN-NOT: pushl
+define void @test2_nofp() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; CHECK-LABEL: test2_fp:
 ; CHECK-NOT: .cfi_escape
+; CHECK-NOT: .cfi_adjust_cfa_offset
 ; CHECK: pushl   $4
 ; CHECK-NEXT: pushl   $3
 ; CHECK-NEXT: pushl   $2
 ; CHECK-NEXT: pushl   $1
 ; CHECK-NEXT: call
-; CHECK-NEXT: addl $16, %esp
-define void @test2() optsize personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; CHECK-NEXT: addl $24, %esp
+define void @test2_fp() #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
 entry:
   call void @good(i32 1, i32 2, i32 3, i32 4)
   ret void
 }
 
-; If we did not end up using any pushes, no need for GNU_ARGS_SIZE anywhere
-; CHECK-LABEL: test3:
-; CHECK-NOT: .cfi_escape
-; CHECK-NOT: pushl
-; CHECK: retl
-define void @test3() optsize personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; If we did not end up using any pushes, no need for GNU_ARGS_SIZE or
+; cfi_adjust_cfa_offset.
+; CHECK-LABEL: test3_nofp:
+; LINUX-NOT: .cfi_escape
+; LINUX-NOT: .cfi_adjust_cfa_offset
+; LINUX-NOT: pushl
+; LINUX: retl
+define void @test3_nofp() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  invoke void @empty()
+          to label %continue unwind label %cleanup
+continue:
+  ret void
+cleanup:  
+  landingpad { i8*, i32 }
+     cleanup
+  ret void
+}
+
+; If we did not end up using any pushes, no need for GNU_ARGS_SIZE or
+; cfi_adjust_cfa_offset.
+; CHECK-LABEL: test3_fp:
+; LINUX: pushl %ebp
+; LINUX-NOT: .cfi_escape
+; LINUX-NOT: .cfi_adjust_cfa_offset
+; LINUX-NOT: pushl
+; LINUX: retl
+define void @test3_fp() #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
 entry:
   invoke void @empty()
           to label %continue unwind label %cleanup
@@ -62,24 +134,24 @@
 
 ; Different sized stacks need different GNU_ARGS_SIZEs
 ; CHECK-LABEL: test4:
-; CHECK: .cfi_escape 0x2e, 0x10
-; CHECK-NEXT: pushl   $4
-; CHECK-NEXT: pushl   $3
-; CHECK-NEXT: pushl   $2
-; CHECK-NEXT: pushl   $1
-; CHECK-NEXT: call
-; CHECK-NEXT: addl $16, %esp
-; CHECK: .cfi_escape 0x2e, 0x20
-; CHECK-NEXT: subl    $8, %esp
-; CHECK-NEXT: pushl   $11
-; CHECK-NEXT: pushl   $10
-; CHECK-NEXT: pushl   $9
-; CHECK-NEXT: pushl   $8
-; CHECK-NEXT: pushl   $7
-; CHECK-NEXT: pushl   $6
-; CHECK-NEXT: calll   large
-; CHECK-NEXT: addl $32, %esp
-define void @test4() optsize personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; LINUX: .cfi_escape 0x2e, 0x10
+; LINUX-NEXT: pushl   $4
+; LINUX-NEXT: pushl   $3
+; LINUX-NEXT: pushl   $2
+; LINUX-NEXT: pushl   $1
+; LINUX-NEXT: call
+; LINUX-NEXT: addl $16, %esp
+; LINUX: .cfi_escape 0x2e, 0x20
+; LINUX: subl    $8, %esp
+; LINUX-NEXT: pushl   $11
+; LINUX-NEXT: pushl   $10
+; LINUX-NEXT: pushl   $9
+; LINUX-NEXT: pushl   $8
+; LINUX-NEXT: pushl   $7
+; LINUX-NEXT: pushl   $6
+; LINUX-NEXT: calll   large
+; LINUX-NEXT: addl $32, %esp
+define void @test4() #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
 entry:
   invoke void @good(i32 1, i32 2, i32 3, i32 4)
           to label %continue1 unwind label %cleanup
@@ -95,18 +167,48 @@
 }
 
 ; If we did use pushes, we need to reset GNU_ARGS_SIZE before a call
-; without parameters
-; CHECK-LABEL: test5:
-; CHECK: .cfi_escape 0x2e, 0x10
-; CHECK-NEXT: pushl   $4
-; CHECK-NEXT: pushl   $3
-; CHECK-NEXT: pushl   $2
-; CHECK-NEXT: pushl   $1
-; CHECK-NEXT: call
-; CHECK-NEXT: addl $16, %esp
-; CHECK: .cfi_escape 0x2e, 0x00
-; CHECK-NEXT: call
-define void @test5() optsize personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; without parameters, but don't need to adjust the cfa offset
+; CHECK-LABEL: test5_nofp:
+; LINUX: .cfi_escape 0x2e, 0x10
+; LINUX: .cfi_adjust_cfa_offset 16
+; LINUX-NEXT: pushl   $4
+; LINUX-NEXT: pushl   $3
+; LINUX-NEXT: pushl   $2
+; LINUX-NEXT: pushl   $1
+; LINUX-NEXT: call
+; LINUX-NEXT: addl $16, %esp
+; LINUX: .cfi_adjust_cfa_offset -16
+; LINUX-NOT: .cfi_adjust_cfa_offset
+; LINUX: .cfi_escape 0x2e, 0x00
+; LINUX-NOT: .cfi_adjust_cfa_offset
+; LINUX: call
+define void @test5_nofp() #0 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  invoke void @good(i32 1, i32 2, i32 3, i32 4)
+          to label %continue1 unwind label %cleanup
+continue1:
+  invoke void @empty()
+          to label %continue2 unwind label %cleanup
+continue2:
+  ret void          
+cleanup:  
+  landingpad { i8*, i32 }
+     cleanup
+  ret void
+}
+
+; CHECK-LABEL: test5_fp:
+; LINUX: .cfi_escape 0x2e, 0x10
+; LINUX-NEXT: pushl   $4
+; LINUX-NEXT: pushl   $3
+; LINUX-NEXT: pushl   $2
+; LINUX-NEXT: pushl   $1
+; LINUX-NEXT: call
+; LINUX-NEXT: addl $16, %esp
+; LINUX: .cfi_escape 0x2e, 0x00
+; LINUX-NOT: .cfi_adjust_cfa_offset
+; LINUX: call
+define void @test5_fp() #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
 entry:
   invoke void @good(i32 1, i32 2, i32 3, i32 4)
           to label %continue1 unwind label %cleanup
@@ -121,13 +223,13 @@
   ret void
 }
 
-; This is actually inefficient - we don't need to repeat the .cfi_escape twice.
+; FIXME: This is actually inefficient - we don't need to repeat the .cfi_escape twice.
 ; CHECK-LABEL: test6:
-; CHECK: .cfi_escape 0x2e, 0x10
-; CHECK: call
-; CHECK: .cfi_escape 0x2e, 0x10
-; CHECK: call
-define void @test6() optsize personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+; LINUX: .cfi_escape 0x2e, 0x10
+; LINUX: call
+; LINUX: .cfi_escape 0x2e, 0x10
+; LINUX: call
+define void @test6() #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
 entry:
   invoke void @good(i32 1, i32 2, i32 3, i32 4)
           to label %continue1 unwind label %cleanup
@@ -141,3 +243,41 @@
      cleanup
   ret void
 }
+
+; Darwin should generate pushes in the presence of FP and an unwind table,
+; but not FP and invoke.
+; CHECK-LABEL: test7:
+; DARWIN: pushl %ebp
+; DARWIN: movl %esp, %ebp
+; DARWIN: .cfi_def_cfa_register %ebp
+; DARWIN-NOT: .cfi_adjust_cfa_offset
+; DARWIN: pushl   $4
+; DARWIN-NEXT: pushl   $3
+; DARWIN-NEXT: pushl   $2
+; DARWIN-NEXT: pushl   $1
+; DARWIN-NEXT: call
+define void @test7() #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; CHECK-LABEL: test8:
+; DARWIN: pushl %ebp
+; DARWIN: movl %esp, %ebp
+; DARWIN-NOT: .cfi_adjust_cfa_offset
+; DARWIN-NOT: pushl
+define void @test8() #1 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+entry:
+  invoke void @good(i32 1, i32 2, i32 3, i32 4)
+          to label %continue unwind label %cleanup
+continue:
+  ret void
+cleanup:  
+  landingpad { i8*, i32 }
+     cleanup
+  ret void
+}
+
+attributes #0 = { optsize }
+attributes #1 = { optsize "no-frame-pointer-elim"="true" }