Index: lib/Target/X86/X86CallFrameOptimization.cpp
===================================================================
--- lib/Target/X86/X86CallFrameOptimization.cpp
+++ lib/Target/X86/X86CallFrameOptimization.cpp
@@ -20,6 +20,7 @@
 #include <algorithm>
 
 #include "X86.h"
+#include "X86InstrBuilder.h"
 #include "X86InstrInfo.h"
 #include "X86MachineFunctionInfo.h"
 #include "X86Subtarget.h"
@@ -51,31 +52,59 @@
   bool runOnMachineFunction(MachineFunction &MF) override;
 
 private:
-  // Information we know about a particular call site
+  // The _chkstk call sequence matched in front of an inalloca call.
+  struct InAllocaInfo {
+    // Call frame setup (both immediates zero) that opens the _chkstk sequence.
+    MachineBasicBlock::iterator FrameSetup;
+
+    // MOV32ri of an immediate that defines the register holding the amount.
+    MachineBasicBlock::iterator AmountInstr;
+
+    // Copy of the register defined by AmountInstr into %eax.
+    MachineBasicBlock::iterator AmountMov;
+
+    // Call to the external symbol _chkstk.
+    MachineBasicBlock::iterator ChkstkCall;
+
+    // Copy of %esp (the result of _chkstk) into a register.
+    MachineBasicBlock::iterator ChkstkResultCopy;
+
+    // Call frame destroy (both immediates zero) that closes the sequence.
+    MachineBasicBlock::iterator FrameDestroy;
+  };
+
+  // Information we know about a particular call site.
   struct CallContext {
     CallContext()
-        : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0),
-          MovVector(4, nullptr), NoStackParams(false), UsePush(false) {}
+        : FrameSetup(nullptr), IsInAlloca(false), Call(nullptr), SPCopy(nullptr),
+          ExpectedDist(0), MovVector(4, nullptr), NoStackParams(false),
+          UsePush(false) {}
 
-    // Iterator referring to the frame setup instruction
+    // Iterator referring to the frame setup instruction of the call itself.
     MachineBasicBlock::iterator FrameSetup;
 
-    // Actual call instruction
+    // True if a _chkstk sequence was matched, i.e. this is an inalloca call.
+    bool IsInAlloca;
+
+    // The matched _chkstk sequence; only meaningful when IsInAlloca is true.
+    InAllocaInfo InAllocaSetup;
+
+    // Actual call instruction.
     MachineInstr *Call;
 
-    // A copy of the stack pointer
+    // A copy of the stack pointer; left null for inalloca calls.
     MachineInstr *SPCopy;
 
-    // The total displacement of all passed parameters
+    // The total displacement of all passed parameters.
     int64_t ExpectedDist;
 
-    // The sequence of movs used to pass the parameters
+    // The sequence of movs used to pass the parameters.
     SmallVector<MachineInstr *, 4> MovVector;
 
-    // True if this call site has no stack parameters
+    // True if this call site has no stack parameters.
     bool NoStackParams;
 
-    // True if this call site can use push instructions
+    // True if this call site can use push instructions.
     bool UsePush;
   };
 
@@ -85,6 +114,9 @@
 
   bool isProfitable(MachineFunction &MF, ContextVector &CallSeqMap);
 
+  bool matchInAlloca(MachineBasicBlock::iterator &I, InAllocaInfo &Info,
+                     unsigned int &MaxAdjust);
+
   void collectCallInfo(MachineFunction &MF, MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator I, CallContext &Context);
 
@@ -314,6 +346,60 @@
   return Skip;
 }
 
+bool X86CallFrameOptimization::matchInAlloca(MachineBasicBlock::iterator &I,
+                                             InAllocaInfo &Info,
+                                             unsigned int &MaxAdjust) {
+  // Matches the _chkstk setup of an inalloca call; inalloca is 32-bit only.
+  if (STI->is64Bit())
+    return false;
+
+  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
+
+  // FrameSetup for the _chkstk call; both of its immediates must be zero.
+  if (I->getOperand(0).getImm() != 0 || I->getOperand(1).getImm() != 0)
+    return false;
+  Info.FrameSetup = I++;
+
+  // Often, there's an instruction here that moves the _chkstk amount into a
+  // virtual register. Step over it; the actual def is found via MRI below.
+  if (I->getOpcode() == X86::MOV32ri)
+    ++I;
+
+  // Match move of virtual register to %eax, the _chkstk argument.
+  if (!I->isCopy() || !I->getOperand(0).isReg() || !I->getOperand(1).isReg() ||
+      I->getOperand(0).getReg() != X86::EAX)
+    return false;
+  Info.AmountMov = I++;
+
+  // Get the definition of that virtual register; it must be a MOV32ri imm.
+  unsigned ChkstkAmountVreg = Info.AmountMov->getOperand(1).getReg();
+  MachineInstr *Def = MRI->getUniqueVRegDef(ChkstkAmountVreg);
+  if (!Def || Def->getOpcode() != X86::MOV32ri || !Def->getOperand(1).isImm())
+    return false;
+  Info.AmountInstr = Def;
+  MaxAdjust = Def->getOperand(1).getImm() >> Log2SlotSize;
+
+  // Match call to the external symbol _chkstk.
+  if (!I->isCall() || !I->getOperand(0).isSymbol() ||
+      StringRef(I->getOperand(0).getSymbolName()) != "_chkstk")
+    return false;
+  Info.ChkstkCall = I++;
+
+  // Match copy of %esp (the result of _chkstk) to a register.
+  if (!I->isCopy() || !I->getOperand(0).isReg() || !I->getOperand(1).isReg() ||
+      I->getOperand(1).getReg() != X86::ESP)
+    return false;
+  Info.ChkstkResultCopy = I++;
+
+  // Match FrameDestroy for _chkstk call.
+  if (I->getOpcode() != TII->getCallFrameDestroyOpcode() ||
+      I->getOperand(0).getImm() != 0 || I->getOperand(1).getImm() != 0)
+    return false;
+  Info.FrameDestroy = I++;
+
+  return true;
+}
+
 void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
                                                MachineBasicBlock &MBB,
                                                MachineBasicBlock::iterator I,
@@ -322,10 +408,11 @@
   // transformation.
   const X86RegisterInfo &RegInfo =
       *static_cast<const X86RegisterInfo *>(STI->getRegisterInfo());
+  unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
   unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
 
   // We expect to enter this at the beginning of a call sequence
-  assert(I->getOpcode() == TII->getCallFrameSetupOpcode());
+  assert(I->getOpcode() == FrameSetupOpcode);
   MachineBasicBlock::iterator FrameSetup = I++;
   Context.FrameSetup = FrameSetup;
 
@@ -334,10 +421,16 @@
   unsigned int MaxAdjust =
       FrameSetup->getOperand(0).getImm() >> Log2SlotSize;
 
-  // A zero adjustment means no stack parameters
   if (!MaxAdjust) {
-    Context.NoStackParams = true;
-    return;
+    // It might be an inalloca call. Back up and check for that.
+    --I;
+    if (matchInAlloca(I, Context.InAllocaSetup, MaxAdjust)) {
+      Context.IsInAlloca = true;
+    } else {
+      // Otherwise, a zero adjustment means no stack parameters.
+      Context.NoStackParams = true;
+      return;
+    }
   }
 
   // For globals in PIC mode, we can have some LEAs here.
@@ -346,15 +439,20 @@
   while (I->getOpcode() == X86::LEA32r)
     ++I;
 
-  // We expect a copy instruction here.
-  // TODO: The copy instruction is a lowering artifact.
-  //       We should also support a copy-less version, where the stack
-  //       pointer is used directly.
-  if (!I->isCopy() || !I->getOperand(0).isReg())
-    return;
-  Context.SPCopy = I++;
+  unsigned StackPtr;
 
-  unsigned StackPtr = Context.SPCopy->getOperand(0).getReg();
+  if (Context.IsInAlloca) {
+    StackPtr = Context.InAllocaSetup.ChkstkResultCopy->getOperand(0).getReg();
+  } else {
+    // We expect a copy instruction here.
+    // TODO: The copy instruction is a lowering artifact.
+    //       We should also support a copy-less version, where the stack
+    //       pointer is used directly.
+    if (!I->isCopy() || !I->getOperand(0).isReg())
+      return;
+    Context.SPCopy = I++;
+    StackPtr = Context.SPCopy->getOperand(0).getReg();
+  }
 
   // Scan the call setup sequence for the pattern we're looking for.
   // We only handle a simple case - a sequence of store instructions that
@@ -419,6 +517,13 @@
     ++I;
   }
 
+  // For an inalloca call, the FrameSetup instruction for the call is here.
+  if (Context.IsInAlloca) {
+    if (I == MBB.end() || I->getOpcode() != FrameSetupOpcode)
+      return;
+    Context.FrameSetup = I++;
+  }
+
   // We now expect the end of the sequence. If we stopped early,
   // or reached the end of the block without finding a call, bail.
   if (I == MBB.end() || !I->isCall())
@@ -450,6 +555,13 @@
 
 void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
                                                   const CallContext &Context) {
+  if (Context.IsInAlloca) {
+    // Move the FrameSetup instruction for the call to before the moves.
+    assert(Context.MovVector.size() > 0 && "No moves?");
+    auto *MBB = Context.MovVector[0]->getParent();
+    MBB->insert(Context.MovVector[0], Context.FrameSetup->removeFromParent());
+  }
+
   // Ok, we can in fact do the transformation for this call.
   // Do not remove the FrameSetup instruction, but adjust the parameters.
   // PEI will end up finalizing the handling of this.
@@ -537,10 +649,35 @@
     MBB.erase(MOV);
   }
 
-  // The stack-pointer copy is no longer used in the call sequences.
-  // There should not be any other users, but we can't commit to that, so:
-  if (MRI->use_empty(Context.SPCopy->getOperand(0).getReg()))
-    Context.SPCopy->eraseFromParent();
+  if (!Context.IsInAlloca) {
+    // The stack-pointer copy is no longer used in the call sequences.
+    // There should not be any other users, but we can't commit to that, so:
+    if (MRI->use_empty(Context.SPCopy->getOperand(0).getReg()))
+      Context.SPCopy->eraseFromParent();
+  } else {
+    // Remove the inalloca call setup (in reverse to delete uses before defs).
+    Context.InAllocaSetup.FrameDestroy->eraseFromParent();
+
+    unsigned ChkstkRes =
+        Context.InAllocaSetup.ChkstkResultCopy->getOperand(0).getReg();
+    if (!MRI->use_empty(ChkstkRes)) {
+      // Something is using the result of _chkstk. Provide a replacement.
+      unsigned NewReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+      addRegOffset(BuildMI(MBB, Context.InAllocaSetup.ChkstkResultCopy, DL,
+                           TII->get(X86::LEA32r), NewReg),
+                   X86::ESP, false, -Context.ExpectedDist);
+      MRI->replaceRegWith(ChkstkRes, NewReg);
+    }
+    Context.InAllocaSetup.ChkstkResultCopy->eraseFromParent();
+
+    Context.InAllocaSetup.ChkstkCall->eraseFromParent();
+    Context.InAllocaSetup.AmountMov->eraseFromParent();
+    if (MRI->use_empty(
+            Context.InAllocaSetup.AmountInstr->getOperand(0).getReg())) {
+      Context.InAllocaSetup.AmountInstr->eraseFromParent();
+    }
+    Context.InAllocaSetup.FrameSetup->eraseFromParent();
+  }
 
   // Once we've done this, we need to make sure PEI doesn't assume a reserved
   // frame.
Index: test/CodeGen/X86/inalloca-callframeopt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/inalloca-callframeopt.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+%struct.S = type { i32 }
+declare void @f(<{ %struct.S }>* inalloca)
+
+define void @basic() {
+entry:
+  %argmem = alloca inalloca <{ %struct.S }>, align 4
+  %x = getelementptr inbounds <{ %struct.S }>, <{ %struct.S }>* %argmem, i32 0, i32 0, i32 0
+  store i32 42, i32* %x, align 4
+  call void @f(<{ %struct.S }>* inalloca %argmem)
+  ret void
+
+; CHECK-LABEL: basic:
+; TODO: We've removed the dynamic alloca; make frame pointer omission possible.
+; CHECK:      pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NOT: calll __chkstk
+; CHECK-NOT: movl $42
+; CHECK-NEXT: pushl $42
+; CHECK-NEXT: calll _f
+; CHECK-NEXT: movl %ebp, %esp
+; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: retl
+}
+
+
+%struct.T = type { i32*, [1 x i32] }
+declare void @g(<{ %struct.T }>* inalloca)
+
+define void @stack_reference_arg() {
+entry:
+  %argmem = alloca inalloca <{ %struct.T }>, align 4
+  %arrayinit.begin.i = getelementptr inbounds <{ %struct.T }>, <{ %struct.T }>* %argmem, i32 0, i32 0, i32 1, i32 0
+  store i32 1, i32* %arrayinit.begin.i, align 4
+  %p.i = getelementptr inbounds <{ %struct.T }>, <{ %struct.T }>* %argmem, i32 0, i32 0, i32 0
+  store i32* %arrayinit.begin.i, i32** %p.i, align 4
+  call void @g(<{ %struct.T }>* inalloca %argmem)
+  ret void
+
+; One of the arguments is a pointer into the call frame.
+; FIXME: It would be cool if we could fold away the add instruction.
+; CHECK-LABEL: stack_reference_arg:
+; CHECK:      pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: leal -8(%esp), %eax
+; CHECK-NEXT: addl $4, %eax
+; CHECK-NEXT: pushl $1
+; CHECK-NEXT: pushl %eax
+; CHECK-NEXT: calll _g
+; CHECK-NEXT: movl %ebp, %esp
+; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: retl
+}
+
+
+define void @two_calls() {
+entry:
+  %inalloca.save = tail call i8* @llvm.stacksave()
+  %argmem = alloca inalloca <{ %struct.S }>, align 4
+  %x = getelementptr inbounds <{ %struct.S }>, <{ %struct.S }>* %argmem, i32 0, i32 0, i32 0
+  store i32 42, i32* %x, align 4
+  call void @f(<{ %struct.S }>* inalloca nonnull %argmem)
+  call void @llvm.stackrestore(i8* %inalloca.save)
+  %argmem3 = alloca inalloca <{ %struct.S }>, align 4
+  %x2 = getelementptr inbounds <{ %struct.S }>, <{ %struct.S }>* %argmem3, i32 0, i32 0, i32 0
+  store i32 42, i32* %x2, align 4
+  call void @f(<{ %struct.S }>* inalloca nonnull %argmem3)
+  ret void
+
+; Two inalloca calls after each other. The vreg used for the _chkstk argument
+; is shared between them.
+; FIXME: Clang puts stacksave/restore around the first call, which are
+; redundant since the stack is adjusted back after the call anyway. If
+; we wanted to be aggressive, we could even skip adjusting the stack
+; back between the calls.
+; CHECK-LABEL: two_calls
+; CHECK:      pushl %ebp
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: movl %esp, %esi
+; CHECK-NEXT: pushl $42
+; CHECK-NEXT: calll _f
+; CHECK-NEXT: addl $4, %esp
+; CHECK-NEXT: movl %esi, %esp
+; CHECK-NEXT: pushl $42
+; CHECK-NEXT: calll _f
+; CHECK-NEXT: leal -4(%ebp), %esp
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: retl
+}
+
+
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8*)
Index: test/CodeGen/X86/inalloca-stdcall.ll
===================================================================
--- test/CodeGen/X86/inalloca-stdcall.ll
+++ test/CodeGen/X86/inalloca-stdcall.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-pc-win32 -no-x86-call-frame-opt | FileCheck %s
 
 %Foo = type { i32, i32 }
 
@@ -19,8 +19,8 @@
 ; CHECK: movl    $42, 4(%eax)
   call x86_stdcallcc void @f(%Foo* inalloca %b)
 ; CHECK: calll   _f@8
-; CHECK-NOT: %esp
-; CHECK: pushl
+; CHECK: subl    $4, %esp
+; CHECK: movl    $0, (%esp)
 ; CHECK: calll   _i@4
   call x86_stdcallcc void @i(i32 0)
   ret void
Index: test/CodeGen/X86/inalloca.ll
===================================================================
--- test/CodeGen/X86/inalloca.ll
+++ test/CodeGen/X86/inalloca.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-pc-win32 -no-x86-call-frame-opt | FileCheck %s
 
 %Foo = type { i32, i32 }