diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -797,6 +797,13 @@
     /// pointer, and a SRCVALUE.
    VAEND, VASTART,
 
+    /// PREALLOCATED_SETUP - This has 2 operands: an input chain and a
+    /// SRCVALUE with the preallocated call.
+    PREALLOCATED_SETUP,
+    /// PREALLOCATED_ARG - This has 3 operands: an input chain, a SRCVALUE
+    /// with the preallocated call, and a constant int (the argument index).
+    PREALLOCATED_ARG,
+
     /// SRCVALUE - This is a node type that holds a Value* that is used to
     /// make reference to a value in the LLVM IR.
     SRCVALUE,
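
For reference, a minimal IR sketch of the construct these two nodes lower
(the callee @f and %Foo are illustrative; the tests added at the end of this
patch contain complete, runnable examples):

  %Foo = type { i32, i32 }
  declare token @llvm.call.preallocated.setup(i32)
  declare i8* @llvm.call.preallocated.arg(token, i32)
  declare void @f(%Foo* preallocated(%Foo))

  %t = call token @llvm.call.preallocated.setup(i32 1)
  %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo)
  %b = bitcast i8* %a to %Foo*
  call void @f(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)]

The setup call becomes a PREALLOCATED_SETUP node (chain + SRCVALUE
identifying the call site) and each arg call becomes a PREALLOCATED_ARG node
(chain + SRCVALUE + argument index) whose result is the pointer into the
preallocated stack area.
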
diff --git a/llvm/include/llvm/CodeGen/TargetCallingConv.h b/llvm/include/llvm/CodeGen/TargetCallingConv.h
--- a/llvm/include/llvm/CodeGen/TargetCallingConv.h
+++ b/llvm/include/llvm/CodeGen/TargetCallingConv.h
@@ -35,6 +35,7 @@
     unsigned IsReturned : 1; ///< Always returned
     unsigned IsSplit : 1;
     unsigned IsInAlloca : 1;   ///< Passed with inalloca
+    unsigned IsPreallocated : 1; ///< ByVal without the copy
     unsigned IsSplitEnd : 1;   ///< Last part of a split
     unsigned IsSwiftSelf : 1;  ///< Swift self parameter
     unsigned IsSwiftError : 1; ///< Swift error parameter
@@ -56,9 +57,9 @@
   public:
     ArgFlagsTy()
         : IsZExt(0), IsSExt(0), IsInReg(0), IsSRet(0), IsByVal(0), IsNest(0),
-          IsReturned(0), IsSplit(0), IsInAlloca(0), IsSplitEnd(0),
-          IsSwiftSelf(0), IsSwiftError(0), IsCFGuardTarget(0), IsHva(0),
-          IsHvaStart(0), IsSecArgPass(0), ByValAlign(0), OrigAlign(0),
+          IsReturned(0), IsSplit(0), IsInAlloca(0), IsPreallocated(0),
+          IsSplitEnd(0), IsSwiftSelf(0), IsSwiftError(0), IsCFGuardTarget(0),
+          IsHva(0), IsHvaStart(0), IsSecArgPass(0), ByValAlign(0), OrigAlign(0),
           IsInConsecutiveRegsLast(0), IsInConsecutiveRegs(0),
           IsCopyElisionCandidate(0), IsPointer(0), ByValSize(0),
           PointerAddrSpace(0) {
@@ -83,6 +84,9 @@
     bool isInAlloca() const { return IsInAlloca; }
     void setInAlloca() { IsInAlloca = 1; }
 
+    bool isPreallocated() const { return IsPreallocated; }
+    void setPreallocated() { IsPreallocated = 1; }
+
     bool isSwiftSelf() const { return IsSwiftSelf; }
     void setSwiftSelf() { IsSwiftSelf = 1; }
 
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -273,17 +273,20 @@
     bool IsNest : 1;
     bool IsByVal : 1;
     bool IsInAlloca : 1;
+    bool IsPreallocated : 1;
     bool IsReturned : 1;
     bool IsSwiftSelf : 1;
     bool IsSwiftError : 1;
     bool IsCFGuardTarget : 1;
     MaybeAlign Alignment = None;
     Type *ByValType = nullptr;
+    Type *PreallocatedType = nullptr;
 
     ArgListEntry()
         : IsSExt(false), IsZExt(false), IsInReg(false), IsSRet(false),
-          IsNest(false), IsByVal(false), IsInAlloca(false), IsReturned(false),
-          IsSwiftSelf(false), IsSwiftError(false), IsCFGuardTarget(false) {}
+          IsNest(false), IsByVal(false), IsInAlloca(false),
+          IsPreallocated(false), IsReturned(false), IsSwiftSelf(false),
+          IsSwiftError(false), IsCFGuardTarget(false) {}
 
     void setAttributes(const CallBase *Call, unsigned ArgIdx);
   };
@@ -3608,6 +3611,7 @@
     bool IsReturnValueUsed : 1;
     bool IsConvergent      : 1;
     bool IsPatchPoint      : 1;
+    bool IsPreallocated : 1;
 
     // IsTailCall should be modified by implementations of
     // TargetLowering::LowerCall that perform tail call conversions.
@@ -3631,7 +3635,7 @@
     CallLoweringInfo(SelectionDAG &DAG)
         : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false),
           DoesNotReturn(false), IsReturnValueUsed(true), IsConvergent(false),
-          IsPatchPoint(false), DAG(DAG) {}
+          IsPatchPoint(false), IsPreallocated(false), DAG(DAG) {}
 
     CallLoweringInfo &setDebugLoc(const SDLoc &dl) {
       DL = dl;
@@ -3737,6 +3741,11 @@
       return *this;
     }
 
+    CallLoweringInfo &setIsPreallocated(bool Value = true) {
+      IsPreallocated = Value;
+      return *this;
+    }
+
     CallLoweringInfo &setIsPostTypeLegalization(bool Value=true) {
       IsPostTypeLegalization = Value;
       return *this;
diff --git a/llvm/include/llvm/IR/Argument.h b/llvm/include/llvm/IR/Argument.h
--- a/llvm/include/llvm/IR/Argument.h
+++ b/llvm/include/llvm/IR/Argument.h
@@ -71,9 +71,9 @@
   /// Return true if this argument has the swifterror attribute.
   bool hasSwiftErrorAttr() const;
 
-  /// Return true if this argument has the byval attribute or inalloca
+  /// Return true if this argument has the byval, inalloca, or preallocated
   /// attribute. These attributes represent arguments being passed by value.
-  bool hasByValOrInAllocaAttr() const;
+  bool isPassPointeeByValue() const;
 
   /// If this is a byval or inalloca argument, return its alignment.
   /// FIXME: Remove this function once transition to Align is over.
diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h
--- a/llvm/include/llvm/IR/Attributes.h
+++ b/llvm/include/llvm/IR/Attributes.h
@@ -626,6 +626,9 @@
   /// Return the byval type for the specified function parameter.
   Type *getParamByValType(unsigned ArgNo) const;
 
+  /// Return the preallocated type for the specified function parameter.
+  Type *getParamPreallocatedType(unsigned ArgNo) const;
+
   /// Get the stack alignment.
   MaybeAlign getStackAlignment(unsigned Index) const;
 
diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h
--- a/llvm/include/llvm/IR/InstrTypes.h
+++ b/llvm/include/llvm/IR/InstrTypes.h
@@ -1602,6 +1602,12 @@
     return Ty ? Ty : getArgOperand(ArgNo)->getType()->getPointerElementType();
   }
 
+  /// Extract the preallocated type for a call or parameter.
+  Type *getParamPreallocatedType(unsigned ArgNo) const {
+    Type *Ty = Attrs.getParamPreallocatedType(ArgNo);
+    return Ty ? Ty : getArgOperand(ArgNo)->getType()->getPointerElementType();
+  }
+
   /// Extract the number of dereferenceable bytes for a call or
   /// parameter (0=unknown).
   uint64_t getDereferenceableBytes(unsigned i) const {
diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def
--- a/llvm/include/llvm/Support/TargetOpcodes.def
+++ b/llvm/include/llvm/Support/TargetOpcodes.def
@@ -127,6 +127,12 @@
 /// additionally expand this pseudo after register allocation.
 HANDLE_TARGET_OPCODE(LOAD_STACK_GUARD)
 
+/// These are used to support call sites that must have the stack adjusted
+/// before the call (e.g. to initialize an argument passed by value).
+/// See llvm.call.preallocated.{setup,arg} in the LangRef for more details.
+HANDLE_TARGET_OPCODE(PREALLOCATED_SETUP)
+HANDLE_TARGET_OPCODE(PREALLOCATED_ARG)
+
 /// Call instruction with associated vm state for deoptimization and list
 /// of live pointers for relocation by the garbage collector.  It is
 /// intended to support garbage collection with fully precise relocating
diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td
--- a/llvm/include/llvm/Target/Target.td
+++ b/llvm/include/llvm/Target/Target.td
@@ -1173,6 +1173,18 @@
   let hasSideEffects = 0;
   bit isPseudo = 1;
 }
+def PREALLOCATED_SETUP : StandardPseudoInstruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins i32imm:$a);
+  let usesCustomInserter = 1;
+  let hasSideEffects = 1;
+}
+def PREALLOCATED_ARG : StandardPseudoInstruction {
+  let OutOperandList = (outs ptr_rc:$loc);
+  let InOperandList = (ins i32imm:$a, i32imm:$b);
+  let usesCustomInserter = 1;
+  let hasSideEffects = 1;
+}
 def LOCAL_ESCAPE : StandardPseudoInstruction {
   // This instruction is really just a label. It has to be part of the chain so
   // that it doesn't get dropped from the DAG, but it produces nothing and has
diff --git a/llvm/include/llvm/Target/TargetCallingConv.td b/llvm/include/llvm/Target/TargetCallingConv.td
--- a/llvm/include/llvm/Target/TargetCallingConv.td
+++ b/llvm/include/llvm/Target/TargetCallingConv.td
@@ -41,6 +41,11 @@
 class CCIfByVal<CCAction A> : CCIf<"ArgFlags.isByVal()", A> {
 }
 
+/// CCIfPreallocated - If the current argument has the preallocated parameter
+/// attribute, apply Action A.
+class CCIfPreallocated<CCAction A> : CCIf<"ArgFlags.isPreallocated()", A> {
+}
+
 /// CCIfSwiftSelf - If the current argument has swiftself parameter attribute,
 /// apply Action A.
 class CCIfSwiftSelf<CCAction A> : CCIf<"ArgFlags.isSwiftSelf()", A> {
diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp
--- a/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -673,7 +673,7 @@
 
 SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) {
   // No interprocedural analysis is done at the moment.
-  if (!A.hasByValOrInAllocaAttr()) {
+  if (!A.isPassPointeeByValue()) {
     ++ObjectVisitorArgument;
     return unknown();
   }
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -2339,7 +2339,7 @@
 
     // A byval, inalloca, or nonnull argument is never null.
     if (const Argument *A = dyn_cast<Argument>(V))
-      if (A->hasByValOrInAllocaAttr() || A->hasNonNullAttr())
+      if (A->isPassPointeeByValue() || A->hasNonNullAttr())
         return true;
 
     // A Load tagged with nonnull metadata is never null.
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -96,10 +96,12 @@
     Flags.setSwiftError();
   if (Attrs.hasAttribute(OpIdx, Attribute::ByVal))
     Flags.setByVal();
+  if (Attrs.hasAttribute(OpIdx, Attribute::Preallocated))
+    Flags.setPreallocated();
   if (Attrs.hasAttribute(OpIdx, Attribute::InAlloca))
     Flags.setInAlloca();
 
-  if (Flags.isByVal() || Flags.isInAlloca()) {
+  if (Flags.isByVal() || Flags.isInAlloca() || Flags.isPreallocated()) {
     Type *ElementTy = cast<PointerType>(Arg.Ty)->getElementType();
 
     auto Ty = Attrs.getAttribute(OpIdx, Attribute::ByVal).getValueAsType();
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -1873,9 +1873,6 @@
 }
 
 SDValue SelectionDAG::getSrcValue(const Value *V) {
-  assert((!V || V->getType()->isPointerTy()) &&
-         "SrcValue is not a pointer?");
-
   FoldingSetNodeID ID;
   AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), None);
   ID.AddPointer(V);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5796,6 +5796,45 @@
     updateDAGForMaybeTailCall(MC);
     return;
   }
+  case Intrinsic::call_preallocated_setup:
+  case Intrinsic::call_preallocated_arg: {
+    const Value *PreallocatedCall =
+        Intrinsic == Intrinsic::call_preallocated_setup ? &I : I.getOperand(0);
+    assert(cast<CallBase>(PreallocatedCall)
+                   ->getCalledFunction()
+                   ->getIntrinsicID() == Intrinsic::call_preallocated_setup &&
+           "expected call_preallocated_setup Value");
+    const CallBase *Call = nullptr;
+    for (auto U : PreallocatedCall->users()) {
+      auto *UseCall = cast<CallBase>(U);
+      const Function *Fn = UseCall->getCalledFunction();
+      if (!Fn || Fn->getIntrinsicID() != Intrinsic::call_preallocated_arg) {
+        Call = UseCall;
+        break;
+      }
+    }
+    assert(Call && "expected corresponding call to preallocated setup/arg");
+    SDValue SrcValue = DAG.getSrcValue(Call);
+    if (Intrinsic == Intrinsic::call_preallocated_setup) {
+      SDValue Res = DAG.getNode(ISD::PREALLOCATED_SETUP, sdl, MVT::Other,
+                                getRoot(), SrcValue);
+      setValue(&I, Res);
+      DAG.setRoot(Res);
+    } else {
+      SDValue Ops[3];
+      Ops[0] = getRoot();
+      Ops[1] = SrcValue;
+      Ops[2] = DAG.getTargetConstant(*cast<ConstantInt>(I.getArgOperand(1)),
+                                     sdl, MVT::i32); // arg index
+      SDValue Res = DAG.getNode(
+          ISD::PREALLOCATED_ARG, sdl,
+          DAG.getVTList(TLI.getPointerTy(DAG.getDataLayout()), MVT::Other),
+          Ops);
+      setValue(&I, Res);
+      DAG.setRoot(Res.getValue(1));
+    }
+    return;
+  }
   case Intrinsic::dbg_addr:
   case Intrinsic::dbg_declare: {
     const auto &DI = cast<DbgVariableIntrinsic>(I);
@@ -7116,7 +7155,9 @@
       .setChain(getRoot())
       .setCallee(RetTy, FTy, Callee, std::move(Args), CB)
       .setTailCall(isTailCall)
-      .setConvergent(CB.isConvergent());
+      .setConvergent(CB.isConvergent())
+      .setIsPreallocated(
+          CB.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0);
   std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);
 
   if (Result.first.getNode()) {
@@ -7642,9 +7683,9 @@
   // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
   // have to do anything here to lower funclet bundles.
   // CFGuardTarget bundles are lowered in LowerCallTo.
-  assert(!I.hasOperandBundlesOtherThan({LLVMContext::OB_deopt,
-                                        LLVMContext::OB_funclet,
-                                        LLVMContext::OB_cfguardtarget}) &&
+  assert(!I.hasOperandBundlesOtherThan(
+             {LLVMContext::OB_deopt, LLVMContext::OB_funclet,
+              LLVMContext::OB_cfguardtarget, LLVMContext::OB_preallocated}) &&
          "Cannot lower calls with arbitrary operand bundles!");
 
   SDValue Callee = getValue(I.getCalledOperand());
@@ -8605,7 +8646,9 @@
       .setChain(getRoot())
       .setCallee(Call->getCallingConv(), ReturnTy, Callee, std::move(Args))
       .setDiscardResult(Call->use_empty())
-      .setIsPatchPoint(IsPatchPoint);
+      .setIsPatchPoint(IsPatchPoint)
+      .setIsPreallocated(
+          Call->countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0);
 }
 
 /// Add a stack map intrinsic call's live variable operands to a stackmap
@@ -9125,6 +9168,8 @@
         Flags.setCFGuardTarget();
       if (Args[i].IsByVal)
         Flags.setByVal();
+      if (Args[i].IsPreallocated)
+        Flags.setPreallocated();
       if (Args[i].IsInAlloca) {
         Flags.setInAlloca();
         // Set the byval flag for CCAssignFn callbacks that don't know about
@@ -9134,7 +9179,7 @@
         // in the various CC lowering callbacks.
         Flags.setByVal();
       }
-      if (Args[i].IsByVal || Args[i].IsInAlloca) {
+      if (Args[i].IsByVal || Args[i].IsInAlloca || Args[i].IsPreallocated) {
         PointerType *Ty = cast<PointerType>(Args[i].Ty);
         Type *ElementTy = Ty->getElementType();
 
@@ -9633,12 +9678,21 @@
         // in the various CC lowering callbacks.
         Flags.setByVal();
       }
+      if (Arg.hasAttribute(Attribute::Preallocated)) {
+        Flags.setPreallocated();
+        // Set the byval flag for CCAssignFn callbacks that don't know about
+        // preallocated.  This way we can know how many bytes we should've
+        // allocated and how many bytes a callee cleanup function will pop.
+        // If we port preallocated to more targets, we'll have to add custom
+        // preallocated handling in the various CC lowering callbacks.
+        Flags.setByVal();
+      }
       if (F.getCallingConv() == CallingConv::X86_INTR) {
         // IA Interrupt passes frame (1st parameter) by value in the stack.
         if (ArgNo == 0)
           Flags.setByVal();
       }
-      if (Flags.isByVal() || Flags.isInAlloca()) {
+      if (Flags.isByVal() || Flags.isInAlloca() || Flags.isPreallocated()) {
         Type *ElementTy = Arg.getParamByValType();
 
         // For ByVal, size and alignment should be passed from FE.  BE will
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -393,6 +393,10 @@
   case ISD::GC_TRANSITION_END:          return "gc_transition.end";
   case ISD::GET_DYNAMIC_AREA_OFFSET:    return "get.dynamic.area.offset";
   case ISD::FREEZE:                     return "freeze";
+  case ISD::PREALLOCATED_SETUP:
+    return "call_setup";
+  case ISD::PREALLOCATED_ARG:
+    return "call_alloc";
 
   // Bit manipulation
   case ISD::ABS:                        return "abs";
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -110,14 +110,18 @@
   IsSRet = Call->paramHasAttr(ArgIdx, Attribute::StructRet);
   IsNest = Call->paramHasAttr(ArgIdx, Attribute::Nest);
   IsByVal = Call->paramHasAttr(ArgIdx, Attribute::ByVal);
+  IsPreallocated = Call->paramHasAttr(ArgIdx, Attribute::Preallocated);
   IsInAlloca = Call->paramHasAttr(ArgIdx, Attribute::InAlloca);
   IsReturned = Call->paramHasAttr(ArgIdx, Attribute::Returned);
   IsSwiftSelf = Call->paramHasAttr(ArgIdx, Attribute::SwiftSelf);
   IsSwiftError = Call->paramHasAttr(ArgIdx, Attribute::SwiftError);
   Alignment = Call->getParamAlign(ArgIdx);
   ByValType = nullptr;
-  if (Call->paramHasAttr(ArgIdx, Attribute::ByVal))
+  if (IsByVal)
     ByValType = Call->getParamByValType(ArgIdx);
+  PreallocatedType = nullptr;
+  if (IsPreallocated)
+    PreallocatedType = Call->getParamPreallocatedType(ArgIdx);
 }
 
 /// Generate a libcall taking the given operands as arguments and returning a
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -1443,6 +1443,10 @@
   return getAttributes(Index+FirstArgIndex).getByValType();
 }
 
+Type *AttributeList::getParamPreallocatedType(unsigned Index) const {
+  return getAttributes(Index + FirstArgIndex).getPreallocatedType();
+}
+
 MaybeAlign AttributeList::getStackAlignment(unsigned Index) const {
   return getAttributes(Index).getStackAlignment();
 }
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -114,11 +114,12 @@
   return hasAttribute(Attribute::InAlloca);
 }
 
-bool Argument::hasByValOrInAllocaAttr() const {
+bool Argument::isPassPointeeByValue() const {
   if (!getType()->isPointerTy()) return false;
   AttributeList Attrs = getParent()->getAttributes();
   return Attrs.hasParamAttribute(getArgNo(), Attribute::ByVal) ||
-         Attrs.hasParamAttribute(getArgNo(), Attribute::InAlloca);
+         Attrs.hasParamAttribute(getArgNo(), Attribute::InAlloca) ||
+         Attrs.hasParamAttribute(getArgNo(), Attribute::Preallocated);
 }
 
 unsigned Argument::getParamAlignment() const {
diff --git a/llvm/lib/IR/Mangler.cpp b/llvm/lib/IR/Mangler.cpp
--- a/llvm/lib/IR/Mangler.cpp
+++ b/llvm/lib/IR/Mangler.cpp
@@ -98,7 +98,7 @@
        AI != AE; ++AI) {
     Type *Ty = AI->getType();
     // 'Dereference' type in case of byval or inalloca parameter attribute.
-    if (AI->hasByValOrInAllocaAttr())
+    if (AI->isPassPointeeByValue())
       Ty = cast<PointerType>(Ty)->getElementType();
     // Size should be aligned to pointer size.
     unsigned PtrSize = DL.getPointerSize();
diff --git a/llvm/lib/Target/ARM/ARMCallLowering.cpp b/llvm/lib/Target/ARM/ARMCallLowering.cpp
--- a/llvm/lib/Target/ARM/ARMCallLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -429,7 +429,7 @@
   for (auto &Arg : F.args()) {
     if (!isSupportedType(DL, TLI, Arg.getType()))
       return false;
-    if (Arg.hasByValOrInAllocaAttr())
+    if (Arg.isPassPointeeByValue())
       return false;
   }
 
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -789,8 +789,9 @@
 /// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
 /// values are spilled on the stack.
 def CC_X86_32_Common : CallingConv<[
-  // Handles byval parameters.
+  // Handles byval/preallocated parameters.
   CCIfByVal<CCPassByVal<4, 4>>,
+  CCIfPreallocated<CCPassByVal<4, 4>>,
 
   // The first 3 float or double arguments, if marked 'inreg' and if the call
   // is not a vararg call and if SSE2 is available, are passed in SSE registers.
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -57,7 +57,8 @@
 
 bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
   return !MF.getFrameInfo().hasVarSizedObjects() &&
-         !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+         !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences() &&
+         !MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall();
 }
 
 /// canSimplifyCallFramePseudos - If there is a reserved call frame, the
@@ -67,6 +68,7 @@
 bool
 X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
   return hasReservedCallFrame(MF) ||
+         MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
          (hasFP(MF) && !TRI->needsStackRealignment(MF)) ||
          TRI->hasBasePointer(MF);
 }
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -5625,6 +5625,39 @@
     CurDAG->RemoveDeadNode(Node);
     return;
   }
+  case ISD::PREALLOCATED_SETUP: {
+    auto MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+    auto CallId = MFI->PreallocatedIdForCallSite(
+        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
+    SDValue Chain = Node->getOperand(0);
+    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
+    MachineSDNode *New = CurDAG->getMachineNode(
+        TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
+    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
+    CurDAG->RemoveDeadNode(Node);
+    return;
+  }
+  case ISD::PREALLOCATED_ARG: {
+    auto MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+    auto CallId = MFI->PreallocatedIdForCallSite(
+        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
+    SDValue Chain = Node->getOperand(0);
+    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
+    SDValue ArgIndex = Node->getOperand(2);
+    SDValue Ops[3];
+    Ops[0] = CallIdValue;
+    Ops[1] = ArgIndex;
+    Ops[2] = Chain;
+    MachineSDNode *New = CurDAG->getMachineNode(
+        TargetOpcode::PREALLOCATED_ARG, dl,
+        CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
+                          MVT::Other),
+        Ops);
+    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
+    ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
+    CurDAG->RemoveDeadNode(Node);
+    return;
+  }
   }
 
   SelectCode(Node);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -3893,6 +3893,22 @@
     if (ArgLocs.back().getLocMemOffset() != 0)
       report_fatal_error("any parameter with the inalloca attribute must be "
                          "the only memory argument");
+  } else if (CLI.IsPreallocated) {
+    if (!ArgLocs.back().isMemLoc()) {
+      report_fatal_error("cannot use preallocated attribute on a register "
+                         "parameter");
+    }
+    SmallVector<size_t, 4> PreallocatedOffsets;
+    for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
+      if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
+        PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
+      }
+    }
+    auto MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
+    size_t PreallocatedId = MFI->PreallocatedIdForCallSite(CLI.CB);
+    MFI->SetPreallocatedStackSize(PreallocatedId, NumBytes);
+    MFI->SetPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
+    NumBytesToPush = 0;
   }
 
   if (!IsSibcall && !IsMustTail)
@@ -3920,9 +3936,9 @@
   for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
        ++I, ++OutIndex) {
     assert(OutIndex < Outs.size() && "Invalid Out index");
-    // Skip inalloca arguments, they have already been written.
+    // Skip inalloca/preallocated arguments, they have already been written.
     ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
-    if (Flags.isInAlloca())
+    if (Flags.isInAlloca() || Flags.isPreallocated())
       continue;
 
     CCValAssign &VA = ArgLocs[I];
@@ -4110,8 +4126,8 @@
       assert(VA.isMemLoc());
       SDValue Arg = OutVals[OutsIndex];
       ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
-      // Skip inalloca arguments.  They don't require any work.
-      if (Flags.isInAlloca())
+      // Skip inalloca/preallocated arguments.  They don't require any work.
+      if (Flags.isInAlloca() || Flags.isPreallocated())
         continue;
       // Create frame index.
       int32_t Offset = VA.getLocMemOffset()+FPDiff;
@@ -33072,6 +33088,38 @@
       BB->addLiveIn(BasePtr);
     return BB;
   }
+  case TargetOpcode::PREALLOCATED_SETUP: {
+    assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
+    auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+    MFI->setHasPreallocatedCall(true);
+    // Offsets from the stack pointer may be wrong because we have adjusted it
+    MFI->setForceFramePointer(true);
+    int64_t PreallocatedId = MI.getOperand(0).getImm();
+    size_t StackAdjustment = MFI->GetPreallocatedStackSize(PreallocatedId);
+    assert(StackAdjustment != 0 && "0 stack adjustment");
+    LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
+                      << StackAdjustment << "\n");
+    BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
+        .addReg(X86::ESP)
+        .addImm(StackAdjustment);
+    MI.eraseFromParent();
+    return BB;
+  }
+  case TargetOpcode::PREALLOCATED_ARG: {
+    assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
+    int64_t PreallocatedId = MI.getOperand(1).getImm();
+    int64_t ArgIdx = MI.getOperand(2).getImm();
+    auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+    size_t ArgOffset = MFI->GetPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
+    LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
+                      << ", arg offset " << ArgOffset << "\n");
+    // Compute the argument address: stack pointer + offset.
+    addRegOffset(
+        BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
+        X86::ESP, false, ArgOffset);
+    MI.eraseFromParent();
+    return BB;
+  }
   }
 }
 
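As a sketch of the expansion performed by the custom inserters above: for a
call site with a single preallocated %Foo argument (8 bytes), the two pseudos
become roughly the following i686 assembly (the offset and result register
are illustrative; they come from the argument layout recorded at call
lowering, and the CHECK lines in the tests below show actual output):

  subl $8, %esp        # PREALLOCATED_SETUP: reserve the call's argument area
  leal (%esp), %eax    # PREALLOCATED_ARG 0: address of that arg's stack slot
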
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -103,6 +103,14 @@
   /// True if this function has WIN_ALLOCA instructions.
   bool HasWinAlloca = false;
 
+  /// True if this function has any preallocated calls.
+  bool HasPreallocatedCall = false;
+
+  size_t PreallocatedNextId = 0;
+  ValueMap<const Value *, size_t> PreallocatedIds;
+  DenseMap<size_t, size_t> PreallocatedStackSizes;
+  DenseMap<size_t, SmallVector<size_t, 4>> PreallocatedArgOffsets;
+
 private:
   /// ForwardedMustTailRegParms - A list of virtual and physical registers
   /// that must be forwarded to every musttail call.
@@ -184,6 +192,36 @@
 
   bool hasWinAlloca() const { return HasWinAlloca; }
   void setHasWinAlloca(bool v) { HasWinAlloca = v; }
+
+  bool hasPreallocatedCall() const { return HasPreallocatedCall; }
+  void setHasPreallocatedCall(bool v) { HasPreallocatedCall = v; }
+
+  size_t PreallocatedIdForCallSite(const Value *CS) {
+    auto Insert = PreallocatedIds.insert({CS, PreallocatedNextId});
+    if (Insert.second)
+      ++PreallocatedNextId;
+    return Insert.first->second;
+  }
+
+  void SetPreallocatedStackSize(size_t Id, size_t StackSize) {
+    PreallocatedStackSizes[Id] = StackSize;
+  }
+
+  size_t GetPreallocatedStackSize(const size_t Id) {
+    assert(PreallocatedStackSizes.find(Id) != PreallocatedStackSizes.end() &&
+           "stack size not set");
+    return PreallocatedStackSizes[Id];
+  }
+
+  void SetPreallocatedArgOffsets(size_t Id, ArrayRef<size_t> AO) {
+    PreallocatedArgOffsets[Id].assign(AO.begin(), AO.end());
+  }
+
+  const SmallVector<size_t, 4> &GetPreallocatedArgOffsets(const size_t Id) {
+    assert(PreallocatedArgOffsets.find(Id) != PreallocatedArgOffsets.end() &&
+           "arg offsets not set");
+    return PreallocatedArgOffsets[Id];
+  }
 };
 
 } // End llvm namespace
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -627,18 +627,22 @@
 }
 
 bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
-   const MachineFrameInfo &MFI = MF.getFrameInfo();
-
-   if (!EnableBasePointer)
-     return false;
-
-   // When we need stack realignment, we can't address the stack from the frame
-   // pointer.  When we have dynamic allocas or stack-adjusting inline asm, we
-   // can't address variables from the stack pointer.  MS inline asm can
-   // reference locals while also adjusting the stack pointer.  When we can't
-   // use both the SP and the FP, we need a separate base pointer register.
-   bool CantUseFP = needsStackRealignment(MF);
-   return CantUseFP && CantUseSP(MFI);
+  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  if (X86FI->hasPreallocatedCall())
+    return true;
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  if (!EnableBasePointer)
+    return false;
+
+  // When we need stack realignment, we can't address the stack from the frame
+  // pointer.  When we have dynamic allocas or stack-adjusting inline asm, we
+  // can't address variables from the stack pointer.  MS inline asm can
+  // reference locals while also adjusting the stack pointer.  When we can't
+  // use both the SP and the FP, we need a separate base pointer register.
+  bool CantUseFP = needsStackRealignment(MF);
+  return CantUseFP && CantUseSP(MFI);
 }
 
 bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -289,7 +289,8 @@
   bool Changed = false;
 
   for (Argument &Arg : Fn.args()) {
-    if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() && !Arg.hasByValOrInAllocaAttr()) {
+    if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() &&
+        !Arg.isPassPointeeByValue()) {
       if (Arg.isUsedByMetadata()) {
         Arg.replaceAllUsesWith(UndefValue::get(Arg.getType()));
         Changed = true;
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -839,7 +839,7 @@
   // Treat byval or inalloca arguments the same, stores to them are dead at the
   // end of the function.
   for (Argument &AI : BB.getParent()->args())
-    if (AI.hasByValOrInAllocaAttr())
+    if (AI.isPassPointeeByValue())
       DeadStackObjects.insert(&AI);
 
   const DataLayout &DL = BB.getModule()->getDataLayout();
@@ -1549,7 +1549,7 @@
     // Treat byval or inalloca arguments the same as Allocas, stores to them are
     // dead at the end of the function.
     for (Argument &AI : F.args())
-      if (AI.hasByValOrInAllocaAttr())
+      if (AI.isPassPointeeByValue())
         State.InvisibleToCallerBeforeRet.insert(&AI);
     return State;
   }
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -1242,7 +1242,7 @@
   Function *CalledFunc = CB.getCalledFunction();
   for (Argument &Arg : CalledFunc->args()) {
     unsigned Align = Arg.getType()->isPointerTy() ? Arg.getParamAlignment() : 0;
-    if (Align && !Arg.hasByValOrInAllocaAttr() && !Arg.hasNUses(0)) {
+    if (Align && !Arg.isPassPointeeByValue() && !Arg.hasNUses(0)) {
       if (!DTCalculated) {
         DT.recalculate(*CB.getCaller());
         DTCalculated = true;
diff --git a/llvm/test/CodeGen/X86/preallocated-nocall.ll b/llvm/test/CodeGen/X86/preallocated-nocall.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/preallocated-nocall.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+; XFAIL: *
+
+declare token @llvm.call.preallocated.setup(i32)
+declare i8* @llvm.call.preallocated.arg(token, i32)
+
+%Foo = type { i32, i32 }
+
+declare void @init(%Foo*)
+
+
+
+declare void @foo_p(%Foo* preallocated(%Foo))
+
+define void @no_call() {
+; CHECK-LABEL: _no_call:
+  %t = call token @llvm.call.preallocated.setup(i32 1)
+  %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo)
+  %b = bitcast i8* %a to %Foo*
+  call void @init(%Foo* %b)
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/preallocated-x64.ll b/llvm/test/CodeGen/X86/preallocated-x64.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/preallocated-x64.ll
@@ -0,0 +1,18 @@
+; RUN: not --crash llc %s -mtriple=x86_64-windows-msvc -o /dev/null 2>&1 | FileCheck %s
+
+declare token @llvm.call.preallocated.setup(i32)
+declare i8* @llvm.call.preallocated.arg(token, i32)
+
+%Foo = type { i32, i32 }
+
+declare x86_thiscallcc void @f(i32, %Foo* preallocated(%Foo))
+
+define void @g() {
+  %t = call token @llvm.call.preallocated.setup(i32 1)
+  %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo)
+  %b = bitcast i8* %a to %Foo*
+  call void @f(i32 0, %Foo* preallocated(%Foo) %b) ["preallocated"(token %t)]
+  ret void
+}
+
+; CHECK: cannot use preallocated attribute on a register parameter
diff --git a/llvm/test/CodeGen/X86/preallocated.ll b/llvm/test/CodeGen/X86/preallocated.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/preallocated.ll
@@ -0,0 +1,174 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+declare token @llvm.call.preallocated.setup(i32)
+declare i8* @llvm.call.preallocated.arg(token, i32)
+
+%Foo = type { i32, i32 }
+
+declare void @init(%Foo*)
+
+
+
+declare void @foo_p(%Foo* preallocated(%Foo))
+
+define void @one_preallocated() {
+; CHECK-LABEL: _one_preallocated:
+  %t = call token @llvm.call.preallocated.setup(i32 1)
+  %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo)
+  %b = bitcast i8* %a to %Foo*
+; CHECK: subl $8, %esp
+; CHECK: calll _foo_p
+  call void @foo_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)]
+  ret void
+}
+
+define void @preallocated_with_store() {
+; CHECK-LABEL: _preallocated_with_store:
+; CHECK: subl $8, %esp
+  %t = call token @llvm.call.preallocated.setup(i32 1)
+; CHECK: leal (%esp), [[REGISTER:%[a-z]+]]
+  %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo)
+  %b = bitcast i8* %a to %Foo*
+  %p0 = getelementptr %Foo, %Foo* %b, i32 0, i32 0
+  %p1 = getelementptr %Foo, %Foo* %b, i32 0, i32 1
+  store i32 13, i32* %p0
+  store i32 42, i32* %p1
+; CHECK-DAG: movl $13, ([[REGISTER]])
+; CHECK-DAG: movl $42, 4([[REGISTER]])
+; CHECK-NOT: subl {{\$[0-9]+}}, %esp
+; CHECK-NOT: pushl
+; CHECK: calll _foo_p
+  call void @foo_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)]
+  ret void
+}
+
+define void @preallocated_with_init() {
+; CHECK-LABEL: _preallocated_with_init:
+; CHECK: subl $8, %esp
+  %t = call token @llvm.call.preallocated.setup(i32 1)
+; CHECK: leal (%esp), [[REGISTER:%[a-z]+]]
+  %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo)
+  %b = bitcast i8* %a to %Foo*
+; CHECK: pushl [[REGISTER]]
+; CHECK: calll _init
+  call void @init(%Foo* %b)
+; CHECK-NOT: subl {{\$[0-9]+}}, %esp
+; CHECK-NOT: pushl
+; CHECK: calll _foo_p
+  call void @foo_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)]
+  ret void
+}
+
+declare void @foo_p_p(%Foo* preallocated(%Foo), %Foo* preallocated(%Foo))
+
+define void @two_preallocated() {
+; CHECK-LABEL: _two_preallocated:
+  %t = call token @llvm.call.preallocated.setup(i32 2)
+  %a1 = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo)
+  %b1 = bitcast i8* %a1 to %Foo*
+  %a2 = call i8* @llvm.call.preallocated.arg(token %t, i32 1) preallocated(%Foo)
+  %b2 = bitcast i8* %a2 to %Foo*
+; CHECK: subl $16, %esp
+; CHECK: calll _foo_p_p
+  call void @foo_p_p(%Foo* preallocated(%Foo) %b1, %Foo* preallocated(%Foo) %b2) ["preallocated"(token %t)]
+  ret void
+}
+
+declare void @foo_p_int(%Foo* preallocated(%Foo), i32)
+
+define void @one_preallocated_one_normal() {
+; CHECK-LABEL: _one_preallocated_one_normal:
+; CHECK: subl $12, %esp
+  %t = call token @llvm.call.preallocated.setup(i32 1)
+; CHECK: leal (%esp), [[REGISTER:%[a-z]+]]
+  %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo)
+  %b = bitcast i8* %a to %Foo*
+; CHECK: pushl [[REGISTER]]
+; CHECK: calll _init
+  call void @init(%Foo* %b)
+; CHECK-NOT: subl {{\$[0-9]+}}, %esp
+; CHECK-NOT: pushl
+; CHECK: movl $2, 8(%esp)
+; CHECK: calll _foo_p_int
+  call void @foo_p_int(%Foo* preallocated(%Foo) %b, i32 2) ["preallocated"(token %t)]
+  ret void
+}
+
+declare void @foo_ret_p(%Foo* sret, %Foo* preallocated(%Foo))
+
+define void @nested_with_init() {
+; CHECK-LABEL: _nested_with_init:
+  %tmp = alloca %Foo
+
+  %t1 = call token @llvm.call.preallocated.setup(i32 1)
+; CHECK: subl $12, %esp
+  %a1 = call i8* @llvm.call.preallocated.arg(token %t1, i32 0) preallocated(%Foo)
+  %b1 = bitcast i8* %a1 to %Foo*
+; CHECK: leal 4(%esp), [[REGISTER1:%[a-z]+]]
+
+  %t2 = call token @llvm.call.preallocated.setup(i32 1)
+; CHECK: subl $12, %esp
+  %a2 = call i8* @llvm.call.preallocated.arg(token %t2, i32 0) preallocated(%Foo)
+; CHECK: leal 4(%esp), [[REGISTER2:%[a-z]+]]
+  %b2 = bitcast i8* %a2 to %Foo*
+
+  call void @init(%Foo* %b2)
+; CHECK: pushl [[REGISTER2]]
+; CHECK: calll _init
+
+  call void @foo_ret_p(%Foo* %b1, %Foo* preallocated(%Foo) %b2) ["preallocated"(token %t2)]
+; CHECK-NOT: subl {{\$[0-9]+}}, %esp
+; CHECK-NOT: pushl
+; CHECK: calll _foo_ret_p
+  call void @foo_ret_p(%Foo* %tmp, %Foo* preallocated(%Foo) %b1) ["preallocated"(token %t1)]
+; CHECK-NOT: subl {{\$[0-9]+}}, %esp
+; CHECK-NOT: pushl
+; CHECK: calll _foo_ret_p
+  ret void
+}
+
+declare void @foo_inreg_p(i32 inreg, %Foo* preallocated(%Foo))
+
+define void @inreg() {
+; CHECK-LABEL: _inreg:
+  %t = call token @llvm.call.preallocated.setup(i32 1)
+  %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo)
+  %b = bitcast i8* %a to %Foo*
+; CHECK: subl $8, %esp
+; CHECK: movl $9, %eax
+; CHECK: calll _foo_inreg_p
+  call void @foo_inreg_p(i32 9, %Foo* preallocated(%Foo) %b) ["preallocated"(token %t)]
+  ret void
+}
+
+declare x86_thiscallcc void @foo_thiscall_p(i8*, %Foo* preallocated(%Foo))
+
+define void @thiscall() {
+; CHECK-LABEL: _thiscall:
+  %t = call token @llvm.call.preallocated.setup(i32 1)
+  %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo)
+  %b = bitcast i8* %a to %Foo*
+; CHECK: subl $8, %esp
+; CHECK: xorl %ecx, %ecx
+; CHECK: calll _foo_thiscall_p
+  call x86_thiscallcc void @foo_thiscall_p(i8* null, %Foo* preallocated(%Foo) %b) ["preallocated"(token %t)]
+  ret void
+}
+
+declare x86_stdcallcc void @foo_stdcall_p(%Foo* preallocated(%Foo))
+declare x86_stdcallcc void @i(i32)
+
+define void @stdcall() {
+; CHECK-LABEL: _stdcall:
+  %t = call token @llvm.call.preallocated.setup(i32 1)
+  %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo)
+  %b = bitcast i8* %a to %Foo*
+; CHECK: subl $8, %esp
+; CHECK: calll _foo_stdcall_p@8
+  call x86_stdcallcc void @foo_stdcall_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)]
+; CHECK-NOT: %esp
+; CHECK: pushl
+; CHECK: calll _i@4
+  call x86_stdcallcc void @i(i32 0)
+  ret void
+}