diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -797,6 +797,13 @@ /// pointer, and a SRCVALUE. VAEND, VASTART, + // PREALLOCATED_SETUP - This has 2 operands: an input chain and a SRCVALUE + // with the preallocated call Value. + PREALLOCATED_SETUP, + // PREALLOCATED_ARG - This has 3 operands: an input chain, a SRCVALUE + // with the preallocated call Value, and a constant int. + PREALLOCATED_ARG, + /// SRCVALUE - This is a node type that holds a Value* that is used to /// make reference to a value in the LLVM IR. SRCVALUE, diff --git a/llvm/include/llvm/CodeGen/TargetCallingConv.h b/llvm/include/llvm/CodeGen/TargetCallingConv.h --- a/llvm/include/llvm/CodeGen/TargetCallingConv.h +++ b/llvm/include/llvm/CodeGen/TargetCallingConv.h @@ -35,6 +35,7 @@ unsigned IsReturned : 1; ///< Always returned unsigned IsSplit : 1; unsigned IsInAlloca : 1; ///< Passed with inalloca + unsigned IsPreallocated : 1; ///< ByVal without the copy unsigned IsSplitEnd : 1; ///< Last part of a split unsigned IsSwiftSelf : 1; ///< Swift self parameter unsigned IsSwiftError : 1; ///< Swift error parameter @@ -56,9 +57,9 @@ public: ArgFlagsTy() : IsZExt(0), IsSExt(0), IsInReg(0), IsSRet(0), IsByVal(0), IsNest(0), - IsReturned(0), IsSplit(0), IsInAlloca(0), IsSplitEnd(0), - IsSwiftSelf(0), IsSwiftError(0), IsCFGuardTarget(0), IsHva(0), - IsHvaStart(0), IsSecArgPass(0), ByValAlign(0), OrigAlign(0), + IsReturned(0), IsSplit(0), IsInAlloca(0), IsPreallocated(0), + IsSplitEnd(0), IsSwiftSelf(0), IsSwiftError(0), IsCFGuardTarget(0), + IsHva(0), IsHvaStart(0), IsSecArgPass(0), ByValAlign(0), OrigAlign(0), IsInConsecutiveRegsLast(0), IsInConsecutiveRegs(0), IsCopyElisionCandidate(0), IsPointer(0), ByValSize(0), PointerAddrSpace(0) { @@ -83,6 +84,9 @@ bool isInAlloca() const { return IsInAlloca; } void setInAlloca() { IsInAlloca = 1; } + bool isPreallocated() const { return IsPreallocated; } + void setPreallocated() { IsPreallocated = 1; } + bool isSwiftSelf() const { return IsSwiftSelf; } void setSwiftSelf() { IsSwiftSelf = 1; } diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -273,17 +273,20 @@ bool IsNest : 1; bool IsByVal : 1; bool IsInAlloca : 1; + bool IsPreallocated : 1; bool IsReturned : 1; bool IsSwiftSelf : 1; bool IsSwiftError : 1; bool IsCFGuardTarget : 1; MaybeAlign Alignment = None; Type *ByValType = nullptr; + Type *PreallocatedType = nullptr; ArgListEntry() : IsSExt(false), IsZExt(false), IsInReg(false), IsSRet(false), - IsNest(false), IsByVal(false), IsInAlloca(false), IsReturned(false), - IsSwiftSelf(false), IsSwiftError(false), IsCFGuardTarget(false) {} + IsNest(false), IsByVal(false), IsInAlloca(false), + IsPreallocated(false), IsReturned(false), IsSwiftSelf(false), + IsSwiftError(false), IsCFGuardTarget(false) {} void setAttributes(const CallBase *Call, unsigned ArgIdx); }; @@ -3608,6 +3611,7 @@ bool IsReturnValueUsed : 1; bool IsConvergent : 1; bool IsPatchPoint : 1; + bool IsPreallocated : 1; // IsTailCall should be modified by implementations of // TargetLowering::LowerCall that perform tail call conversions. 
@@ -3631,7 +3635,7 @@ CallLoweringInfo(SelectionDAG &DAG) : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false), DoesNotReturn(false), IsReturnValueUsed(true), IsConvergent(false), - IsPatchPoint(false), DAG(DAG) {} + IsPatchPoint(false), IsPreallocated(false), DAG(DAG) {} CallLoweringInfo &setDebugLoc(const SDLoc &dl) { DL = dl; @@ -3737,6 +3741,11 @@ return *this; } + CallLoweringInfo &setIsPreallocated(bool Value = true) { + IsPreallocated = Value; + return *this; + } + CallLoweringInfo &setIsPostTypeLegalization(bool Value=true) { IsPostTypeLegalization = Value; return *this; diff --git a/llvm/include/llvm/IR/Argument.h b/llvm/include/llvm/IR/Argument.h --- a/llvm/include/llvm/IR/Argument.h +++ b/llvm/include/llvm/IR/Argument.h @@ -110,6 +110,9 @@ /// Return true if this argument has the inalloca attribute. bool hasInAllocaAttr() const; + /// Return true if this argument has the preallocated attribute. + bool hasPreallocatedAttr() const; + /// Return true if this argument has the zext attribute. bool hasZExtAttr() const; diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -623,6 +623,9 @@ /// Return the byval type for the specified function parameter. Type *getParamByValType(unsigned ArgNo) const; + /// Return the preallocated type for the specified function parameter. + Type *getParamPreallocatedType(unsigned ArgNo) const; + /// Get the stack alignment. MaybeAlign getStackAlignment(unsigned Index) const; diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -1604,6 +1604,12 @@ return Ty ? Ty : getArgOperand(ArgNo)->getType()->getPointerElementType(); } + /// Extract the preallocated type for a call or parameter. + Type *getParamPreallocatedType(unsigned ArgNo) const { + Type *Ty = Attrs.getParamPreallocatedType(ArgNo); + return Ty ? Ty : getArgOperand(ArgNo)->getType()->getPointerElementType(); + } + /// Extract the number of dereferenceable bytes for a call or /// parameter (0=unknown). uint64_t getDereferenceableBytes(unsigned i) const { diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -127,6 +127,12 @@ /// additionally expand this pseudo after register allocation. HANDLE_TARGET_OPCODE(LOAD_STACK_GUARD) +/// These are used to support call sites that must have the stack adjusted +/// before the call (e.g. to initialize an argument passed by value). +/// See llvm.call.preallocated.{setup,arg} in the LangRef for more details. +HANDLE_TARGET_OPCODE(PREALLOCATED_SETUP) +HANDLE_TARGET_OPCODE(PREALLOCATED_ARG) + /// Call instruction with associated vm state for deoptimization and list /// of live pointers for relocation by the garbage collector. 
It is /// intended to support garbage collection with fully precise relocating diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1173,6 +1173,18 @@ let hasSideEffects = 0; bit isPseudo = 1; } +def PREALLOCATED_SETUP : StandardPseudoInstruction { + let OutOperandList = (outs); + let InOperandList = (ins i32imm:$a); + let usesCustomInserter = 1; + let hasSideEffects = 1; +} +def PREALLOCATED_ARG : StandardPseudoInstruction { + let OutOperandList = (outs ptr_rc:$loc); + let InOperandList = (ins i32imm:$a, i32imm:$b); + let usesCustomInserter = 1; + let hasSideEffects = 1; +} def LOCAL_ESCAPE : StandardPseudoInstruction { // This instruction is really just a label. It has to be part of the chain so // that it doesn't get dropped from the DAG, but it produces nothing and has diff --git a/llvm/include/llvm/Target/TargetCallingConv.td b/llvm/include/llvm/Target/TargetCallingConv.td --- a/llvm/include/llvm/Target/TargetCallingConv.td +++ b/llvm/include/llvm/Target/TargetCallingConv.td @@ -41,6 +41,11 @@ class CCIfByVal<CCAction A> : CCIf<"ArgFlags.isByVal()", A> { } +/// CCIfPreallocated - If the current argument has Preallocated parameter attribute, +/// apply Action A. +class CCIfPreallocated<CCAction A> : CCIf<"ArgFlags.isPreallocated()", A> { +} + /// CCIfSwiftSelf - If the current argument has swiftself parameter attribute, /// apply Action A. class CCIfSwiftSelf<CCAction A> : CCIf<"ArgFlags.isSwiftSelf()", A> { diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp --- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp @@ -96,10 +96,12 @@ Flags.setSwiftError(); if (Attrs.hasAttribute(OpIdx, Attribute::ByVal)) Flags.setByVal(); + if (Attrs.hasAttribute(OpIdx, Attribute::Preallocated)) + Flags.setPreallocated(); if (Attrs.hasAttribute(OpIdx, Attribute::InAlloca)) Flags.setInAlloca(); - if (Flags.isByVal() || Flags.isInAlloca()) { + if (Flags.isByVal() || Flags.isInAlloca() || Flags.isPreallocated()) { Type *ElementTy = cast<PointerType>(Arg.Ty)->getElementType(); auto Ty = Attrs.getAttribute(OpIdx, Attribute::ByVal).getValueAsType(); diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1214,7 +1214,16 @@ // the various CC lowering callbacks. Flags.setByVal(); } - if (Arg.IsByVal || Arg.IsInAlloca) { + if (Arg.IsPreallocated) { + Flags.setPreallocated(); + // Set the byval flag for CCAssignFn callbacks that don't know about + // preallocated. This way we can know how many bytes we should've + // allocated and how many bytes a callee cleanup function will pop. If we + // port preallocated to more targets, we'll have to add custom + // preallocated handling in the various CC lowering callbacks.
+ Flags.setByVal(); + } + if (Arg.IsByVal || Arg.IsInAlloca || Arg.IsPreallocated) { PointerType *Ty = cast<PointerType>(Arg.Ty); Type *ElementTy = Ty->getElementType(); unsigned FrameSize = diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1873,9 +1873,6 @@ } SDValue SelectionDAG::getSrcValue(const Value *V) { - assert((!V || V->getType()->isPointerTy()) && - "SrcValue is not a pointer?"); - FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), None); ID.AddPointer(V); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5799,6 +5799,45 @@ updateDAGForMaybeTailCall(MC); return; } + case Intrinsic::call_preallocated_setup: + case Intrinsic::call_preallocated_arg: { + const Value *PreallocatedCall = + Intrinsic == Intrinsic::call_preallocated_setup ? &I : I.getOperand(0); + assert(cast<CallBase>(PreallocatedCall) + ->getCalledFunction() + ->getIntrinsicID() == Intrinsic::call_preallocated_setup && + "expected call_preallocated_setup Value"); + const CallBase *Call = nullptr; + for (auto U : PreallocatedCall->users()) { + auto *UseCall = cast<CallBase>(U); + const Function *Fn = UseCall->getCalledFunction(); + if (!Fn || Fn->getIntrinsicID() != Intrinsic::call_preallocated_arg) { + Call = UseCall; + break; + } + } + assert(Call && "expected corresponding call to preallocated setup/arg"); + SDValue SrcValue = DAG.getSrcValue(Call); + if (Intrinsic == Intrinsic::call_preallocated_setup) { + SDValue Res = DAG.getNode(ISD::PREALLOCATED_SETUP, sdl, MVT::Other, + getRoot(), SrcValue); + setValue(&I, Res); + DAG.setRoot(Res); + } else { + SDValue Ops[3]; + Ops[0] = getRoot(); + Ops[1] = SrcValue; + Ops[2] = DAG.getTargetConstant(*cast<ConstantInt>(I.getArgOperand(1)), + sdl, MVT::i32); // arg index + SDValue Res = DAG.getNode( + ISD::PREALLOCATED_ARG, sdl, + DAG.getVTList(TLI.getPointerTy(DAG.getDataLayout()), MVT::Other), + Ops); + setValue(&I, Res); + DAG.setRoot(Res.getValue(1)); + } + return; + } case Intrinsic::dbg_addr: case Intrinsic::dbg_declare: { const auto &DI = cast<DbgVariableIntrinsic>(I); @@ -7119,7 +7158,9 @@ .setChain(getRoot()) .setCallee(RetTy, FTy, Callee, std::move(Args), CB) .setTailCall(isTailCall) - .setConvergent(CB.isConvergent()); + .setConvergent(CB.isConvergent()) + .setIsPreallocated( + CB.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0); std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB); if (Result.first.getNode()) { @@ -7645,9 +7686,9 @@ // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't // have to do anything here to lower funclet bundles. // CFGuardTarget bundles are lowered in LowerCallTo.
- assert(!I.hasOperandBundlesOtherThan({LLVMContext::OB_deopt, - LLVMContext::OB_funclet, - LLVMContext::OB_cfguardtarget}) && + assert(!I.hasOperandBundlesOtherThan( + {LLVMContext::OB_deopt, LLVMContext::OB_funclet, + LLVMContext::OB_cfguardtarget, LLVMContext::OB_preallocated}) && "Cannot lower calls with arbitrary operand bundles!"); SDValue Callee = getValue(I.getCalledOperand()); @@ -8608,7 +8649,9 @@ .setChain(getRoot()) .setCallee(Call->getCallingConv(), ReturnTy, Callee, std::move(Args)) .setDiscardResult(Call->use_empty()) - .setIsPatchPoint(IsPatchPoint); + .setIsPatchPoint(IsPatchPoint) + .setIsPreallocated( + Call->countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0); } /// Add a stack map intrinsic call's live variable operands to a stackmap @@ -9128,6 +9171,15 @@ Flags.setCFGuardTarget(); if (Args[i].IsByVal) Flags.setByVal(); + if (Args[i].IsPreallocated) { + Flags.setPreallocated(); + // Set the byval flag for CCAssignFn callbacks that don't know about + // preallocated. This way we can know how many bytes we should've + // allocated and how many bytes a callee cleanup function will pop. If + // we port preallocated to more targets, we'll have to add custom + // preallocated handling in the various CC lowering callbacks. + Flags.setByVal(); + } if (Args[i].IsInAlloca) { Flags.setInAlloca(); // Set the byval flag for CCAssignFn callbacks that don't know about @@ -9137,7 +9189,7 @@ // in the various CC lowering callbacks. Flags.setByVal(); } - if (Args[i].IsByVal || Args[i].IsInAlloca) { + if (Args[i].IsByVal || Args[i].IsInAlloca || Args[i].IsPreallocated) { PointerType *Ty = cast(Args[i].Ty); Type *ElementTy = Ty->getElementType(); @@ -9451,7 +9503,7 @@ // initializes the alloca. Don't elide copies from the same argument twice. const Value *Val = SI->getValueOperand()->stripPointerCasts(); const auto *Arg = dyn_cast(Val); - if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() || + if (!Arg || Arg->hasPassPointeeByValueAttr() || Arg->getType()->isEmptyTy() || DL.getTypeStoreSize(Arg->getType()) != DL.getTypeAllocSize(AI->getAllocatedType()) || @@ -9638,12 +9690,21 @@ // in the various CC lowering callbacks. Flags.setByVal(); } + if (Arg.hasAttribute(Attribute::Preallocated)) { + Flags.setPreallocated(); + // Set the byval flag for CCAssignFn callbacks that don't know about + // preallocated. This way we can know how many bytes we should've + // allocated and how many bytes a callee cleanup function will pop. If + // we port preallocated to more targets, we'll have to add custom + // preallocated handling in the various CC lowering callbacks. + Flags.setByVal(); + } if (F.getCallingConv() == CallingConv::X86_INTR) { // IA Interrupt passes frame (1st parameter) by value in the stack. if (ArgNo == 0) Flags.setByVal(); } - if (Flags.isByVal() || Flags.isInAlloca()) { + if (Flags.isByVal() || Flags.isInAlloca() || Flags.isPreallocated()) { Type *ElementTy = Arg.getParamByValType(); // For ByVal, size and alignment should be passed from FE. 
BE will diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -393,6 +393,10 @@ case ISD::GC_TRANSITION_END: return "gc_transition.end"; case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset"; case ISD::FREEZE: return "freeze"; + case ISD::PREALLOCATED_SETUP: + return "call_setup"; + case ISD::PREALLOCATED_ARG: + return "call_alloc"; // Bit manipulation case ISD::ABS: return "abs"; diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -110,14 +110,18 @@ IsSRet = Call->paramHasAttr(ArgIdx, Attribute::StructRet); IsNest = Call->paramHasAttr(ArgIdx, Attribute::Nest); IsByVal = Call->paramHasAttr(ArgIdx, Attribute::ByVal); + IsPreallocated = Call->paramHasAttr(ArgIdx, Attribute::Preallocated); IsInAlloca = Call->paramHasAttr(ArgIdx, Attribute::InAlloca); IsReturned = Call->paramHasAttr(ArgIdx, Attribute::Returned); IsSwiftSelf = Call->paramHasAttr(ArgIdx, Attribute::SwiftSelf); IsSwiftError = Call->paramHasAttr(ArgIdx, Attribute::SwiftError); Alignment = Call->getParamAlign(ArgIdx); ByValType = nullptr; - if (Call->paramHasAttr(ArgIdx, Attribute::ByVal)) + if (IsByVal) ByValType = Call->getParamByValType(ArgIdx); + PreallocatedType = nullptr; + if (IsPreallocated) + PreallocatedType = Call->getParamPreallocatedType(ArgIdx); } /// Generate a libcall taking the given operands as arguments and returning a diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -1433,6 +1433,10 @@ return getAttributes(Index+FirstArgIndex).getByValType(); } +Type *AttributeList::getParamPreallocatedType(unsigned Index) const { + return getAttributes(Index + FirstArgIndex).getPreallocatedType(); +} + MaybeAlign AttributeList::getStackAlignment(unsigned Index) const { return getAttributes(Index).getStackAlignment(); } diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -114,6 +114,12 @@ return hasAttribute(Attribute::InAlloca); } +bool Argument::hasPreallocatedAttr() const { + if (!getType()->isPointerTy()) + return false; + return hasAttribute(Attribute::Preallocated); +} + bool Argument::hasPassPointeeByValueAttr() const { if (!getType()->isPointerTy()) return false; AttributeList Attrs = getParent()->getAttributes(); diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -789,8 +789,9 @@ /// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP /// values are spilled on the stack. def CC_X86_32_Common : CallingConv<[ - // Handles byval parameters. + // Handles byval/preallocated parameters. CCIfByVal<CCPassByVal<4, 4>>, + CCIfPreallocated<CCPassByVal<4, 4>>, // The first 3 float or double arguments, if marked 'inreg' and if the call // is not a vararg call and if SSE2 is available, are passed in SSE registers.
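For context, the calling-convention and instruction-selection changes in this area consume IR of roughly the following shape (a minimal sketch based on the llvm.call.preallocated.setup/arg description in the LangRef; @sketch and @use_foo are placeholder names, and the tests added later in this patch exercise the same pattern):

declare token @llvm.call.preallocated.setup(i32)
declare i8* @llvm.call.preallocated.arg(token, i32)

%Foo = type { i32, i32 }
declare void @use_foo(%Foo* preallocated(%Foo))

define void @sketch() {
  ; Reserve argument memory for a call site with one preallocated argument.
  %t = call token @llvm.call.preallocated.setup(i32 1)
  ; Get a pointer to argument 0 of that call site; CC_X86_32_Common assigns it
  ; a stack slot via CCPassByVal, and ISel selects PREALLOCATED_SETUP/ARG nodes.
  %p = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo)
  %f = bitcast i8* %p to %Foo*
  ; The call consumes the setup token through the "preallocated" operand bundle.
  call void @use_foo(%Foo* preallocated(%Foo) %f) ["preallocated"(token %t)]
  ret void
}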
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3245,7 +3245,7 @@ return false; for (auto Flag : CLI.OutFlags) - if (Flag.isSwiftError()) + if (Flag.isSwiftError() || Flag.isPreallocated()) return false; SmallVector<MVT, 16> OutVTs; diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -57,7 +57,8 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { return !MF.getFrameInfo().hasVarSizedObjects() && - !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences(); + !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences() && + !MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall(); } /// canSimplifyCallFramePseudos - If there is a reserved call frame, the @@ -67,6 +68,7 @@ bool X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const { return hasReservedCallFrame(MF) || + MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() || (hasFP(MF) && !TRI->needsStackRealignment(MF)) || TRI->hasBasePointer(MF); } @@ -90,10 +92,10 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || - TRI->needsStackRealignment(MF) || - MFI.hasVarSizedObjects() || + TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() || MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() || + MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() || MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() || MFI.hasStackMap() || MFI.hasPatchPoint() || MFI.hasCopyImplyingStackAdjustment()); diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -5625,6 +5625,39 @@ CurDAG->RemoveDeadNode(Node); return; } + case ISD::PREALLOCATED_SETUP: { + auto MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); + auto CallId = MFI->PreallocatedIdForCallSite( + cast<SrcValueSDNode>(Node->getOperand(1))->getValue()); + SDValue Chain = Node->getOperand(0); + SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); + MachineSDNode *New = CurDAG->getMachineNode( + TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain); + ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain + CurDAG->RemoveDeadNode(Node); + return; + } + case ISD::PREALLOCATED_ARG: { + auto MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>(); + auto CallId = MFI->PreallocatedIdForCallSite( + cast<SrcValueSDNode>(Node->getOperand(1))->getValue()); + SDValue Chain = Node->getOperand(0); + SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); + SDValue ArgIndex = Node->getOperand(2); + SDValue Ops[3]; + Ops[0] = CallIdValue; + Ops[1] = ArgIndex; + Ops[2] = Chain; + MachineSDNode *New = CurDAG->getMachineNode( + TargetOpcode::PREALLOCATED_ARG, dl, + CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()), + MVT::Other), + Ops); + ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer + ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain + CurDAG->RemoveDeadNode(Node); + return; + } } SelectCode(Node); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3882,6 +3882,22 @@ if (ArgLocs.back().getLocMemOffset() != 0)
report_fatal_error("any parameter with the inalloca attribute must be " "the only memory argument"); + } else if (CLI.IsPreallocated) { + if (!ArgLocs.back().isMemLoc()) { + report_fatal_error("cannot use preallocated attribute on a register " + "parameter"); + } + SmallVector<size_t, 4> PreallocatedOffsets; + for (size_t i = 0; i < CLI.OutVals.size(); ++i) { + if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) { + PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset()); + } + } + auto MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>(); + size_t PreallocatedId = MFI->PreallocatedIdForCallSite(CLI.CB); + MFI->SetPreallocatedStackSize(PreallocatedId, NumBytes); + MFI->SetPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets); + NumBytesToPush = 0; } if (!IsSibcall && !IsMustTail) @@ -3909,9 +3925,9 @@ for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; ++I, ++OutIndex) { assert(OutIndex < Outs.size() && "Invalid Out index"); - // Skip inalloca arguments, they have already been written. + // Skip inalloca/preallocated arguments, they have already been written. ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; - if (Flags.isInAlloca()) + if (Flags.isInAlloca() || Flags.isPreallocated()) continue; CCValAssign &VA = ArgLocs[I]; @@ -4099,8 +4115,8 @@ assert(VA.isMemLoc()); SDValue Arg = OutVals[OutsIndex]; ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; - // Skip inalloca arguments. They don't require any work. - if (Flags.isInAlloca()) + // Skip inalloca/preallocated arguments. They don't require any work. + if (Flags.isInAlloca() || Flags.isPreallocated()) continue; // Create frame index. int32_t Offset = VA.getLocMemOffset()+FPDiff; @@ -33037,6 +33053,36 @@ BB->addLiveIn(BasePtr); return BB; } + case TargetOpcode::PREALLOCATED_SETUP: { + assert(Subtarget.is32Bit() && "preallocated only used in 32-bit"); + auto MFI = MF->getInfo<X86MachineFunctionInfo>(); + MFI->setHasPreallocatedCall(true); + int64_t PreallocatedId = MI.getOperand(0).getImm(); + size_t StackAdjustment = MFI->GetPreallocatedStackSize(PreallocatedId); + assert(StackAdjustment != 0 && "0 stack adjustment"); + LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment " + << StackAdjustment << "\n"); + BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP) + .addReg(X86::ESP) + .addImm(StackAdjustment); + MI.eraseFromParent(); + return BB; + } + case TargetOpcode::PREALLOCATED_ARG: { + assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit"); + int64_t PreallocatedId = MI.getOperand(1).getImm(); + int64_t ArgIdx = MI.getOperand(2).getImm(); + auto MFI = MF->getInfo<X86MachineFunctionInfo>(); + size_t ArgOffset = MFI->GetPreallocatedArgOffsets(PreallocatedId)[ArgIdx]; + LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx + << ", arg offset " << ArgOffset << "\n"); + // stack pointer + offset + addRegOffset( + BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()), + X86::ESP, false, ArgOffset); + MI.eraseFromParent(); + return BB; + } } } diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h --- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h +++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h @@ -103,6 +103,14 @@ /// True if this function has WIN_ALLOCA instructions. bool HasWinAlloca = false; + /// True if this function has any preallocated calls.
+ bool HasPreallocatedCall = false; + + size_t PreallocatedNextId = 0; + ValueMap<const Value *, size_t> PreallocatedIds; + DenseMap<size_t, size_t> PreallocatedStackSizes; + DenseMap<size_t, SmallVector<size_t, 4>> PreallocatedArgOffsets; + private: /// ForwardedMustTailRegParms - A list of virtual and physical registers /// that must be forwarded to every musttail call. @@ -184,6 +192,40 @@ bool hasWinAlloca() const { return HasWinAlloca; } void setHasWinAlloca(bool v) { HasWinAlloca = v; } + + bool hasPreallocatedCall() const { return HasPreallocatedCall; } + void setHasPreallocatedCall(bool v) { HasPreallocatedCall = v; } + + size_t PreallocatedIdForCallSite(const Value *CS) { + auto Id = PreallocatedIds.find(CS); + if (Id == PreallocatedIds.end()) { + size_t NewId = PreallocatedNextId++; + PreallocatedIds.insert({CS, NewId}); + return NewId; + } else { + return Id->second; + } + } + + void SetPreallocatedStackSize(size_t Id, size_t StackSize) { + PreallocatedStackSizes[Id] = StackSize; + } + + size_t GetPreallocatedStackSize(const size_t Id) { + assert(PreallocatedStackSizes.find(Id) != PreallocatedStackSizes.end() && + "stack size not set"); + return PreallocatedStackSizes[Id]; + } + + void SetPreallocatedArgOffsets(size_t Id, SmallVector<size_t, 4> AO) { + PreallocatedArgOffsets[Id] = AO; + } + + const SmallVector<size_t, 4> &GetPreallocatedArgOffsets(const size_t Id) { + assert(PreallocatedArgOffsets.find(Id) != PreallocatedArgOffsets.end() && + "arg offsets not set"); + return PreallocatedArgOffsets[Id]; + } }; } // End llvm namespace diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -627,18 +627,22 @@ } bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { - const MachineFrameInfo &MFI = MF.getFrameInfo(); - - if (!EnableBasePointer) - return false; - - // When we need stack realignment, we can't address the stack from the frame - // pointer. When we have dynamic allocas or stack-adjusting inline asm, we - // can't address variables from the stack pointer. MS inline asm can - // reference locals while also adjusting the stack pointer. When we can't - // use both the SP and the FP, we need a separate base pointer register. - bool CantUseFP = needsStackRealignment(MF); - return CantUseFP && CantUseSP(MFI); + const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); + if (X86FI->hasPreallocatedCall()) + return true; + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + + if (!EnableBasePointer) + return false; + + // When we need stack realignment, we can't address the stack from the frame + // pointer. When we have dynamic allocas or stack-adjusting inline asm, we + // can't address variables from the stack pointer. MS inline asm can + // reference locals while also adjusting the stack pointer. When we can't + // use both the SP and the FP, we need a separate base pointer register. + bool CantUseFP = needsStackRealignment(MF); + return CantUseFP && CantUseSP(MFI); } bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -1014,9 +1014,9 @@ // CI should not have any ABI-impacting function attributes.
static const Attribute::AttrKind ABIAttrs[] = { - Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca, - Attribute::InReg, Attribute::Returned, Attribute::SwiftSelf, - Attribute::SwiftError}; + Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca, + Attribute::Preallocated, Attribute::InReg, Attribute::Returned, + Attribute::SwiftSelf, Attribute::SwiftError}; AttributeList Attrs = CI.getAttributes(); for (auto AK : ABIAttrs) if (Attrs.hasParamAttribute(0, AK)) diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -1371,7 +1371,8 @@ AttributeList FnAttributeList = Fn->getAttributes(); if (FnAttributeList.hasAttrSomewhere(Attribute::Nest) || FnAttributeList.hasAttrSomewhere(Attribute::StructRet) || - FnAttributeList.hasAttrSomewhere(Attribute::InAlloca)) { + FnAttributeList.hasAttrSomewhere(Attribute::InAlloca) || + FnAttributeList.hasAttrSomewhere(Attribute::Preallocated)) { LLVM_DEBUG( dbgs() << "[Attributor] Cannot rewrite due to complex attribute\n"); return false; diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -4355,7 +4355,8 @@ AAValueSimplifyImpl::initialize(A); if (!getAnchorScope() || getAnchorScope()->isDeclaration()) indicatePessimisticFixpoint(); - if (hasAttr({Attribute::InAlloca, Attribute::StructRet, Attribute::Nest}, + if (hasAttr({Attribute::InAlloca, Attribute::Preallocated, + Attribute::StructRet, Attribute::Nest}, /* IgnoreSubsumingPositions */ true)) indicatePessimisticFixpoint(); @@ -5582,7 +5583,7 @@ // TODO: From readattrs.ll: "inalloca parameters are always // considered written" - if (hasAttr({Attribute::InAlloca})) { + if (hasAttr({Attribute::InAlloca, Attribute::Preallocated})) { removeKnownBits(NO_WRITES); removeAssumedBits(NO_WRITES); } diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -483,9 +483,10 @@ // We consider arguments of non-internal functions to be intrinsically alive as // well as arguments to functions which have their "address taken". void DeadArgumentEliminationPass::SurveyFunction(const Function &F) { - // Functions with inalloca parameters are expecting args in a particular - // register and memory layout. - if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca)) { + // Functions with inalloca/preallocated parameters are expecting args in a + // particular register and memory layout. + if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca) || + F.getAttributes().hasAttrSomewhere(Attribute::Preallocated)) { MarkLive(F); return; } diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -447,7 +447,7 @@ SmallPtrSet Visited; // inalloca arguments are always clobbered by the call. 
- if (A->hasInAllocaAttr()) + if (A->hasInAllocaAttr() || A->hasPreallocatedAttr()) return Attribute::None; bool IsRead = false; diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -2333,6 +2333,7 @@ // wouldn't be safe in the presence of inalloca. // FIXME: We should also hoist alloca affected by this to the entry // block if possible. + // FIXME: handle preallocated if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) && !F->hasAddressTaken()) { RemoveAttribute(F, Attribute::InAlloca); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4739,6 +4739,7 @@ // // Similarly, avoid folding away bitcasts of byval calls. if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) || + Callee->getAttributes().hasAttrSomewhere(Attribute::Preallocated) || Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal)) return false; diff --git a/llvm/test/CodeGen/X86/arg-copy-elide.ll b/llvm/test/CodeGen/X86/arg-copy-elide.ll --- a/llvm/test/CodeGen/X86/arg-copy-elide.ll +++ b/llvm/test/CodeGen/X86/arg-copy-elide.ll @@ -253,6 +253,20 @@ ; CHECK: calll _addrof_i32 ; CHECK: retl +define void @avoid_preallocated(i32* preallocated(i32) %x) { +entry: + %x.p.p = alloca i32* + store i32* %x, i32** %x.p.p + call void @addrof_i32(i32* %x) + ret void +} + +; CHECK-LABEL: _avoid_preallocated: +; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]] +; CHECK: pushl %[[reg]] +; CHECK: calll _addrof_i32 +; CHECK: retl + ; Don't elide the copy when the alloca is escaped with a store. define void @escape_with_store(i32 %x) { %x1 = alloca i32 diff --git a/llvm/test/CodeGen/X86/musttail-indirect.ll b/llvm/test/CodeGen/X86/musttail-indirect.ll --- a/llvm/test/CodeGen/X86/musttail-indirect.ll +++ b/llvm/test/CodeGen/X86/musttail-indirect.ll @@ -22,6 +22,9 @@ ; Each member pointer creates a thunk. The ones with inalloca are required to ; tail calls by the ABI, even at O0. +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + %struct.B = type { i32 (...)** } %struct.A = type { i32 } @@ -52,6 +55,25 @@ ret i32 %3 } +; FIXME: This generates a lot of code even at -O2, any better way to do this? Same with all the preallocated versions of functions below. 
+; CHECK-LABEL: g_thunk_2: +; CHECK: jmpl +; CHECK-NOT: ret +define x86_thiscallcc i32 @g_thunk_2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A, i32, %struct.A }>) %0) { +entry: + %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** + %vtable = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)**, i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1 + %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*, i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 1 + %2 = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*, i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn + %tmp = load <{ %struct.A, i32, %struct.A }>, <{ %struct.A, i32, %struct.A }>* %0 + %c = call token @llvm.call.preallocated.setup(i32 1) + %A = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(<{ %struct.A, i32, %struct.A }>) + %a = bitcast i8* %A to <{ %struct.A, i32, %struct.A }>* + store <{ %struct.A, i32, %struct.A }> %tmp, <{ %struct.A, i32, %struct.A }>* %a + %3 = musttail call x86_thiscallcc i32 %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A, i32, %struct.A }>) %a) ["preallocated"(token %c)] + ret i32 %3 +} + ; CHECK-LABEL: h_thunk: ; CHECK: jmpl ; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}} @@ -66,6 +88,22 @@ ret void } +; CHECK-LABEL: h_thunk_2: +; CHECK: jmpl +; CHECK-NOT: ret +define x86_thiscallcc void @h_thunk_2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A, i32, %struct.A }>)) { +entry: + %1 = bitcast %struct.B* %this to void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** + %vtable = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)**, void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1 + %vfn = getelementptr inbounds void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*, void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 2 + %2 = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*, void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn + %c = call token @llvm.call.preallocated.setup(i32 1) + %A = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(<{ %struct.A, i32, %struct.A }>) + %a = bitcast i8* %A to <{ %struct.A, i32, %struct.A }>* + musttail call x86_thiscallcc void %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A, i32, %struct.A }>) %a) ["preallocated"(token %c)] + ret void +} + ; CHECK-LABEL: i_thunk: ; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}} ; CHECK: jmpl @@ -80,6 +118,22 @@ ret %struct.A* %3 } +; CHECK-LABEL: i_thunk_2: +; CHECK: jmpl +; CHECK-NOT: ret +define x86_thiscallcc %struct.A* @i_thunk_2(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A*, %struct.A, i32, %struct.A }>)) { +entry: + %1 = bitcast %struct.B* %this to %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*** + %vtable = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)**, %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*** %1 + %vfn = getelementptr inbounds %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*, %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vtable, i32 3 + %2 = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*, %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vfn + %c = call token @llvm.call.preallocated.setup(i32 1) + %A = call i8* 
@llvm.call.preallocated.arg(token %c, i32 0) preallocated(<{ %struct.A, i32, %struct.A }>) + %a = bitcast i8* %A to <{ %struct.A*, %struct.A, i32, %struct.A }>* + %3 = musttail call x86_thiscallcc %struct.A* %2(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* preallocated(<{ %struct.A*, %struct.A, i32, %struct.A }>) %a) ["preallocated"(token %c)] + ret %struct.A* %3 +} + ; CHECK-LABEL: j_thunk: ; CHECK: jmpl ; CHECK-NOT: ret @@ -109,6 +163,24 @@ ret i32 %3 } +; CHECK-LABEL: _stdcall_thunk_2@8: +; CHECK: jmpl +; CHECK-NOT: ret +define x86_stdcallcc i32 @stdcall_thunk_2(<{ %struct.B*, %struct.A }>* preallocated(<{ %struct.B*, %struct.A }>)) { +entry: + %this_ptr = getelementptr inbounds <{ %struct.B*, %struct.A }>, <{ %struct.B*, %struct.A }>* %0, i32 0, i32 0 + %this = load %struct.B*, %struct.B** %this_ptr + %1 = bitcast %struct.B* %this to i32 (<{ %struct.B*, %struct.A }>*)*** + %vtable = load i32 (<{ %struct.B*, %struct.A }>*)**, i32 (<{ %struct.B*, %struct.A }>*)*** %1 + %vfn = getelementptr inbounds i32 (<{ %struct.B*, %struct.A }>*)*, i32 (<{ %struct.B*, %struct.A }>*)** %vtable, i32 1 + %2 = load i32 (<{ %struct.B*, %struct.A }>*)*, i32 (<{ %struct.B*, %struct.A }>*)** %vfn + %c = call token @llvm.call.preallocated.setup(i32 1) + %A = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(<{ %struct.B*, %struct.A }>) + %a = bitcast i8* %A to <{ %struct.B*, %struct.A }>* + %3 = musttail call x86_stdcallcc i32 %2(<{ %struct.B*, %struct.A }>* preallocated(<{ %struct.B*, %struct.A }>) %a) ["preallocated"(token %c)] + ret i32 %3 +} + ; CHECK-LABEL: @fastcall_thunk@8: ; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}} ; CHECK: jmpl @@ -122,3 +194,19 @@ %3 = musttail call x86_fastcallcc i32 %2(%struct.B* inreg %this, <{ %struct.A }>* inalloca %0) ret i32 %3 } + +; CHECK-LABEL: @fastcall_thunk_2@8: +; CHECK: jmpl +; CHECK-NOT: ret +define x86_fastcallcc i32 @fastcall_thunk_2(%struct.B* inreg %this, <{ %struct.A }>* preallocated(<{%struct.A}>)) { +entry: + %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A }>*)*** + %vtable = load i32 (%struct.B*, <{ %struct.A }>*)**, i32 (%struct.B*, <{ %struct.A }>*)*** %1 + %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A }>*)*, i32 (%struct.B*, <{ %struct.A }>*)** %vtable, i32 1 + %2 = load i32 (%struct.B*, <{ %struct.A }>*)*, i32 (%struct.B*, <{ %struct.A }>*)** %vfn + %c = call token @llvm.call.preallocated.setup(i32 1) + %A = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(<{ %struct.A }>) + %a = bitcast i8* %A to <{ %struct.A }>* + %3 = musttail call x86_fastcallcc i32 %2(%struct.B* inreg %this, <{ %struct.A }>* preallocated(<{ %struct.A }>) %a) ["preallocated"(token %c)] + ret i32 %3 +} diff --git a/llvm/test/CodeGen/X86/musttail-thiscall.ll b/llvm/test/CodeGen/X86/musttail-thiscall.ll --- a/llvm/test/CodeGen/X86/musttail-thiscall.ll +++ b/llvm/test/CodeGen/X86/musttail-thiscall.ll @@ -1,6 +1,9 @@ ; RUN: llc -verify-machineinstrs -mtriple=i686-- < %s | FileCheck %s ; RUN: llc -verify-machineinstrs -mtriple=i686-- -O0 < %s | FileCheck %s +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + ; CHECK-LABEL: t1: ; CHECK: jmp {{_?}}t1_callee define x86_thiscallcc void @t1(i8* %this) { @@ -29,3 +32,19 @@ ret i8* %rv } declare x86_thiscallcc i8* @t3_callee(i8* %this, <{ i8*, i32 }>* inalloca %args); + +; CHECK-LABEL: t4: +; CHECK: jmp {{_?}}t4_callee +define x86_thiscallcc i8* @t4(i8* %this, <{ i8*, i32 }>* preallocated(<{i8*, i32}>) %args) { + %adj 
= getelementptr i8, i8* %this, i32 4 + %a_ptr = getelementptr <{ i8*, i32 }>, <{ i8*, i32 }>* %args, i32 0, i32 1 + store i32 0, i32* %a_ptr + %c = call token @llvm.call.preallocated.setup(i32 1) + %A = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(<{i8*, i32}>) + %a = bitcast i8* %A to <{ i8*, i32 }>* + %tmp = load <{ i8*, i32 }>, <{ i8*, i32 }>* %args + store <{ i8*, i32 }> %tmp, <{ i8*, i32 }>* %a + %rv = musttail call x86_thiscallcc i8* @t4_callee(i8* %adj, <{ i8*, i32 }>* preallocated(<{ i8*, i32 }>) %a) ["preallocated"(token %c)] + ret i8* %rv +} +declare x86_thiscallcc i8* @t4_callee(i8* %this, <{ i8*, i32 }>* preallocated(<{i8*, i32}>) %args); diff --git a/llvm/test/CodeGen/X86/preallocated-nocall.ll b/llvm/test/CodeGen/X86/preallocated-nocall.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/preallocated-nocall.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s +; XFAIL: * + +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + +%Foo = type { i32, i32 } + +declare void @init(%Foo*) + + + +declare void @foo_p(%Foo* preallocated(%Foo)) + +define void @no_call() { +; CHECK-LABEL: _no_call: + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* + call void @init(%Foo* %b) + ret void +} diff --git a/llvm/test/CodeGen/X86/preallocated-x64.ll b/llvm/test/CodeGen/X86/preallocated-x64.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/preallocated-x64.ll @@ -0,0 +1,18 @@ +; RUN: not --crash llc %s -mtriple=x86_64-windows-msvc -o /dev/null 2>&1 | FileCheck %s + +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + +%Foo = type { i32, i32 } + +declare x86_thiscallcc void @f(i32, %Foo* preallocated(%Foo)) + +define void @g() { + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* + call void @f(i32 0, %Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} + +; CHECK: cannot use preallocated attribute on a register parameter diff --git a/llvm/test/CodeGen/X86/preallocated.ll b/llvm/test/CodeGen/X86/preallocated.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/preallocated.ll @@ -0,0 +1,187 @@ +; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s + +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + +%Foo = type { i32, i32 } + +declare void @init(%Foo*) + + + +declare void @foo_p(%Foo* preallocated(%Foo)) + +define void @one_preallocated() { +; CHECK-LABEL: _one_preallocated: + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: subl $8, %esp +; CHECK: calll _foo_p + call void @foo_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} + +define void @one_preallocated_two_blocks() { +; CHECK-LABEL: _one_preallocated_two_blocks: + %t = call token @llvm.call.preallocated.setup(i32 1) + br label %second +second: + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: subl $8, %esp +; CHECK: calll _foo_p + call void @foo_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} + +define void 
@preallocated_with_store() { +; CHECK-LABEL: _preallocated_with_store: +; CHECK: subl $8, %esp + %t = call token @llvm.call.preallocated.setup(i32 1) +; CHECK: leal (%esp), [[REGISTER:%[a-z]+]] + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* + %p0 = getelementptr %Foo, %Foo* %b, i32 0, i32 0 + %p1 = getelementptr %Foo, %Foo* %b, i32 0, i32 1 + store i32 13, i32* %p0 + store i32 42, i32* %p1 +; CHECK-DAG: movl $13, ([[REGISTER]]) +; CHECK-DAG: movl $42, 4([[REGISTER]]) +; CHECK-NOT: subl {{\$[0-9]+}}, %esp +; CHECK-NOT: pushl +; CHECK: calll _foo_p + call void @foo_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} + +define void @preallocated_with_init() { +; CHECK-LABEL: _preallocated_with_init: +; CHECK: subl $8, %esp + %t = call token @llvm.call.preallocated.setup(i32 1) +; CHECK: leal (%esp), [[REGISTER:%[a-z]+]] + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: pushl [[REGISTER]] +; CHECK: calll _init + call void @init(%Foo* %b) +; CHECK-NOT: subl {{\$[0-9]+}}, %esp +; CHECK-NOT: pushl +; CHECK: calll _foo_p + call void @foo_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} + +declare void @foo_p_p(%Foo* preallocated(%Foo), %Foo* preallocated(%Foo)) + +define void @two_preallocated() { +; CHECK-LABEL: _two_preallocated: + %t = call token @llvm.call.preallocated.setup(i32 2) + %a1 = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b1 = bitcast i8* %a1 to %Foo* + %a2 = call i8* @llvm.call.preallocated.arg(token %t, i32 1) preallocated(%Foo) + %b2 = bitcast i8* %a2 to %Foo* +; CHECK: subl $16, %esp +; CHECK: calll _foo_p_p + call void @foo_p_p(%Foo* preallocated(%Foo) %b1, %Foo* preallocated(%Foo) %b2) ["preallocated"(token %t)] + ret void +} + +declare void @foo_p_int(%Foo* preallocated(%Foo), i32) + +define void @one_preallocated_one_normal() { +; CHECK-LABEL: _one_preallocated_one_normal: +; CHECK: subl $12, %esp + %t = call token @llvm.call.preallocated.setup(i32 1) +; CHECK: leal (%esp), [[REGISTER:%[a-z]+]] + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: pushl [[REGISTER]] +; CHECK: calll _init + call void @init(%Foo* %b) +; CHECK-NOT: subl {{\$[0-9]+}}, %esp +; CHECK-NOT: pushl +; CHECK: movl $2, 8(%esp) +; CHECK: calll _foo_p_int + call void @foo_p_int(%Foo* preallocated(%Foo) %b, i32 2) ["preallocated"(token %t)] + ret void +} + +declare void @foo_ret_p(%Foo* sret, %Foo* preallocated(%Foo)) + +define void @nested_with_init() { +; CHECK-LABEL: _nested_with_init: + %tmp = alloca %Foo + + %t1 = call token @llvm.call.preallocated.setup(i32 1) +; CHECK: subl $12, %esp + %a1 = call i8* @llvm.call.preallocated.arg(token %t1, i32 0) preallocated(%Foo) + %b1 = bitcast i8* %a1 to %Foo* +; CHECK: leal 4(%esp), [[REGISTER1:%[a-z]+]] + + %t2 = call token @llvm.call.preallocated.setup(i32 1) +; CHECK: subl $12, %esp + %a2 = call i8* @llvm.call.preallocated.arg(token %t2, i32 0) preallocated(%Foo) +; CHECK: leal 4(%esp), [[REGISTER2:%[a-z]+]] + %b2 = bitcast i8* %a2 to %Foo* + + call void @init(%Foo* %b2) +; CHECK: pushl [[REGISTER2]] +; CHECK: calll _init + + call void @foo_ret_p(%Foo* %b1, %Foo* preallocated(%Foo) %b2) ["preallocated"(token %t2)] +; CHECK-NOT: subl {{\$[0-9]+}}, %esp +; CHECK-NOT: pushl +; CHECK: calll _foo_ret_p + call void @foo_ret_p(%Foo* %tmp, %Foo* preallocated(%Foo) %b1) ["preallocated"(token 
%t1)] +; CHECK-NOT: subl {{\$[0-9]+}}, %esp +; CHECK-NOT: pushl +; CHECK: calll _foo_ret_p + ret void +} + +declare void @foo_inreg_p(i32 inreg, %Foo* preallocated(%Foo)) + +define void @inreg() { +; CHECK-LABEL: _inreg: + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: subl $8, %esp +; CHECK: movl $9, %eax +; CHECK: calll _foo_inreg_p + call void @foo_inreg_p(i32 9, %Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} + +declare x86_thiscallcc void @foo_thiscall_p(i8*, %Foo* preallocated(%Foo)) + +define void @thiscall() { +; CHECK-LABEL: _thiscall: + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: subl $8, %esp +; CHECK: xorl %ecx, %ecx +; CHECK: calll _foo_thiscall_p + call x86_thiscallcc void @foo_thiscall_p(i8* null, %Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] + ret void +} + +declare x86_stdcallcc void @foo_stdcall_p(%Foo* preallocated(%Foo)) +declare x86_stdcallcc void @i(i32) + +define void @stdcall() { +; CHECK-LABEL: _stdcall: + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo) + %b = bitcast i8* %a to %Foo* +; CHECK: subl $8, %esp +; CHECK: calll _foo_stdcall_p@8 + call x86_stdcallcc void @foo_stdcall_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)] +; CHECK-NOT: %esp +; CHECK: pushl +; CHECK: calll _i@4 + call x86_stdcallcc void @i(i32 0) + ret void +} diff --git a/llvm/test/CodeGen/X86/shrink-wrap-chkstk.ll b/llvm/test/CodeGen/X86/shrink-wrap-chkstk.ll --- a/llvm/test/CodeGen/X86/shrink-wrap-chkstk.ll +++ b/llvm/test/CodeGen/X86/shrink-wrap-chkstk.ll @@ -1,5 +1,8 @@ ; RUN: llc < %s -enable-shrink-wrap=true | FileCheck %s +; TODO: add preallocated versions of tests +; we don't yet support conditionally called preallocated calls after the setup + ; chkstk cannot come before the usual prologue, since it adjusts ESP. ; If chkstk is used in the prologue, we also have to be careful about preserving ; EAX if it is used. 
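The TODO added to shrink-wrap-chkstk.ll above refers to shapes like the following hypothetical sketch (placeholder names @conditional_use and @use_i32), where the call consuming the setup token is only conditionally reached after the stack has already been adjusted; that pattern is not supported yet, which is why no preallocated variants of those tests are added here:

declare token @llvm.call.preallocated.setup(i32)
declare i8* @llvm.call.preallocated.arg(token, i32)
declare void @use_i32(i32* preallocated(i32))

define void @conditional_use(i1 %cond) {
entry:
  ; The stack adjustment for the call site is emitted here...
  %t = call token @llvm.call.preallocated.setup(i32 1)
  br i1 %cond, label %do_call, label %exit

do_call:
  ; ...but the call that consumes the token is only reached on this path.
  %p = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(i32)
  %arg = bitcast i8* %p to i32*
  store i32 0, i32* %arg
  call void @use_i32(i32* preallocated(i32) %arg) ["preallocated"(token %t)]
  br label %exit

exit:
  ret void
}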
diff --git a/llvm/test/CodeGen/X86/tail-call-mutable-memarg.ll b/llvm/test/CodeGen/X86/tail-call-mutable-memarg.ll --- a/llvm/test/CodeGen/X86/tail-call-mutable-memarg.ll +++ b/llvm/test/CodeGen/X86/tail-call-mutable-memarg.ll @@ -9,6 +9,21 @@ declare x86_stdcallcc void @tail_std(i32) declare void @capture(i32*) +define x86_thiscallcc void @preallocated(i32* %this, i32* preallocated(i32) %args) { +entry: + %val = load i32, i32* %args + store i32 0, i32* %args + tail call x86_stdcallcc void @tail_std(i32 %val) + ret void +} + +; CHECK-LABEL: _preallocated: # @preallocated +; CHECK: movl 4(%esp), %[[reg:[^ ]*]] +; CHECK: movl $0, 4(%esp) +; CHECK: pushl %[[reg]] +; CHECK: calll _tail_std@4 +; CHECK: retl $4 + define x86_thiscallcc void @inalloca(i32* %this, i32* inalloca %args) { entry: %val = load i32, i32* %args diff --git a/llvm/test/Transforms/Attributor/value-simplify.ll b/llvm/test/Transforms/Attributor/value-simplify.ll --- a/llvm/test/Transforms/Attributor/value-simplify.ll +++ b/llvm/test/Transforms/Attributor/value-simplify.ll @@ -6,6 +6,8 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" declare void @f(i32) +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) ; Test1: Replace argument with constant define internal void @test1(i32 %a) { @@ -284,6 +286,38 @@ ret i32* %call } +define internal i32* @test_preallocated(i32* preallocated(i32) %a) { +; IS__TUNIT____-LABEL: define {{[^@]+}}@test_preallocated +; IS__TUNIT____-SAME: (i32* noalias nofree returned writeonly preallocated(i32) align 536870912 "no-capture-maybe-returned" [[A:%.*]]) +; IS__TUNIT____-NEXT: ret i32* [[A]] +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@test_preallocated +; IS__CGSCC____-SAME: (i32* noalias nofree returned writeonly preallocated(i32) "no-capture-maybe-returned" [[A:%.*]]) +; IS__CGSCC____-NEXT: ret i32* [[A]] +; + ret i32* %a +} +define i32* @complicated_args_preallocated() { +; IS__TUNIT_OPM-LABEL: define {{[^@]+}}@complicated_args_preallocated() +; IS__TUNIT_OPM-NEXT: [[C:%.*]] = call token @llvm.call.preallocated.setup(i32 1) +; IS__TUNIT_OPM-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree writeonly preallocated(i32) align 536870912 null) #5 [ "preallocated"(token [[C]]) ] +; IS__TUNIT_OPM-NEXT: ret i32* [[CALL]] +; +; IS__TUNIT_NPM-LABEL: define {{[^@]+}}@complicated_args_preallocated() +; IS__TUNIT_NPM-NEXT: [[C:%.*]] = call token @llvm.call.preallocated.setup(i32 1) +; IS__TUNIT_NPM-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree writeonly preallocated(i32) align 536870912 null) #4 [ "preallocated"(token [[C]]) ] +; IS__TUNIT_NPM-NEXT: ret i32* [[CALL]] +; +; IS__CGSCC____-LABEL: define {{[^@]+}}@complicated_args_preallocated() +; IS__CGSCC____-NEXT: [[C:%.*]] = call token @llvm.call.preallocated.setup(i32 1) +; IS__CGSCC____-NEXT: [[CALL:%.*]] = call i32* @test_preallocated(i32* noalias nocapture nofree writeonly preallocated(i32) align 536870912 null) #6 [ "preallocated"(token [[C]]) ] +; IS__CGSCC____-NEXT: ret i32* [[CALL]] +; + %c = call token @llvm.call.preallocated.setup(i32 1) + %call = call i32* @test_preallocated(i32* preallocated(i32) null) ["preallocated"(token %c)] + ret i32* %call +} + define internal void @test_sret(%struct.X* sret %a, %struct.X** %b) { ; ; IS__TUNIT____-LABEL: define {{[^@]+}}@test_sret diff --git a/llvm/test/Transforms/DeadArgElim/keepalive.ll b/llvm/test/Transforms/DeadArgElim/keepalive.ll --- 
a/llvm/test/Transforms/DeadArgElim/keepalive.ll +++ b/llvm/test/Transforms/DeadArgElim/keepalive.ll @@ -1,5 +1,8 @@ ; RUN: opt < %s -deadargelim -S | FileCheck %s +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + %Ty = type <{ i32, i32 }> ; Check if the pass doesn't modify anything that doesn't need changing. We feed @@ -44,4 +47,22 @@ ret i32 %v } +; We can't remove 'this' here, as that would put argmem in ecx instead of +; memory. +define internal x86_thiscallcc i32 @unused_this_preallocated(i32* %this, i32* preallocated(i32) %argmem) { + %v = load i32, i32* %argmem + ret i32 %v +} +; CHECK-LABEL: define internal x86_thiscallcc i32 @unused_this_preallocated(i32* %this, i32* preallocated(i32) %argmem) + +define i32 @caller3() { + %t = alloca i32 + %c = call token @llvm.call.preallocated.setup(i32 1) + %M = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32) + %m = bitcast i8* %M to i32* + store i32 42, i32* %m + %v = call x86_thiscallcc i32 @unused_this_preallocated(i32* %t, i32* preallocated(i32) %m) ["preallocated"(token %c)] + ret i32 %v +} + ; CHECK: attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll --- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll +++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple-todo.ll @@ -58,6 +58,16 @@ ret void } +; Test for preallocated handling. +define void @test9_3(%struct.x* preallocated(%struct.x) %a) nounwind { +; CHECK-LABEL: @test9_3( +; CHECK-NEXT: ret void +; + %tmp2 = getelementptr %struct.x, %struct.x* %a, i32 0, i32 0 + store i32 1, i32* %tmp2, align 4 + ret void +} + ; DSE should delete the dead trampoline. declare void @test11f() define void @test11() { diff --git a/llvm/test/Transforms/DeadStoreElimination/simple.ll b/llvm/test/Transforms/DeadStoreElimination/simple.ll --- a/llvm/test/Transforms/DeadStoreElimination/simple.ll +++ b/llvm/test/Transforms/DeadStoreElimination/simple.ll @@ -169,6 +169,16 @@ ret void } +; Test for preallocated handling. +define void @test9_3(%struct.x* preallocated(%struct.x) %a) nounwind { +; CHECK-LABEL: @test9_3( +; CHECK-NEXT: ret void +; + %tmp2 = getelementptr %struct.x, %struct.x* %a, i32 0, i32 0 + store i32 1, i32* %tmp2, align 4 + ret void +} + ; va_arg has fuzzy dependence, the store shouldn't be zapped. 
define double @test10(i8* %X) { ; CHECK-LABEL: @test10( diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll --- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll @@ -56,6 +56,12 @@ ret void } +; CHECK: define void @test7_2(i32* nocapture preallocated(i32) %a) +; preallocated parameters are always considered written +define void @test7_2(i32* preallocated(i32) %a) { + ret void +} + ; CHECK: define i32* @test8_1(i32* readnone returned %p) define i32* @test8_1(i32* %p) { entry: diff --git a/llvm/test/Transforms/GlobalOpt/fastcc.ll b/llvm/test/Transforms/GlobalOpt/fastcc.ll --- a/llvm/test/Transforms/GlobalOpt/fastcc.ll +++ b/llvm/test/Transforms/GlobalOpt/fastcc.ll @@ -1,5 +1,8 @@ ; RUN: opt < %s -globalopt -S | FileCheck %s +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + define internal i32 @f(i32* %m) { ; CHECK-LABEL: define internal fastcc i32 @f %v = load i32, i32* %m @@ -32,6 +35,13 @@ ret i32 %rv } +define internal i32 @preallocated(i32* preallocated(i32) %p) { +; TODO: handle preallocated: +; CHECK-NOT-LABEL: define internal fastcc i32 @preallocated(i32* %p) + %rv = load i32, i32* %p + ret i32 %rv +} + define void @call_things() { %m = alloca i32 call i32 @f(i32* %m) @@ -40,6 +50,11 @@ call i32 @j(i32* %m) %args = alloca inalloca i32 call i32 @inalloca(i32* inalloca %args) + ; TODO: handle preallocated + ;%c = call token @llvm.call.preallocated.setup(i32 1) + ;%N = call i8* @llvm.call.preallocated.arg(token %c, i32 0) preallocated(i32) + ;%n = bitcast i8* %N to i32* + ;call i32 @preallocated(i32* preallocated(i32) %n) ["preallocated"(token %c)] ret void } diff --git a/llvm/test/Transforms/InstCombine/call-cast-target-preallocated.ll b/llvm/test/Transforms/InstCombine/call-cast-target-preallocated.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/call-cast-target-preallocated.ll @@ -0,0 +1,28 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +target datalayout = "e-p:32:32" +target triple = "i686-pc-win32" + + +declare token @llvm.call.preallocated.setup(i32) +declare i8* @llvm.call.preallocated.arg(token, i32) + +declare void @takes_i32(i32) +declare void @takes_i32_preallocated(i32* preallocated(i32)) + +define void @f() { +; CHECK-LABEL: define void @f() + %t = call token @llvm.call.preallocated.setup(i32 1) + %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(i32) + %arg = bitcast i8* %a to i32* + call void bitcast (void (i32)* @takes_i32 to void (i32*)*)(i32* preallocated(i32) %arg) ["preallocated"(token %t)] +; CHECK: call void bitcast{{.*}}@takes_i32 + ret void +} + +define void @g() { +; CHECK-LABEL: define void @g() + call void bitcast (void (i32*)* @takes_i32_preallocated to void (i32)*)(i32 0) +; CHECK: call void bitcast{{.*}}@takes_i32_preallocated + ret void +}
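Taken together, the pieces above map the IR-level intrinsics to the new target pseudos and then to concrete x86-32 instructions in the custom inserters. The following annotated sketch (illustrative only; it mirrors @one_preallocated in preallocated.ll, and @lowering_sketch is a placeholder name) summarizes that correspondence:

declare token @llvm.call.preallocated.setup(i32)
declare i8* @llvm.call.preallocated.arg(token, i32)

%Foo = type { i32, i32 }
declare void @foo_p(%Foo* preallocated(%Foo))

define void @lowering_sketch() {
  ; Selected as a PREALLOCATED_SETUP pseudo; the custom inserter emits
  ; "subl $8, %esp" using the stack size recorded for this call-site id.
  %t = call token @llvm.call.preallocated.setup(i32 1)
  ; Selected as a PREALLOCATED_ARG pseudo; the custom inserter emits
  ; "leal <offset>(%esp), %reg" using the offset recorded for argument 0.
  %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%Foo)
  %b = bitcast i8* %a to %Foo*
  ; LowerCall sets NumBytesToPush to 0 for this call site, so nothing is pushed
  ; for the preallocated argument; it already sits in the reserved area at %esp.
  call void @foo_p(%Foo* preallocated(%Foo) %b) ["preallocated"(token %t)]
  ret void
}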