Index: llvm/trunk/include/llvm/CodeGen/CallingConvLower.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/CallingConvLower.h
+++ llvm/trunk/include/llvm/CodeGen/CallingConvLower.h
@@ -296,6 +296,12 @@
   void AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
                               CCAssignFn Fn);
 
+  /// The function will invoke AnalyzeFormalArguments.
+  void AnalyzeArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+                        CCAssignFn Fn) {
+    AnalyzeFormalArguments(Ins, Fn);
+  }
+
   /// AnalyzeReturn - Analyze the returned values of a return,
   /// incorporating info about the result values into this state.
   void AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -318,11 +324,22 @@
                            SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
                            CCAssignFn Fn);
 
+  /// The function will invoke AnalyzeCallOperands.
+  void AnalyzeArguments(const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        CCAssignFn Fn) {
+    AnalyzeCallOperands(Outs, Fn);
+  }
+
   /// AnalyzeCallResult - Analyze the return values of a call,
   /// incorporating info about the passed values into this state.
   void AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
                          CCAssignFn Fn);
 
+  /// A shadow allocated register is a register that was allocated
+  /// but wasn't added to the location list (Locs).
+  /// \returns true if the register was allocated as shadow or false otherwise.
+  bool IsShadowAllocatedReg(unsigned Reg) const;
+
   /// AnalyzeCallResult - Same as above except it's specialized for calls which
   /// produce a single value.
   void AnalyzeCallResult(MVT VT, CCAssignFn Fn);
@@ -521,6 +538,37 @@
                                 const SmallVectorImpl<ISD::InputArg> &Ins,
                                 CCAssignFn CalleeFn, CCAssignFn CallerFn);
 
+  /// The function runs an additional analysis pass over function arguments.
+  /// It will mark each argument with the attribute flag SecArgPass.
+  /// After running, it will sort the locs list.
+  template <class T>
+  void AnalyzeArgumentsSecondPass(const SmallVectorImpl<T> &Args,
+                                  CCAssignFn Fn) {
+    unsigned NumFirstPassLocs = Locs.size();
+
+    /// Creates a similar argument list to \p Args in which each argument is
+    /// marked using the SecArgPass flag.
+    SmallVector<T, 16> SecPassArg;
+    for (auto Arg : Args) {
+      Arg.Flags.setSecArgPass();
+      SecPassArg.push_back(Arg);
+    }
+
+    // Run the second argument pass
+    AnalyzeArguments(SecPassArg, Fn);
+
+    // Sort the locations of the arguments according to their original position.
+    SmallVector<CCValAssign, 16> TmpArgLocs;
+    std::swap(TmpArgLocs, Locs);
+    auto B = TmpArgLocs.begin(), E = TmpArgLocs.end();
+    std::merge(B, B + NumFirstPassLocs, B + NumFirstPassLocs, E,
+               std::back_inserter(Locs),
+               [](const CCValAssign &A, const CCValAssign &B) -> bool {
+                 return A.getValNo() < B.getValNo();
+               });
+  }
+
 private:
   /// MarkAllocated - Mark a register and all of its aliases as allocated.
   void MarkAllocated(unsigned Reg);
Index: llvm/trunk/include/llvm/Target/TargetCallingConv.h
===================================================================
--- llvm/trunk/include/llvm/Target/TargetCallingConv.h
+++ llvm/trunk/include/llvm/Target/TargetCallingConv.h
@@ -51,6 +51,15 @@
     static const uint64_t SwiftSelfOffs  = 14;
     static const uint64_t SwiftError     = 1ULL<<15; ///< Swift error parameter
     static const uint64_t SwiftErrorOffs = 15;
+    static const uint64_t Hva            = 1ULL << 16; ///< HVA field for
+                                                       ///< vectorcall
+    static const uint64_t HvaOffs        = 16;
+    static const uint64_t HvaStart       = 1ULL << 17; ///< HVA structure start
+                                                       ///< for vectorcall
+    static const uint64_t HvaStartOffs   = 17;
+    static const uint64_t SecArgPass     = 1ULL << 18; ///< Second argument
+                                                       ///< pass for vectorcall
+    static const uint64_t SecArgPassOffs = 18;
     static const uint64_t OrigAlign      = 0x1FULL<<27;
     static const uint64_t OrigAlignOffs  = 27;
     static const uint64_t ByValSize      = 0x3fffffffULL<<32; ///< Struct size
@@ -91,6 +100,15 @@
     bool isSwiftError() const { return Flags & SwiftError; }
     void setSwiftError() { Flags |= One << SwiftErrorOffs; }
 
+    bool isHva() const { return Flags & Hva; }
+    void setHva() { Flags |= One << HvaOffs; }
+
+    bool isHvaStart() const { return Flags & HvaStart; }
+    void setHvaStart() { Flags |= One << HvaStartOffs; }
+
+    bool isSecArgPass() const { return Flags & SecArgPass; }
+    void setSecArgPass() { Flags |= One << SecArgPassOffs; }
+
     bool isNest() const { return Flags & Nest; }
     void setNest() { Flags |= One << NestOffs; }
 
Index: llvm/trunk/lib/CodeGen/CallingConvLower.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/CallingConvLower.cpp
+++ llvm/trunk/lib/CodeGen/CallingConvLower.cpp
@@ -23,6 +23,8 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+
 using namespace llvm;
 
 CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
@@ -64,6 +66,22 @@
     UsedRegs[*AI/32] |= 1 << (*AI&31);
 }
 
+bool CCState::IsShadowAllocatedReg(unsigned Reg) const {
+  if (!isAllocated(Reg))
+    return false;
+
+  for (auto const &ValAssign : Locs) {
+    if (ValAssign.isRegLoc()) {
+      for (MCRegAliasIterator AI(ValAssign.getLocReg(), &TRI, true);
+           AI.isValid(); ++AI) {
+        if (*AI == Reg)
+          return false;
+      }
+    }
+  }
+  return true;
+}
+
 /// Analyze an array of argument values,
 /// incorporating info about the formals into this state.
 void
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7732,8 +7732,19 @@
       Flags.setZExt();
     if (Args[i].isSExt)
       Flags.setSExt();
-    if (Args[i].isInReg)
+    if (Args[i].isInReg) {
+      // If we are using vectorcall calling convention, a structure that is
+      // passed InReg - is surely an HVA
+      if (CLI.CallConv == CallingConv::X86_VectorCall &&
+          isa<StructType>(FinalType)) {
+        // The first value of a structure is marked
+        if (0 == Value)
+          Flags.setHvaStart();
+        Flags.setHva();
+      }
+      // Set InReg Flag
       Flags.setInReg();
+    }
     if (Args[i].isSRet)
       Flags.setSRet();
     if (Args[i].isSwiftSelf)
@@ -8019,8 +8030,19 @@
       Flags.setZExt();
     if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
       Flags.setSExt();
-    if (F.getAttributes().hasAttribute(Idx, Attribute::InReg))
+    if (F.getAttributes().hasAttribute(Idx, Attribute::InReg)) {
+      // If we are using vectorcall calling convention, a structure that is
+      // passed InReg - is surely an HVA
+      if (F.getCallingConv() == CallingConv::X86_VectorCall &&
+          isa<StructType>(I->getType())) {
+        // The first value of a structure is marked
+        if (0 == Value)
+          Flags.setHvaStart();
+        Flags.setHva();
+      }
+      // Set InReg Flag
       Flags.setInReg();
+    }
     if (F.getAttributes().hasAttribute(Idx, Attribute::StructRet))
       Flags.setSRet();
     if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftSelf))
Index: llvm/trunk/lib/Target/X86/X86CallingConv.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86CallingConv.h
+++ llvm/trunk/lib/Target/X86/X86CallingConv.h
@@ -24,22 +24,29 @@
 /// When regcall calling convention compiled to 32 bit arch, special treatment
 /// is required for 64 bit masks.
 /// The value should be assigned to two GPRs.
-/// @return true if registers were allocated and false otherwise
+/// \return true if registers were allocated and false otherwise.
 bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                                    CCValAssign::LocInfo &LocInfo,
                                    ISD::ArgFlagsTy &ArgFlags, CCState &State);
 
-inline bool CC_X86_32_VectorCallIndirect(unsigned &ValNo, MVT &ValVT,
-                                         MVT &LocVT,
-                                         CCValAssign::LocInfo &LocInfo,
-                                         ISD::ArgFlagsTy &ArgFlags,
-                                         CCState &State) {
-  // Similar to CCPassIndirect, with the addition of inreg.
-  LocVT = MVT::i32;
-  LocInfo = CCValAssign::Indirect;
-  ArgFlags.setInReg();
-  return false; // Continue the search, but now for i32.
-}
+/// Vectorcall calling convention has special handling for vector types or
+/// HVA for 64 bit arch.
+/// For HVAs shadow registers might be allocated on the first pass
+/// and actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State);
+
+/// Vectorcall calling convention has special handling for vector types or
+/// HVA for 32 bit arch.
+/// For HVAs actual XMM registers are allocated on the second pass.
+/// For vector types, actual XMM registers are allocated on the first pass.
+/// \return true if registers were allocated and false otherwise.
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State);
 
 inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
                                 CCValAssign::LocInfo &, ISD::ArgFlagsTy &,
Index: llvm/trunk/lib/Target/X86/X86CallingConv.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86CallingConv.cpp
+++ llvm/trunk/lib/Target/X86/X86CallingConv.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86Subtarget.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/IR/CallingConv.h"
 
@@ -39,14 +40,14 @@
   if (AvailableRegs.size() < RequiredGprsUponSplit)
     return false; // Not enough free registers - continue the search.
 
-  // Allocating the available registers
+  // Allocating the available registers.
  for (unsigned I = 0; I < RequiredGprsUponSplit; I++) {
 
-    // Marking the register as located
+    // Marking the register as located.
     unsigned Reg = State.AllocateReg(AvailableRegs[I]);
 
     // Since we previously made sure that 2 registers are available
-    // we expect that a real register number will be returned
+    // we expect that a real register number will be returned.
     assert(Reg && "Expecting a register will be available");
 
     // Assign the value to the allocated register
@@ -57,4 +58,151 @@
   return true;
 }
 
+static ArrayRef<MCPhysReg> CC_X86_VectorCallGetSSEs(const MVT &ValVT) {
+  if (ValVT.is512BitVector()) {
+    static const MCPhysReg RegListZMM[] = {X86::ZMM0, X86::ZMM1, X86::ZMM2,
+                                           X86::ZMM3, X86::ZMM4, X86::ZMM5};
+    return RegListZMM;
+  }
+
+  if (ValVT.is256BitVector()) {
+    static const MCPhysReg RegListYMM[] = {X86::YMM0, X86::YMM1, X86::YMM2,
+                                           X86::YMM3, X86::YMM4, X86::YMM5};
+    return RegListYMM;
+  }
+
+  static const MCPhysReg RegListXMM[] = {X86::XMM0, X86::XMM1, X86::XMM2,
+                                         X86::XMM3, X86::XMM4, X86::XMM5};
+  return RegListXMM;
+}
+
+static ArrayRef<MCPhysReg> CC_X86_64_VectorCallGetGPRs() {
+  static const MCPhysReg RegListGPR[] = {X86::RCX, X86::RDX, X86::R8, X86::R9};
+  return RegListGPR;
+}
+
+static bool CC_X86_VectorCallAssignRegister(unsigned &ValNo, MVT &ValVT,
+                                            MVT &LocVT,
+                                            CCValAssign::LocInfo &LocInfo,
+                                            ISD::ArgFlagsTy &ArgFlags,
+                                            CCState &State) {
+
+  ArrayRef<MCPhysReg> RegList = CC_X86_VectorCallGetSSEs(ValVT);
+  bool Is64bit = static_cast<const X86Subtarget &>(
+                     State.getMachineFunction().getSubtarget())
+                     .is64Bit();
+
+  for (auto Reg : RegList) {
+    // If the register is not marked as allocated - assign to it.
+    if (!State.isAllocated(Reg)) {
+      unsigned AssignedReg = State.AllocateReg(Reg);
+      assert(AssignedReg == Reg && "Expecting a valid register allocation");
+      State.addLoc(
+          CCValAssign::getReg(ValNo, ValVT, AssignedReg, LocVT, LocInfo));
+      return true;
+    }
+    // If the register is marked as shadow allocated - assign to it.
+    if (Is64bit && State.IsShadowAllocatedReg(Reg)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+      return true;
+    }
+  }
+
+  llvm_unreachable("Clang should ensure that hva marked vectors will have "
+                   "an available register.");
+  return false;
+}
+
+bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // On the second pass, go through the HVAs only.
+  if (ArgFlags.isSecArgPass()) {
+    if (ArgFlags.isHva())
+      return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+                                             ArgFlags, State);
+    return true;
+  }
+
+  // Process only vector types as defined by vectorcall spec:
+  // "A vector type is either a floating-point type, for example,
+  // a float or double, or an SIMD vector type, for example, __m128 or __m256".
+  if (!(ValVT.isFloatingPoint() ||
+        (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+    // If R9 was already assigned it means that we are after the fourth element
+    // and because this is not an HVA / Vector type, we need to allocate
+    // shadow XMM register.
+    if (State.isAllocated(X86::R9)) {
+      // Assign shadow XMM register.
+      (void)State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT));
+    }
+
+    return false;
+  }
+
+  if (!ArgFlags.isHva() || ArgFlags.isHvaStart()) {
+    // Assign shadow GPR register.
+    (void)State.AllocateReg(CC_X86_64_VectorCallGetGPRs());
+
+    // Assign XMM register - (shadow for HVA and non-shadow for non HVA).
+    if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+      // In Vectorcall Calling convention, additional shadow stack can be
+      // created on top of the basic 32 bytes of win64.
+      // It can happen if the fifth or sixth argument is vector type or HVA.
+      // In that case a shadow stack of 8 bytes is allocated for each argument.
+      if (Reg == X86::XMM4 || Reg == X86::XMM5)
+        State.AllocateStack(8, 8);
+
+      if (!ArgFlags.isHva()) {
+        State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+        return true; // Allocated a register - Stop the search.
+      }
+    }
+  }
+
+  // If this is an HVA - Stop the search,
+  // otherwise continue the search.
+  return ArgFlags.isHva();
+}
+
+bool CC_X86_32_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  // On the second pass, go through the HVAs only.
+  if (ArgFlags.isSecArgPass()) {
+    if (ArgFlags.isHva())
+      return CC_X86_VectorCallAssignRegister(ValNo, ValVT, LocVT, LocInfo,
+                                             ArgFlags, State);
+    return true;
+  }
+
+  // Process only vector types as defined by vectorcall spec:
+  // "A vector type is either a floating-point type, for example,
+  // a float or double, or an SIMD vector type, for example, __m128 or __m256".
+  if (!(ValVT.isFloatingPoint() ||
+        (ValVT.isVector() && ValVT.getSizeInBits() >= 128))) {
+    return false;
+  }
+
+  if (ArgFlags.isHva())
+    return true; // If this is an HVA - Stop the search.
+
+  // Assign XMM register.
+  if (unsigned Reg = State.AllocateReg(CC_X86_VectorCallGetSSEs(ValVT))) {
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+    return true;
+  }
+
+  // In case we did not find an available XMM register for a vector -
+  // pass it indirectly.
+  // It is similar to CCPassIndirect, with the addition of inreg.
+  if (!ValVT.isFloatingPoint()) {
+    LocVT = MVT::i32;
+    LocInfo = CCValAssign::Indirect;
+    ArgFlags.setInReg();
+  }
+
+  return false; // No register was assigned - Continue the search.
+}
+
 } // End llvm namespace
Index: llvm/trunk/lib/Target/X86/X86CallingConv.td
===================================================================
--- llvm/trunk/lib/Target/X86/X86CallingConv.td
+++ llvm/trunk/lib/Target/X86/X86CallingConv.td
@@ -308,20 +308,12 @@
   CCIfType<[i32], CCAssignToReg<[ESI, EBP, EAX, EDX]>>
 ]>;
 
-// X86-32 HiPE return-value convention.
+// X86-32 Vectorcall return-value convention.
 def RetCC_X86_32_VectorCall : CallingConv<[
-  // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3.
-  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+  // Floating Point types are returned in XMM0,XMM1,XMM2 and XMM3.
+  CCIfType<[f32, f64, f128],
            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
 
-  // 256-bit FP vectors
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-            CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
-
-  // 512-bit FP vectors
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-            CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>,
-
   // Return integers in the standard way.
   CCDelegateTo<RetCC_X86Common>
 ]>;
@@ -350,6 +342,16 @@
   CCDelegateTo<RetCC_X86_64_C>
 ]>;
 
+// X86-64 vectorcall return-value convention.
+def RetCC_X86_64_Vectorcall : CallingConv<[
+  // Vectorcall calling convention always returns FP values in XMMs.
+  CCIfType<[f32, f64, f128],
+           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+  // Otherwise, everything is the same as Windows X86-64 C CC.
+  CCDelegateTo<RetCC_X86_Win64_C>
+]>;
+
 // X86-64 HiPE return-value convention.
 def RetCC_X86_64_HiPE : CallingConv<[
   // Promote all types to i64
@@ -447,6 +449,9 @@
   CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>,
   CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>,
 
+  // Handle Vectorcall CC
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_64_Vectorcall>>,
+
   // Handle HHVM calls.
   CCIfCC<"CallingConv::HHVM", CCDelegateTo<RetCC_X86_64_HHVM>>,
 
@@ -626,18 +631,7 @@
 ]>;
 
 def CC_X86_Win64_VectorCall : CallingConv<[
-  // The first 6 floating point and vector types of 128 bits or less use
-  // XMM0-XMM5.
-  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
-
-  // 256-bit vectors use YMM registers.
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
-
-  // 512-bit vectors use ZMM registers.
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
+  CCCustom<"CC_X86_64_VectorCall">,
 
   // Delegate to fastcall to handle integer types.
   CCDelegateTo<CC_X86_Win64_C>
@@ -847,25 +841,9 @@
   CCDelegateTo<CC_X86_32_Common>
 ]>;
 
-def CC_X86_32_VectorCall : CallingConv<[
-  // The first 6 floating point and vector types of 128 bits or less use
-  // XMM0-XMM5.
-  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5]>>,
-
-  // 256-bit vectors use YMM registers.
-  CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64],
-           CCAssignToReg<[YMM0, YMM1, YMM2, YMM3, YMM4, YMM5]>>,
-
-  // 512-bit vectors use ZMM registers.
-  CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-           CCAssignToReg<[ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5]>>,
-
-  // Otherwise, pass it indirectly.
-  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64,
-            v32i8, v16i16, v8i32, v4i64, v8f32, v4f64,
-            v64i8, v32i16, v16i32, v8i64, v16f32, v8f64],
-           CCCustom<"CC_X86_32_VectorCallIndirect">>,
+def CC_X86_Win32_VectorCall : CallingConv<[
+  // Pass floating point in XMMs
+  CCCustom<"CC_X86_32_VectorCall">,
 
   // Delegate to fastcall to handle integer types.
   CCDelegateTo<CC_X86_32_FastCall>
@@ -999,7 +977,7 @@
   CCIfCC<"CallingConv::X86_INTR", CCDelegateTo<CC_X86_32_Intr>>,
   CCIfSubtarget<"isTargetMCU()", CCDelegateTo<CC_X86_32_MCU>>,
   CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
-  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_32_VectorCall>>,
+  CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
   CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
   CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
   CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -17,6 +17,7 @@
 #include "X86CallingConv.h"
 #include "X86FrameLowering.h"
 #include "X86InstrBuilder.h"
+#include "X86IntrinsicsInfo.h"
 #include "X86MachineFunctionInfo.h"
 #include "X86ShuffleDecodeConstantPool.h"
 #include "X86TargetMachine.h"
@@ -53,10 +54,10 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetOptions.h"
-#include "X86IntrinsicsInfo.h"
+#include <algorithm>
 #include <bitset>
-#include <numeric>
 #include <cctype>
+#include <numeric>
 using namespace llvm;
 
 #define DEBUG_TYPE "x86-isel"
@@ -2781,6 +2782,13 @@
   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
 }
 
+static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
+  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
+                        [](const CCValAssign &A, const CCValAssign &B) -> bool {
+                          return A.getValNo() < B.getValNo();
+                        });
+}
+
 SDValue X86TargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
@@ -2815,11 +2823,22 @@
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
 
-  // Allocate shadow area for Win64
+  // Allocate shadow area for Win64.
   if (IsWin64)
     CCInfo.AllocateStack(32, 8);
 
-  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
+  CCInfo.AnalyzeArguments(Ins, CC_X86);
+
+  // In vectorcall calling convention a second pass is required for the HVA
+  // types.
+  if (CallingConv::X86_VectorCall == CallConv) {
+    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
+  }
+
+  // The next loop assumes that the locations are in the same order as the
+  // input arguments.
+  assert(isSortedByValueNo(ArgLocs) &&
+         "Argument Location list must be sorted before lowering");
 
   SDValue ArgValue;
   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
@@ -3263,11 +3282,17 @@
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
 
-  // Allocate shadow area for Win64
+  // Allocate shadow area for Win64.
   if (IsWin64)
     CCInfo.AllocateStack(32, 8);
 
-  CCInfo.AnalyzeCallOperands(Outs, CC_X86);
+  CCInfo.AnalyzeArguments(Outs, CC_X86);
+
+  // In vectorcall calling convention a second pass is required for the HVA
+  // types.
+  if (CallingConv::X86_VectorCall == CallConv) {
+    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
+  }
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
@@ -3322,6 +3347,11 @@
   SmallVector<SDValue, 8> MemOpChains;
   SDValue StackPtr;
 
+  // The next loop assumes that the locations are in the same order as the
+  // input arguments.
+  assert(isSortedByValueNo(ArgLocs) &&
+         "Argument Location list must be sorted before lowering");
+
   // Walk the register/memloc assignments, inserting copies/loads. In the case
   // of tail call optimization arguments are handle later.
   const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
Index: llvm/trunk/test/CodeGen/X86/vectorcall.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vectorcall.ll
+++ llvm/trunk/test/CodeGen/X86/vectorcall.ll
@@ -6,14 +6,12 @@
 define x86_vectorcallcc i32 @test_int_1() {
   ret i32 0
 }
-
 ; CHECK-LABEL: {{^}}test_int_1@@0:
 ; CHECK: xorl %eax, %eax
 
 define x86_vectorcallcc i32 @test_int_2(i32 inreg %a) {
   ret i32 %a
 }
-
 ; X86-LABEL: {{^}}test_int_2@@4:
 ; X64-LABEL: {{^}}test_int_2@@8:
 ; CHECK: movl %ecx, %eax
@@ -22,7 +20,6 @@
   %at = trunc i64 %a to i32
   ret i32 %at
 }
-
 ; X86-LABEL: {{^}}test_int_3@@8:
 ; X64-LABEL: {{^}}test_int_3@@8:
 ; CHECK: movl %ecx, %eax
@@ -31,10 +28,8 @@
   %s = add i32 %a, %b
   ret i32 %s
 }
-
 ; X86-LABEL: {{^}}test_int_4@@8:
 ; X86: leal (%ecx,%edx), %eax
-
 ; X64-LABEL: {{^}}test_int_4@@16:
 ; X64: leal (%rcx,%rdx), %eax
 
@@ -90,4 +85,139 @@
   ret <16 x i8> %r
 }
 ; CHECK-LABEL: {{^}}test_vec_2@@104:
-; CHECK: movaps (%{{[re]}}cx), %xmm0
+; x64: movq {{[0-9]*}}(%rsp), %rax
+; CHECK: movaps (%{{rax|ecx}}), %xmm0
+
+%struct.HVA5 = type { <4 x float>, <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+%struct.HVA4 = type { <4 x float>, <4 x float>, <4 x float>, <4 x float> }
+%struct.HVA3 = type { <4 x float>, <4 x float>, <4 x float> }
+%struct.HVA2 = type { <4 x float>, <4 x float> }
+
+define x86_vectorcallcc <4 x float> @test_mixed_1(i32 %a, %struct.HVA4 inreg %bb, i32 %c) {
+entry:
+  %b = alloca %struct.HVA4, align 16
+  store %struct.HVA4 %bb, %struct.HVA4* %b, align 16
+  %w1 = getelementptr inbounds %struct.HVA4, %struct.HVA4* %b, i32 0, i32 1
+  %0 = load <4 x float>, <4 x float>* %w1, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_1
+; CHECK: movaps %xmm1, 16(%{{(e|r)}}sp)
+; CHECK: movaps 16(%{{(e|r)}}sp), %xmm0
+; CHECK: ret{{q|l}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_2(%struct.HVA4 inreg %a, %struct.HVA4* %b, <4 x float> %c) {
+entry:
+  %c.addr = alloca <4 x float>, align 16
+  store <4 x float> %c, <4 x float>* %c.addr, align 16
+  %0 = load <4 x float>, <4 x float>* %c.addr, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_2
+; X86: movaps %xmm0, (%esp)
+; X64: movaps %xmm2, %xmm0
+; CHECK: ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_3(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* %f) {
+entry:
+  %x = getelementptr inbounds %struct.HVA2, %struct.HVA2* %f, i32 0, i32 0
+  %0 = load <4 x float>, <4 x float>* %x, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_3
+; CHECK: movaps (%{{[re][ac]}}x), %xmm0
+; CHECK: ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_4(%struct.HVA4 inreg %a, %struct.HVA2* %bb, <4 x float> %c) {
+entry:
+  %y4 = getelementptr inbounds %struct.HVA2, %struct.HVA2* %bb, i32 0, i32 1
+  %0 = load <4 x float>, <4 x float>* %y4, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_4
+; X86: movaps 16(%eax), %xmm0
+; X64: movaps 16(%rdx), %xmm0
+; CHECK: ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_5(%struct.HVA3 inreg %a, %struct.HVA3* %b, <4 x float> %c, %struct.HVA2 inreg %dd) {
+entry:
+  %d = alloca %struct.HVA2, align 16
+  store %struct.HVA2 %dd, %struct.HVA2* %d, align 16
+  %y5 = getelementptr inbounds %struct.HVA2, %struct.HVA2* %d, i32 0, i32 1
+  %0 = load <4 x float>, <4 x float>* %y5, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_5
+; CHECK: movaps %xmm5, 16(%{{(e|r)}}sp)
+; CHECK: movaps 16(%{{(e|r)}}sp), %xmm0
+; CHECK: ret{{[ql]}}
+
+define x86_vectorcallcc %struct.HVA4 @test_mixed_6(%struct.HVA4 inreg %a, %struct.HVA4* %b) {
+entry:
+  %retval = alloca %struct.HVA4, align 16
+  %0 = bitcast %struct.HVA4* %retval to i8*
+  %1 = bitcast %struct.HVA4* %b to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 64, i32 16, i1 false)
+  %2 = load %struct.HVA4, %struct.HVA4* %retval, align 16
+  ret %struct.HVA4 %2
+}
+; CHECK-LABEL: test_mixed_6
+; CHECK: movaps (%{{[re]}}sp), %xmm0
+; CHECK: movaps 16(%{{[re]}}sp), %xmm1
+; CHECK: movaps 32(%{{[re]}}sp), %xmm2
+; CHECK: movaps 48(%{{[re]}}sp), %xmm3
+; CHECK: ret{{[ql]}}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32, i1)
+
+define x86_vectorcallcc void @test_mixed_7(%struct.HVA5* noalias sret %agg.result) {
+entry:
+  %a = alloca %struct.HVA5, align 16
+  %0 = bitcast %struct.HVA5* %a to i8*
+  call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 80, i32 16, i1 false)
+  %1 = bitcast %struct.HVA5* %agg.result to i8*
+  %2 = bitcast %struct.HVA5* %a to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %2, i64 80, i32 16, i1 false)
+  ret void
+}
+; CHECK-LABEL: test_mixed_7
+; CHECK: movaps %xmm{{[0-9]}}, 64(%{{rcx|eax}})
+; CHECK: movaps %xmm{{[0-9]}}, 48(%{{rcx|eax}})
+; CHECK: movaps %xmm{{[0-9]}}, 32(%{{rcx|eax}})
+; CHECK: movaps %xmm{{[0-9]}}, 16(%{{rcx|eax}})
+; CHECK: movaps %xmm{{[0-9]}}, (%{{rcx|eax}})
+; X64: mov{{[ql]}} %rcx, %rax
+; CHECK: ret{{[ql]}}
+
+define x86_vectorcallcc <4 x float> @test_mixed_8(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, i32 %e, <4 x float> %f) {
+entry:
+  %f.addr = alloca <4 x float>, align 16
+  store <4 x float> %f, <4 x float>* %f.addr, align 16
+  %0 = load <4 x float>, <4 x float>* %f.addr, align 16
+  ret <4 x float> %0
+}
+; CHECK-LABEL: test_mixed_8
+; X86: movaps %xmm4, %xmm0
+; X64: movaps %xmm5, %xmm0
+; CHECK: ret{{[ql]}}
+
+%struct.HFA4 = type { double, double, double, double }
+declare x86_vectorcallcc double @test_mixed_9_callee(%struct.HFA4 %x, double %y)
+
+define x86_vectorcallcc double @test_mixed_9_caller(%struct.HFA4 inreg %b) {
+entry:
+  %call = call x86_vectorcallcc double @test_mixed_9_callee(%struct.HFA4 inreg %b, double 3.000000e+00)
+  %add = fadd double 1.000000e+00, %call
+  ret double %add
+}
+; CHECK-LABEL: test_mixed_9_caller
+; CHECK: movaps %xmm3, %xmm4
+; CHECK: movaps %xmm2, %xmm3
+; CHECK: movaps %xmm1, %xmm2
+; X32: movasd %xmm0, %xmm1
+; X64: movapd %xmm5, %xmm1
+; CHECK: call{{l|q}} test_mixed_9_callee@@40
+; CHECK: addsd {{.*}}, %xmm0
+; CHECK: ret{{l|q}}
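
The mixed tests above exercise the two-pass HVA handling end to end. As a further illustration (not part of the patch), here is a minimal standalone IR sample in the same spirit; the file name and the exact llc invocation are assumptions, and the only behavior taken from the patch is that inreg HVA members under vectorcall receive their XMM registers on the second analysis pass rather than the first.

; Hedged sketch: a hypothetical hva2.ll, to be run with this patch applied, e.g.
;   llc -mtriple=x86_64-pc-win32 < hva2.ll
; The HVA2 argument is passed inreg, so its two <4 x float> members are expected
; to be placed in XMM registers during the second argument-analysis pass.
%struct.HVA2 = type { <4 x float>, <4 x float> }

define x86_vectorcallcc <4 x float> @hva2_second_member(%struct.HVA2 inreg %h) {
entry:
  %m = extractvalue %struct.HVA2 %h, 1
  ret <4 x float> %m
}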