Index: cfe/trunk/lib/AST/Mangle.cpp
===================================================================
--- cfe/trunk/lib/AST/Mangle.cpp
+++ cfe/trunk/lib/AST/Mangle.cpp
@@ -49,10 +49,11 @@
 
 void MangleContext::anchor() { }
 
-enum StdOrFastCC {
-  SOF_OTHER,
-  SOF_FAST,
-  SOF_STD
+enum CCMangling {
+  CCM_Other,
+  CCM_Fast,
+  CCM_Vector,
+  CCM_Std
 };
 
 static bool isExternC(const NamedDecl *ND) {
@@ -61,20 +62,22 @@
   return cast<FunctionDecl>(ND)->isExternC();
 }
 
-static StdOrFastCC getStdOrFastCallMangling(const ASTContext &Context,
-                                            const NamedDecl *ND) {
+static CCMangling getCallingConvMangling(const ASTContext &Context,
+                                         const NamedDecl *ND) {
   const TargetInfo &TI = Context.getTargetInfo();
   const llvm::Triple &Triple = TI.getTriple();
-  if (!Triple.isOSWindows() || Triple.getArch() != llvm::Triple::x86)
-    return SOF_OTHER;
+  if (!Triple.isOSWindows() ||
+      !(Triple.getArch() == llvm::Triple::x86 ||
+        Triple.getArch() == llvm::Triple::x86_64))
+    return CCM_Other;
 
   if (Context.getLangOpts().CPlusPlus && !isExternC(ND) &&
       TI.getCXXABI() == TargetCXXABI::Microsoft)
-    return SOF_OTHER;
+    return CCM_Other;
 
   const FunctionDecl *FD = dyn_cast<FunctionDecl>(ND);
   if (!FD)
-    return SOF_OTHER;
+    return CCM_Other;
   QualType T = FD->getType();
 
   const FunctionType *FT = T->castAs<FunctionType>();
@@ -82,19 +85,21 @@
   CallingConv CC = FT->getCallConv();
   switch (CC) {
   default:
-    return SOF_OTHER;
+    return CCM_Other;
   case CC_X86FastCall:
-    return SOF_FAST;
+    return CCM_Fast;
   case CC_X86StdCall:
-    return SOF_STD;
+    return CCM_Std;
+  case CC_X86VectorCall:
+    return CCM_Vector;
   }
 }
 
 bool MangleContext::shouldMangleDeclName(const NamedDecl *D) {
   const ASTContext &ASTContext = getASTContext();
-  StdOrFastCC CC = getStdOrFastCallMangling(ASTContext, D);
-  if (CC != SOF_OTHER)
+  CCMangling CC = getCallingConvMangling(ASTContext, D);
+  if (CC != CCM_Other)
     return true;
 
   // In C, functions with no attributes never need to be mangled. Fastpath them.
@@ -131,10 +136,10 @@
   }
 
   const ASTContext &ASTContext = getASTContext();
-  StdOrFastCC CC = getStdOrFastCallMangling(ASTContext, D);
+  CCMangling CC = getCallingConvMangling(ASTContext, D);
   bool MCXX = shouldMangleCXXName(D);
   const TargetInfo &TI = Context.getTargetInfo();
-  if (CC == SOF_OTHER || (MCXX && TI.getCXXABI() == TargetCXXABI::Microsoft)) {
+  if (CC == CCM_Other || (MCXX && TI.getCXXABI() == TargetCXXABI::Microsoft)) {
     if (const ObjCMethodDecl *OMD = dyn_cast<ObjCMethodDecl>(D))
       mangleObjCMethodName(OMD, Out);
     else
@@ -143,9 +148,9 @@
   }
 
   Out << '\01';
-  if (CC == SOF_STD)
+  if (CC == CCM_Std)
     Out << '_';
-  else
+  else if (CC == CCM_Fast)
     Out << '@';
 
   if (!MCXX)
@@ -158,6 +163,8 @@
   const FunctionDecl *FD = cast<FunctionDecl>(D);
   const FunctionType *FT = FD->getType()->castAs<FunctionType>();
   const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(FT);
+  if (CC == CCM_Vector)
+    Out << '@';
   Out << '@';
   if (!Proto) {
     Out << '0';
@@ -169,9 +176,11 @@
     if (!MD->isStatic())
       ++ArgWords;
   for (const auto &AT : Proto->param_types())
-    // Size should be aligned to DWORD boundary
-    ArgWords += llvm::RoundUpToAlignment(ASTContext.getTypeSize(AT), 32) / 32;
-  Out << 4 * ArgWords;
+    // Size should be aligned to pointer size.
+    ArgWords += llvm::RoundUpToAlignment(ASTContext.getTypeSize(AT),
+                                         TI.getPointerWidth(0)) /
+                TI.getPointerWidth(0);
+  Out << ((TI.getPointerWidth(0) / 8) * ArgWords);
 }
 
 void MangleContext::mangleGlobalBlock(const BlockDecl *BD,
Index: cfe/trunk/lib/Basic/Targets.cpp
===================================================================
--- cfe/trunk/lib/Basic/Targets.cpp
+++ cfe/trunk/lib/Basic/Targets.cpp
@@ -3503,6 +3503,7 @@
   CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
     return (CC == CC_C ||
+            CC == CC_X86VectorCall ||
             CC == CC_IntelOclBicc ||
             CC == CC_X86_64Win64) ? CCCR_OK : CCCR_Warning;
   }
@@ -3542,6 +3543,7 @@
   }
   CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
     return (CC == CC_C ||
+            CC == CC_X86VectorCall ||
             CC == CC_IntelOclBicc ||
             CC == CC_X86_64SysV) ? CCCR_OK : CCCR_Warning;
   }
Index: cfe/trunk/lib/CodeGen/CGCall.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/CGCall.cpp
+++ cfe/trunk/lib/CodeGen/CGCall.cpp
@@ -50,7 +50,7 @@
   // TODO: Add support for __pascal to LLVM.
   case CC_X86Pascal: return llvm::CallingConv::C;
   // TODO: Add support for __vectorcall to LLVM.
-  case CC_X86VectorCall: return llvm::CallingConv::C;
+  case CC_X86VectorCall: return llvm::CallingConv::X86_VectorCall;
   }
 }
@@ -603,6 +603,9 @@
   CharUnits UnionSize = CharUnits::Zero();
 
   for (const auto *FD : RD->fields()) {
+    // Skip zero length bitfields.
+    if (FD->isBitField() && FD->getBitWidthValue(Context) == 0)
+      continue;
     assert(!FD->isBitField() &&
            "Cannot expand structure with bit-field members.");
     CharUnits FieldSize = Context.getTypeSizeInChars(FD->getType());
@@ -622,6 +625,9 @@
   }
 
   for (const auto *FD : RD->fields()) {
+    // Skip zero length bitfields.
+    if (FD->isBitField() && FD->getBitWidthValue(Context) == 0)
+      continue;
     assert(!FD->isBitField() &&
            "Cannot expand structure with bit-field members.");
     Fields.push_back(FD);
Index: cfe/trunk/lib/CodeGen/MicrosoftCXXABI.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/MicrosoftCXXABI.cpp
+++ cfe/trunk/lib/CodeGen/MicrosoftCXXABI.cpp
@@ -617,8 +617,15 @@
   if (RD->hasNonTrivialCopyConstructor())
     return RAA_Indirect;
 
-  // Win64 passes objects larger than 8 bytes indirectly.
-  if (getContext().getTypeSize(RD->getTypeForDecl()) > 64)
+  // If an object has a destructor, we'd really like to pass it indirectly
+  // because it allows us to elide copies. Unfortunately, MSVC makes that
+  // impossible for small types, which it will pass in a single register or
+  // stack slot. Most objects with dtors are large-ish, so handle that early.
+  // We can't call out all large objects as being indirect because there are
+  // multiple x64 calling conventions and the C++ ABI code shouldn't dictate
+  // how we pass large POD types.
+  if (RD->hasNonTrivialDestructor() &&
+      getContext().getTypeSize(RD->getTypeForDecl()) > 64)
     return RAA_Indirect;
 
   // We have a trivial copy constructor or no copy constructors, but we have
Index: cfe/trunk/lib/CodeGen/TargetInfo.cpp
===================================================================
--- cfe/trunk/lib/CodeGen/TargetInfo.cpp
+++ cfe/trunk/lib/CodeGen/TargetInfo.cpp
@@ -508,18 +508,39 @@
   return Ty;
 }
 
+/// Returns true if this type can be passed in SSE registers with the
+/// X86_VectorCall calling convention. Shared between x86_32 and x86_64.
+static bool isX86VectorTypeForVectorCall(ASTContext &Context, QualType Ty) {
+  if (const BuiltinType *BT = Ty->getAs<BuiltinType>()) {
+    if (BT->isFloatingPoint() && BT->getKind() != BuiltinType::Half)
+      return true;
+  } else if (const VectorType *VT = Ty->getAs<VectorType>()) {
+    // vectorcall can pass XMM, YMM, and ZMM vectors. We don't pass SSE1 MMX
+    // registers specially.
+    unsigned VecSize = Context.getTypeSize(VT);
+    if (VecSize == 128 || VecSize == 256 || VecSize == 512)
+      return true;
+  }
+  return false;
+}
+
+/// Returns true if this aggregate is small enough to be passed in SSE registers
+/// in the X86_VectorCall calling convention. Shared between x86_32 and x86_64.
+static bool isX86VectorCallAggregateSmallEnough(uint64_t NumMembers) {
+  return NumMembers <= 4;
+}
+
 //===----------------------------------------------------------------------===//
 // X86-32 ABI Implementation
 //===----------------------------------------------------------------------===//
 
 /// \brief Similar to llvm::CCState, but for Clang.
 struct CCState {
-  CCState(unsigned CC) : CC(CC), FreeRegs(0) {}
+  CCState(unsigned CC) : CC(CC), FreeRegs(0), FreeSSERegs(0) {}
 
   unsigned CC;
   unsigned FreeRegs;
-  unsigned StackOffset;
-  bool UseInAlloca;
+  unsigned FreeSSERegs;
 };
 
 /// X86_32ABIInfo - The X86-32 ABI information.
@@ -540,6 +561,17 @@
     return (Size == 8 || Size == 16 || Size == 32 || Size == 64);
   }
 
+  bool isHomogeneousAggregateBaseType(QualType Ty) const override {
+    // FIXME: Assumes vectorcall is in use.
+    return isX86VectorTypeForVectorCall(getContext(), Ty);
+  }
+
+  bool isHomogeneousAggregateSmallEnough(const Type *Ty,
+                                         uint64_t NumMembers) const override {
+    // FIXME: Assumes vectorcall is in use.
+    return isX86VectorCallAggregateSmallEnough(NumMembers);
+  }
+
   bool shouldReturnTypeInRegister(QualType Ty, ASTContext &Context) const;
 
   /// getIndirectResult - Give a source type \arg Ty, return a suitable result
@@ -767,6 +799,14 @@
   if (RetTy->isVoidType())
     return ABIArgInfo::getIgnore();
 
+  const Type *Base = nullptr;
+  uint64_t NumElts = 0;
+  if (State.CC == llvm::CallingConv::X86_VectorCall &&
+      isHomogeneousAggregate(RetTy, Base, NumElts)) {
+    // The LLVM struct type for such an aggregate should lower properly.
+    return ABIArgInfo::getDirect();
+  }
+
   if (const VectorType *VT = RetTy->getAs<VectorType>()) {
     // On Darwin, some vectors are returned in registers.
     if (IsDarwinVectorABI) {
@@ -939,7 +979,8 @@
 
   State.FreeRegs -= SizeInRegs;
 
-  if (State.CC == llvm::CallingConv::X86_FastCall) {
+  if (State.CC == llvm::CallingConv::X86_FastCall ||
+      State.CC == llvm::CallingConv::X86_VectorCall) {
     if (Size > 32)
       return false;
 
@@ -964,17 +1005,36 @@
 ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
                                                CCState &State) const {
   // FIXME: Set alignment on indirect arguments.
-  if (isAggregateTypeForABI(Ty)) {
-    if (const RecordType *RT = Ty->getAs<RecordType>()) {
-      // Check with the C++ ABI first.
-      CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI());
-      if (RAA == CGCXXABI::RAA_Indirect) {
-        return getIndirectResult(Ty, false, State);
-      } else if (RAA == CGCXXABI::RAA_DirectInMemory) {
-        // The field index doesn't matter, we'll fix it up later.
-        return ABIArgInfo::getInAlloca(/*FieldIndex=*/0);
-      }
-    }
+  // Check with the C++ ABI first.
+  const RecordType *RT = Ty->getAs<RecordType>();
+  if (RT) {
+    CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI());
+    if (RAA == CGCXXABI::RAA_Indirect) {
+      return getIndirectResult(Ty, false, State);
+    } else if (RAA == CGCXXABI::RAA_DirectInMemory) {
+      // The field index doesn't matter, we'll fix it up later.
+      return ABIArgInfo::getInAlloca(/*FieldIndex=*/0);
+    }
+  }
+
+  // vectorcall adds the concept of a homogeneous vector aggregate, similar
+  // to other targets.
+  const Type *Base = nullptr;
+  uint64_t NumElts = 0;
+  if (State.CC == llvm::CallingConv::X86_VectorCall &&
+      isHomogeneousAggregate(Ty, Base, NumElts)) {
+    if (State.FreeSSERegs >= NumElts) {
+      State.FreeSSERegs -= NumElts;
+      if (Ty->isBuiltinType() || Ty->isVectorType())
+        return ABIArgInfo::getDirect();
+      return ABIArgInfo::getExpand();
+    }
+    return getIndirectResult(Ty, /*ByVal=*/false, State);
+  }
+
+  if (isAggregateTypeForABI(Ty)) {
+    if (RT) {
       // Structs are always byval on win32, regardless of what they contain.
       if (IsWin32StructABI)
         return getIndirectResult(Ty, true, State);
@@ -1006,7 +1066,9 @@
     if (getContext().getTypeSize(Ty) <= 4*32 &&
         canExpandIndirectArgument(Ty, getContext()))
       return ABIArgInfo::getExpandWithPadding(
-          State.CC == llvm::CallingConv::X86_FastCall, PaddingType);
+          State.CC == llvm::CallingConv::X86_FastCall ||
+              State.CC == llvm::CallingConv::X86_VectorCall,
+          PaddingType);
 
     return getIndirectResult(Ty, true, State);
   }
@@ -1049,7 +1111,10 @@
   CCState State(FI.getCallingConvention());
   if (State.CC == llvm::CallingConv::X86_FastCall)
     State.FreeRegs = 2;
-  else if (FI.getHasRegParm())
+  else if (State.CC == llvm::CallingConv::X86_VectorCall) {
+    State.FreeRegs = 2;
+    State.FreeSSERegs = 6;
+  } else if (FI.getHasRegParm())
    State.FreeRegs = FI.getRegParm();
   else
     State.FreeRegs = DefaultNumRegisterParameters;
@@ -1434,7 +1499,8 @@
 
 /// WinX86_64ABIInfo - The Windows X86_64 ABI information.
 class WinX86_64ABIInfo : public ABIInfo {
-  ABIArgInfo classify(QualType Ty, bool IsReturnType) const;
+  ABIArgInfo classify(QualType Ty, unsigned &FreeSSERegs,
+                      bool IsReturnType) const;
 
 public:
   WinX86_64ABIInfo(CodeGen::CodeGenTypes &CGT) : ABIInfo(CGT) {}
@@ -1443,6 +1509,17 @@
 
   llvm::Value *EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
                          CodeGenFunction &CGF) const override;
+
+  bool isHomogeneousAggregateBaseType(QualType Ty) const override {
+    // FIXME: Assumes vectorcall is in use.
+    return isX86VectorTypeForVectorCall(getContext(), Ty);
+  }
+
+  bool isHomogeneousAggregateSmallEnough(const Type *Ty,
+                                         uint64_t NumMembers) const override {
+    // FIXME: Assumes vectorcall is in use.
+    return isX86VectorCallAggregateSmallEnough(NumMembers);
+  }
 };
 
 class X86_64TargetCodeGenInfo : public TargetCodeGenInfo {
@@ -2844,7 +2921,8 @@
   return ResAddr;
 }
 
-ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, bool IsReturnType) const {
+ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs,
+                                      bool IsReturnType) const {
   if (Ty->isVoidType())
     return ABIArgInfo::getIgnore();
 
@@ -2852,7 +2930,9 @@
   if (const EnumType *EnumTy = Ty->getAs<EnumType>())
     Ty = EnumTy->getDecl()->getIntegerType();
 
-  uint64_t Size = getContext().getTypeSize(Ty);
+  TypeInfo Info = getContext().getTypeInfo(Ty);
+  uint64_t Width = Info.Width;
+  unsigned Align = getContext().toCharUnitsFromBits(Info.Align).getQuantity();
 
   const RecordType *RT = Ty->getAs<RecordType>();
   if (RT) {
@@ -2865,11 +2945,26 @@
       return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
 
     // FIXME: mingw-w64-gcc emits 128-bit struct as i128
-    if (Size == 128 && getTarget().getTriple().isWindowsGNUEnvironment())
+    if (Width == 128 && getTarget().getTriple().isWindowsGNUEnvironment())
       return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(),
-                                                          Size));
+                                                          Width));
+  }
+
+  // vectorcall adds the concept of a homogeneous vector aggregate, similar to
+  // other targets.
+  const Type *Base = nullptr;
+  uint64_t NumElts = 0;
+  if (FreeSSERegs && isHomogeneousAggregate(Ty, Base, NumElts)) {
+    if (FreeSSERegs >= NumElts) {
+      FreeSSERegs -= NumElts;
+      if (IsReturnType || Ty->isBuiltinType() || Ty->isVectorType())
+        return ABIArgInfo::getDirect();
+      return ABIArgInfo::getExpand();
+    }
+    return ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
+  }
+
   if (Ty->isMemberPointerType()) {
     // If the member pointer is represented by an LLVM int or ptr, pass it
     // directly.
@@ -2881,11 +2976,11 @@
   if (RT || Ty->isMemberPointerType()) {
     // MS x64 ABI requirement: "Any argument that doesn't fit in 8 bytes, or is
     // not 1, 2, 4, or 8 bytes, must be passed by reference."
-    if (Size > 64 || !llvm::isPowerOf2_64(Size))
+    if (Width > 64 || !llvm::isPowerOf2_64(Width))
       return ABIArgInfo::getIndirect(0, /*ByVal=*/false);
 
     // Otherwise, coerce it to a small integer.
-    return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Size));
+    return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Width));
   }
 
   // Bool type is always extended to the ABI, other builtin types are not
@@ -2898,11 +2993,18 @@
 }
 
 void WinX86_64ABIInfo::computeInfo(CGFunctionInfo &FI) const {
+  bool IsVectorCall =
+      FI.getCallingConvention() == llvm::CallingConv::X86_VectorCall;
+
+  // We can use up to 4 SSE return registers with vectorcall.
+  unsigned FreeSSERegs = IsVectorCall ? 4 : 0;
   if (!getCXXABI().classifyReturnType(FI))
-    FI.getReturnInfo() = classify(FI.getReturnType(), true);
+    FI.getReturnInfo() = classify(FI.getReturnType(), FreeSSERegs, true);
 
+  // We can use up to 6 SSE register parameters with vectorcall.
+  FreeSSERegs = IsVectorCall ? 6 : 0;
   for (auto &I : FI.arguments())
-    I.info = classify(I.type, false);
+    I.info = classify(I.type, FreeSSERegs, false);
 }
 
 llvm::Value *WinX86_64ABIInfo::EmitVAArg(llvm::Value *VAListAddr, QualType Ty,
Index: cfe/trunk/test/CodeGen/mangle-windows.c
===================================================================
--- cfe/trunk/test/CodeGen/mangle-windows.c
+++ cfe/trunk/test/CodeGen/mangle-windows.c
@@ -1,33 +1,68 @@
 // RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 | FileCheck %s
 // RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-mingw32 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 | FileCheck %s --check-prefix=X64
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-mingw32 | FileCheck %s --check-prefix=X64
 
 void __stdcall f1(void) {}
 // CHECK: define x86_stdcallcc void @"\01_f1@0"
+// X64: define void @f1(
 
 void __fastcall f2(void) {}
 // CHECK: define x86_fastcallcc void @"\01@f2@0"
+// X64: define void @f2(
 
 void __stdcall f3() {}
 // CHECK: define x86_stdcallcc void @"\01_f3@0"
+// X64: define void @f3(
 
 void __fastcall f4(char a) {}
 // CHECK: define x86_fastcallcc void @"\01@f4@4"
+// X64: define void @f4(
 
 void __fastcall f5(short a) {}
 // CHECK: define x86_fastcallcc void @"\01@f5@4"
+// X64: define void @f5(
 
 void __fastcall f6(int a) {}
 // CHECK: define x86_fastcallcc void @"\01@f6@4"
+// X64: define void @f6(
 
 void __fastcall f7(long a) {}
 // CHECK: define x86_fastcallcc void @"\01@f7@4"
+// X64: define void @f7(
 
 void __fastcall f8(long long a) {}
 // CHECK: define x86_fastcallcc void @"\01@f8@8"
+// X64: define void @f8(
 
 void __fastcall f9(long long a, char b, char c, short d) {}
-// CHECK: define x86_fastcallcc void @"\01@f9@20"(i64 %a, i8 signext %b, i8
-// signext %c, i16 signext %d)
+// CHECK: define x86_fastcallcc void @"\01@f9@20"(i64 %a, i8 signext %b, i8 signext %c, i16 signext %d)
+// X64: define void @f9(
 
 void f12(void) {}
 // CHECK: define void @f12(
+// X64: define void @f12(
+
+void __vectorcall v1(void) {}
+// CHECK: define x86_vectorcallcc void @"\01v1@@0"(
+// X64: define x86_vectorcallcc void @"\01v1@@0"(
+
+void __vectorcall v2(char a) {}
+// CHECK: define x86_vectorcallcc void @"\01v2@@4"(
+// X64: define x86_vectorcallcc void @"\01v2@@8"(
+
+void __vectorcall v3(short a) {}
+// CHECK: define x86_vectorcallcc void @"\01v3@@4"(
+// X64: define x86_vectorcallcc void @"\01v3@@8"(
+
+void __vectorcall v4(int a) {}
+// CHECK: define x86_vectorcallcc void @"\01v4@@4"(
+// X64: define x86_vectorcallcc void @"\01v4@@8"(
+
+void __vectorcall v5(long long a) {}
+// CHECK: define x86_vectorcallcc void @"\01v5@@8"(
+// X64: define x86_vectorcallcc void @"\01v5@@8"(
+
+void __vectorcall v6(char a, char b) {}
+// CHECK: define x86_vectorcallcc void @"\01v6@@8"(
+// X64: define x86_vectorcallcc void @"\01v6@@16"(
Index: cfe/trunk/test/CodeGen/microsoft-call-conv.c
===================================================================
--- cfe/trunk/test/CodeGen/microsoft-call-conv.c
+++ cfe/trunk/test/CodeGen/microsoft-call-conv.c
@@ -20,9 +20,8 @@
   f3();
   // CHECK: call x86_thiscallcc void @f3()
 }
-// FIXME: Add this to LLVM.
 void __vectorcall f61(void) {
-// CHECK-LABEL: define void @f61()
+// CHECK-LABEL: define x86_vectorcallcc void @f61()
   f3();
   // CHECK: call x86_thiscallcc void @f3()
 }
@@ -41,7 +40,7 @@
   // CHECK: call x86_fastcallcc void @f4()
   // CHECK: call x86_stdcallcc void @f5()
   // CHECK: call x86_thiscallcc void @f6()
-  // CHECK: call void @f61()
+  // CHECK: call x86_vectorcallcc void @f61()
   pf1(); pf2(); pf3(); pf4(); pf5(); pf6(); pf7();
   // CHECK: call x86_fastcallcc void %{{.*}}()
   // CHECK: call x86_stdcallcc void %{{.*}}()
@@ -49,7 +48,7 @@
   // CHECK: call x86_fastcallcc void %{{.*}}()
   // CHECK: call x86_stdcallcc void %{{.*}}()
   // CHECK: call x86_thiscallcc void %{{.*}}()
-  // CHECK: call void %{{.*}}()
+  // CHECK: call x86_vectorcallcc void %{{.*}}()
   return 0;
 }
Index: cfe/trunk/test/CodeGen/vectorcall.c
===================================================================
--- cfe/trunk/test/CodeGen/vectorcall.c
+++ cfe/trunk/test/CodeGen/vectorcall.c
@@ -0,0 +1,77 @@
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=i386-pc-win32 | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -o - -triple=x86_64-pc-win32 | FileCheck %s --check-prefix=X64
+
+void __vectorcall v1(int a, int b) {}
+// CHECK: define x86_vectorcallcc void @"\01v1@@8"(i32 inreg %a, i32 inreg %b)
+// X64: define x86_vectorcallcc void @"\01v1@@16"(i32 %a, i32 %b)
+
+void __vectorcall v2(char a, char b) {}
+// CHECK: define x86_vectorcallcc void @"\01v2@@8"(i8 inreg signext %a, i8 inreg signext %b)
+// X64: define x86_vectorcallcc void @"\01v2@@16"(i8 %a, i8 %b)
+
+struct Small { int a; };
+void __vectorcall v3(int a, struct Small b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01v3@@12"(i32 inreg %a, %struct.Small* byval align 4 %b, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01v3@@24"(i32 %a, i32 %b.coerce, i32 %c)
+
+struct Large { int a[5]; };
+void __vectorcall v4(int a, struct Large b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01v4@@28"(i32 inreg %a, %struct.Large* byval align 4 %b, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01v4@@40"(i32 %a, %struct.Large* %b, i32 %c)
+
+struct HFA2 { double x, y; };
+struct HFA4 { double w, x, y, z; };
+struct HFA5 { double v, w, x, y, z; };
+
+void __vectorcall hfa1(int a, struct HFA4 b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa1@@40"(i32 inreg %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01hfa1@@48"(i32 %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 %c)
+
+// HFAs that would require more than six total SSE registers are passed
+// indirectly. Additional vector arguments can consume the rest of the SSE
+// registers.
+void __vectorcall hfa2(struct HFA4 a, struct HFA4 b, double c) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa2@@72"(double %a.0, double %a.1, double %a.2, double %a.3, %struct.HFA4* inreg %b, double %c)
+// X64: define x86_vectorcallcc void @"\01hfa2@@72"(double %a.0, double %a.1, double %a.2, double %a.3, %struct.HFA4* align 8 %b, double %c)
+
+// Ensure that we pass builtin types directly while counting them against the
+// SSE register usage.
+void __vectorcall hfa3(double a, double b, double c, double d, double e, struct HFA2 f) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* inreg %f)
+// X64: define x86_vectorcallcc void @"\01hfa3@@56"(double %a, double %b, double %c, double %d, double %e, %struct.HFA2* align 8 %f)
+
+// Aggregates with more than four elements are not HFAs and are passed byval.
+// Because they are not classified as homogeneous, they don't get special
+// handling to ensure alignment.
+void __vectorcall hfa4(struct HFA5 a) {}
+// CHECK: define x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* byval align 4)
+// X64: define x86_vectorcallcc void @"\01hfa4@@40"(%struct.HFA5* %a)
+
+// Return HFAs of 4 or fewer elements in registers.
+static struct HFA2 g_hfa2;
+struct HFA2 __vectorcall hfa5(void) { return g_hfa2; }
+// CHECK: define x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"()
+// X64: define x86_vectorcallcc %struct.HFA2 @"\01hfa5@@0"()
+
+typedef float __attribute__((vector_size(16))) v4f32;
+struct HVA2 { v4f32 x, y; };
+struct HVA4 { v4f32 w, x, y, z; };
+
+void __vectorcall hva1(int a, struct HVA4 b, int c) {}
+// CHECK: define x86_vectorcallcc void @"\01hva1@@72"(i32 inreg %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 inreg %c)
+// X64: define x86_vectorcallcc void @"\01hva1@@80"(i32 %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 %c)
+
+void __vectorcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {}
+// CHECK: define x86_vectorcallcc void @"\01hva2@@144"(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, %struct.HVA4* inreg %b, <4 x float> %c)
+// X64: define x86_vectorcallcc void @"\01hva2@@144"(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, %struct.HVA4* align 16 %b, <4 x float> %c)
+
+void __vectorcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {}
+// CHECK: define x86_vectorcallcc void @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* inreg %f)
+// X64: define x86_vectorcallcc void @"\01hva3@@112"(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, <4 x float> %e, %struct.HVA2* align 16 %f)
+
+typedef float __attribute__((ext_vector_type(3))) v3f32;
+struct OddSizeHVA { v3f32 x, y; };
+
+void __vectorcall odd_size_hva(struct OddSizeHVA a) {}
+// CHECK: define x86_vectorcallcc void @"\01odd_size_hva@@32"(<3 x float> %a.0, <3 x float> %a.1)
+// X64: define x86_vectorcallcc void @"\01odd_size_hva@@32"(<3 x float> %a.0, <3 x float> %a.1)
Index: cfe/trunk/test/CodeGenCXX/homogeneous-aggregates.cpp
===================================================================
--- cfe/trunk/test/CodeGenCXX/homogeneous-aggregates.cpp
+++ cfe/trunk/test/CodeGenCXX/homogeneous-aggregates.cpp
@@ -1,6 +1,13 @@
-// RUNxX: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC
+// RUN: %clang_cc1 -triple powerpc64le-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=PPC
 // RUN: %clang_cc1 -mfloat-abi hard -triple armv7-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=ARM32
 // RUN: %clang_cc1 -mfloat-abi hard -triple aarch64-unknown-linux-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=ARM64
+// RUN: %clang_cc1 -mfloat-abi hard -triple x86_64-unknown-windows-gnu -emit-llvm -o - %s | FileCheck %s --check-prefix=X64
+
+#if defined(__x86_64__)
+#define CC __attribute__((vectorcall))
+#else
+#define CC
+#endif
 
 // Test that C++ classes are correctly classified as homogeneous aggregates.
 
@@ -34,24 +41,26 @@
 // PPC: define void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, [3 x i64] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, { [3 x i64] } %x.coerce)
 // ARM64: define void @_Z7func_D12D1(%struct.D1* noalias sret %agg.result, %struct.D1* %x)
-D1 func_D1(D1 x) { return x; }
+// X64: define x86_vectorcallcc void @"\01_Z7func_D12D1@@24"(%struct.D1* noalias sret %agg.result, %struct.D1* %x)
+D1 CC func_D1(D1 x) { return x; }
 
 // PPC: define [3 x double] @_Z7func_D22D2([3 x double] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc %struct.D2 @_Z7func_D22D2(%struct.D2 %x.coerce)
 // ARM64: define %struct.D2 @_Z7func_D22D2(double %x.0, double %x.1, double %x.2)
-D2 func_D2(D2 x) { return x; }
+// X64: define x86_vectorcallcc %struct.D2 @"\01_Z7func_D22D2@@24"(double %x.0, double %x.1, double %x.2)
+D2 CC func_D2(D2 x) { return x; }
 
 // PPC: define void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, [4 x i64] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, { [4 x i64] } %x.coerce)
 // ARM64: define void @_Z7func_D32D3(%struct.D3* noalias sret %agg.result, %struct.D3* %x)
-D3 func_D3(D3 x) { return x; }
+D3 CC func_D3(D3 x) { return x; }
 
 // PPC: define [4 x double] @_Z7func_D42D4([4 x double] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc %struct.D4 @_Z7func_D42D4(%struct.D4 %x.coerce)
 // ARM64: define %struct.D4 @_Z7func_D42D4(double %x.0, double %x.1, double %x.2, double %x.3)
-D4 func_D4(D4 x) { return x; }
+D4 CC func_D4(D4 x) { return x; }
 
-D5 func_D5(D5 x) { return x; }
+D5 CC func_D5(D5 x) { return x; }
 // PPC: define [3 x double] @_Z7func_D52D5([3 x double] %x.coerce)
 // ARM32: define arm_aapcs_vfpcc %struct.D5 @_Z7func_D52D5(%struct.D5 %x.coerce)
@@ -92,3 +101,27 @@
 // ARM64: getelementptr inbounds %struct.Base2* %{{.*}}, i32 0, i32 0
 // ARM64: load double*
 // ARM64: call %struct.D5 @_Z7func_D52D5(double %{{.*}}, double %{{.*}}, double %{{.*}})
+
+struct Empty { };
+struct Float1 { float x; };
+struct Float2 { float y; };
+struct HVAWithEmptyBase : Float1, Empty, Float2 { float z; };
+
+// PPC: define void @_Z15with_empty_base16HVAWithEmptyBase([3 x float] %a.coerce)
+// ARM64: define void @_Z15with_empty_base16HVAWithEmptyBase(float %a.0, float %a.1, float %a.2)
+// ARM32: define arm_aapcs_vfpcc void @_Z15with_empty_base16HVAWithEmptyBase(%struct.HVAWithEmptyBase %a.coerce)
+void CC with_empty_base(HVAWithEmptyBase a) {}
+
+// FIXME: MSVC doesn't consider this an HVA because of the empty base.
+// X64: define x86_vectorcallcc void @"\01_Z15with_empty_base16HVAWithEmptyBase@@16"(float %a.0, float %a.1, float %a.2)
+
+struct HVAWithEmptyBitField : Float1, Float2 {
+  int : 0;  // Takes no space.
+  float z;
+};
+
+// PPC: define void @_Z19with_empty_bitfield20HVAWithEmptyBitField([3 x float] %a.coerce)
+// ARM64: define void @_Z19with_empty_bitfield20HVAWithEmptyBitField(float %a.0, float %a.1, float %a.2)
+// ARM32: define arm_aapcs_vfpcc void @_Z19with_empty_bitfield20HVAWithEmptyBitField(%struct.HVAWithEmptyBitField %a.coerce)
+// X64: define x86_vectorcallcc void @"\01_Z19with_empty_bitfield20HVAWithEmptyBitField@@16"(float %a.0, float %a.1, float %a.2)
+void CC with_empty_bitfield(HVAWithEmptyBitField a) {}
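
For orientation only, and not part of the patch itself: a minimal caller-side sketch of the convention the new tests exercise. The type and function names below are hypothetical; they simply mirror the HVA shapes covered by test/CodeGen/vectorcall.c.

    /* Hypothetical illustration of __vectorcall HVA passing; not from the patch. */
    #include <xmmintrin.h>

    /* A homogeneous vector aggregate (HVA): at most four members of one vector type. */
    struct MyHva2 { __m128 x, y; };

    /* Under __vectorcall the HVA's members are expanded into XMM argument
       registers while enough of the six SSE argument registers remain;
       otherwise the aggregate is passed indirectly, matching the FreeSSERegs
       accounting added to TargetInfo.cpp above. */
    void __vectorcall scale_hva(struct MyHva2 v, float s);

    void caller(struct MyHva2 v) {
      scale_hva(v, 2.0f);
    }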