diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3021,6 +3021,9 @@
 def err_attribute_too_few_arguments : Error<
   "%0 attribute takes at least %1 argument%s1">;
 def err_attribute_invalid_vector_type : Error<"invalid vector element type %0">;
+def err_attribute_invalid_bitint_vector_type : Error<
+  "'_BitInt' vector element width must be %select{a power of 2|"
+  "at least as wide as 'CHAR_BIT'}0">;
 def err_attribute_invalid_matrix_type : Error<"invalid matrix element type %0">;
 def err_attribute_bad_neon_vector_size : Error<
   "Neon vector size must be 64 or 128 bits">;
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -4019,7 +4019,11 @@
 /// the specified element type and size. VectorType must be a built-in type.
 QualType ASTContext::getVectorType(QualType vecType, unsigned NumElts,
                                    VectorType::VectorKind VecKind) const {
-  assert(vecType->isBuiltinType());
+  assert(vecType->isBuiltinType() ||
+         (vecType->isBitIntType() &&
+          // Only support _BitInt elements with byte-sized power of 2 NumBits.
+          llvm::isPowerOf2_32(vecType->getAs<BitIntType>()->getNumBits()) &&
+          vecType->getAs<BitIntType>()->getNumBits() >= 8));
 
   // Check if we've already instantiated a vector of this type.
   llvm::FoldingSetNodeID ID;
@@ -4087,9 +4091,13 @@
 
 /// getExtVectorType - Return the unique reference to an extended vector type of
 /// the specified element type and size. VectorType must be a built-in type.
-QualType
-ASTContext::getExtVectorType(QualType vecType, unsigned NumElts) const {
-  assert(vecType->isBuiltinType() || vecType->isDependentType());
+QualType ASTContext::getExtVectorType(QualType vecType,
+                                      unsigned NumElts) const {
+  assert(vecType->isBuiltinType() || vecType->isDependentType() ||
+         (vecType->isBitIntType() &&
+          // Only support _BitInt elements with byte-sized power of 2 NumBits.
+          llvm::isPowerOf2_32(vecType->getAs<BitIntType>()->getNumBits()) &&
+          vecType->getAs<BitIntType>()->getNumBits() >= 8));
 
   // Check if we've already instantiated a vector of this type.
   llvm::FoldingSetNodeID ID;
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -3073,14 +3073,17 @@
 void MicrosoftCXXNameMangler::mangleType(const VectorType *T, Qualifiers Quals,
                                          SourceRange Range) {
-  const BuiltinType *ET = T->getElementType()->getAs<BuiltinType>();
-  assert(ET && "vectors with non-builtin elements are unsupported");
+  QualType EltTy = T->getElementType();
+  const BuiltinType *ET = EltTy->getAs<BuiltinType>();
+  const BitIntType *BitIntTy = EltTy->getAs<BitIntType>();
+  assert((ET || BitIntTy) &&
+         "vectors with non-builtin/_BitInt elements are unsupported");
   uint64_t Width = getASTContext().getTypeSize(T);
   // Pattern match exactly the typedefs in our intrinsic headers. Anything that
   // doesn't match the Intel types uses a custom mangling below.
 
   size_t OutSizeBefore = Out.tell();
   if (!isa<ExtVectorType>(T)) {
-    if (getASTContext().getTargetInfo().getTriple().isX86()) {
+    if (getASTContext().getTargetInfo().getTriple().isX86() && ET) {
       if (Width == 64 && ET->getKind() == BuiltinType::LongLong) {
         mangleArtificialTagType(TTK_Union, "__m64");
       } else if (Width >= 128) {
@@ -3105,7 +3108,8 @@
     MicrosoftCXXNameMangler Extra(Context, Stream);
     Stream << "?$";
     Extra.mangleSourceName("__vector");
-    Extra.mangleType(QualType(ET, 0), Range, QMM_Escape);
+    Extra.mangleType(QualType(ET ? static_cast<const Type *>(ET) : BitIntTy, 0),
+                     Range, QMM_Escape);
     Extra.mangleIntegerLiteral(llvm::APSInt::getUnsigned(T->getNumElements()));
 
     mangleArtificialTagType(TTK_Union, TemplateMangling, {"__clang"});
diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp
--- a/clang/lib/Sema/SemaType.cpp
+++ b/clang/lib/Sema/SemaType.cpp
@@ -2636,11 +2636,21 @@
   // can't already be a vector.
   if ((!CurType->isDependentType() &&
        (!CurType->isBuiltinType() || CurType->isBooleanType() ||
-        (!CurType->isIntegerType() && !CurType->isRealFloatingType()))) ||
+        (!CurType->isIntegerType() && !CurType->isRealFloatingType())) &&
+       !CurType->isBitIntType()) ||
       CurType->isArrayType()) {
     Diag(AttrLoc, diag::err_attribute_invalid_vector_type) << CurType;
     return QualType();
   }
+  // Only support _BitInt elements with byte-sized power of 2 NumBits.
+  if (CurType->isBitIntType()) {
+    unsigned NumBits = CurType->getAs<BitIntType>()->getNumBits();
+    if (!llvm::isPowerOf2_32(NumBits) || NumBits < 8) {
+      Diag(AttrLoc, diag::err_attribute_invalid_bitint_vector_type)
+          << (NumBits < 8);
+      return QualType();
+    }
+  }
 
   if (SizeExpr->isTypeDependent() || SizeExpr->isValueDependent())
     return Context.getDependentVectorType(CurType, SizeExpr, AttrLoc,
@@ -2706,12 +2716,22 @@
   // We explictly allow bool elements in ext_vector_type for C/C++.
   bool IsNoBoolVecLang = getLangOpts().OpenCL || getLangOpts().OpenCLCPlusPlus;
   if ((!T->isDependentType() && !T->isIntegerType() &&
-       !T->isRealFloatingType()) || T->isBitIntType() ||
+       !T->isRealFloatingType()) ||
       (IsNoBoolVecLang && T->isBooleanType())) {
     Diag(AttrLoc, diag::err_attribute_invalid_vector_type) << T;
     return QualType();
   }
 
+  // Only support _BitInt elements with byte-sized power of 2 NumBits.
+  if (T->isBitIntType()) {
+    unsigned NumBits = T->getAs<BitIntType>()->getNumBits();
+    if (!llvm::isPowerOf2_32(NumBits) || NumBits < 8) {
+      Diag(AttrLoc, diag::err_attribute_invalid_bitint_vector_type)
+          << (NumBits < 8);
+      return QualType();
+    }
+  }
+
   if (!ArraySize->isTypeDependent() && !ArraySize->isValueDependent()) {
     Optional<llvm::APSInt> vecSize = ArraySize->getIntegerConstantExpr(Context);
     if (!vecSize) {
diff --git a/clang/test/CodeGenCXX/ext-int-vector-abi.cpp b/clang/test/CodeGenCXX/ext-int-vector-abi.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCXX/ext-int-vector-abi.cpp
@@ -0,0 +1,113 @@
+// RUN: %clang_cc1 -no-opaque-pointers -no-enable-noundef-analysis -triple x86_64-gnu-linux -O3 -disable-llvm-passes -I%S -emit-llvm -o - %s | FileCheck %s --check-prefixes=LIN64
+// RUN: %clang_cc1 -no-opaque-pointers -no-enable-noundef-analysis -triple i386-gnu-linux -O3 -disable-llvm-passes -I%S -emit-llvm -o - %s | FileCheck %s --check-prefixes=LIN32
+
+// RUN: %clang_cc1 -no-opaque-pointers -no-enable-noundef-analysis -triple x86_64-windows-pc -O3 -disable-llvm-passes -I%S -emit-llvm -o - %s | FileCheck %s --check-prefixes=WIN64
+// RUN: %clang_cc1 -no-opaque-pointers -no-enable-noundef-analysis -triple i386-windows-pc -O3 -disable-llvm-passes -I%S -emit-llvm -o - %s | FileCheck %s --check-prefixes=WIN32
+
+// Make sure _BitInt vectors match the builtin int vector ABI.
+
+using int8_t3 = _BitInt(8) __attribute__((ext_vector_type(3)));
+int8_t3 ManglingTestRetParam(int8_t3 Param) {
+// LIN64: define{{.*}} i32 @_Z20ManglingTestRetParamDv3_DB8_(i32 %
+// LIN32: define{{.*}} <3 x i8> @_Z20ManglingTestRetParamDv3_DB8_(<3 x i8> %
+// WIN64: define dso_local <3 x i8> @"?ManglingTestRetParam@@YAT?$__vector@U?$_BitInt@$07@__clang@@$02@__clang@@T12@@Z"(<3 x i8> %
+// WIN32: define dso_local <3 x i8> @"?ManglingTestRetParam@@YAT?$__vector@U?$_BitInt@$07@__clang@@$02@__clang@@T12@@Z"(<3 x i8> inreg %
+  return Param;
+}
+using int8_t3c = char __attribute__((ext_vector_type(3)));
+int8_t3c ManglingTestRetParam(int8_t3c Param) {
+// LIN64: define{{.*}} i32 @_Z20ManglingTestRetParamDv3_c(i32 %
+// LIN32: define{{.*}} <3 x i8> @_Z20ManglingTestRetParamDv3_c(<3 x i8> %
+// WIN64: define dso_local <3 x i8> @"?ManglingTestRetParam@@YAT?$__vector@D$02@__clang@@T12@@Z"(<3 x i8> %
+// WIN32: define dso_local <3 x i8> @"?ManglingTestRetParam@@YAT?$__vector@D$02@__clang@@T12@@Z"(<3 x i8> inreg %
+  return Param;
+}
+
+typedef unsigned _BitInt(16) uint16_t4 __attribute__((ext_vector_type(4)));
+uint16_t4 ManglingTestRetParam(uint16_t4 Param) {
+// LIN64: define{{.*}} double @_Z20ManglingTestRetParamDv4_DU16_(double %
+// LIN32: define{{.*}} <4 x i16> @_Z20ManglingTestRetParamDv4_DU16_(i64 %
+// WIN64: define dso_local <4 x i16> @"?ManglingTestRetParam@@YAT?$__vector@U?$_UBitInt@$0BA@@__clang@@$03@__clang@@T12@@Z"(<4 x i16> %
+// WIN32: define dso_local <4 x i16> @"?ManglingTestRetParam@@YAT?$__vector@U?$_UBitInt@$0BA@@__clang@@$03@__clang@@T12@@Z"(<4 x i16> inreg %
+  return Param;
+}
+
+typedef unsigned short uint16_t4s __attribute__((ext_vector_type(4)));
+uint16_t4s ManglingTestRetParam(uint16_t4s Param) {
+// LIN64: define{{.*}} double @_Z20ManglingTestRetParamDv4_t(double %
+// LIN32: define{{.*}} <4 x i16> @_Z20ManglingTestRetParamDv4_t(i64 %
+// WIN64: define dso_local <4 x i16> @"?ManglingTestRetParam@@YAT?$__vector@G$03@__clang@@T12@@Z"(<4 x i16> %
+// WIN32: define dso_local <4 x i16> @"?ManglingTestRetParam@@YAT?$__vector@G$03@__clang@@T12@@Z"(<4 x i16> inreg %
+  return Param;
+}
+
+typedef unsigned _BitInt(32) uint32_t4 __attribute__((ext_vector_type(4)));
+uint32_t4 ManglingTestRetParam(uint32_t4 Param) {
+// LIN64: define{{.*}} <4 x i32> @_Z20ManglingTestRetParamDv4_DU32_(<4 x i32> %
+// LIN32: define{{.*}} <4 x i32> @_Z20ManglingTestRetParamDv4_DU32_(<4 x i32> %
+// WIN64: define dso_local <4 x i32> @"?ManglingTestRetParam@@YAT?$__vector@U?$_UBitInt@$0CA@@__clang@@$03@__clang@@T12@@Z"(<4 x i32> %
+// WIN32: define dso_local <4 x i32> @"?ManglingTestRetParam@@YAT?$__vector@U?$_UBitInt@$0CA@@__clang@@$03@__clang@@T12@@Z"(<4 x i32> inreg %
+  return Param;
+}
+
+typedef unsigned int uint32_t4s __attribute__((ext_vector_type(4)));
+uint32_t4s ManglingTestRetParam(uint32_t4s Param) {
+// LIN64: define{{.*}} <4 x i32> @_Z20ManglingTestRetParamDv4_j(<4 x i32> %
+// LIN32: define{{.*}} <4 x i32> @_Z20ManglingTestRetParamDv4_j(<4 x i32> %
+// WIN64: define dso_local <4 x i32> @"?ManglingTestRetParam@@YAT?$__vector@I$03@__clang@@T12@@Z"(<4 x i32> %
+// WIN32: define dso_local <4 x i32> @"?ManglingTestRetParam@@YAT?$__vector@I$03@__clang@@T12@@Z"(<4 x i32> inreg %
+  return Param;
+}
+
+typedef unsigned _BitInt(64) uint64_t4 __attribute__((ext_vector_type(4)));
+uint64_t4 ManglingTestRetParam(uint64_t4 Param) {
+// LIN64: define{{.*}} <4 x i64> @_Z20ManglingTestRetParamDv4_DU64_(<4 x i64>* byval(<4 x i64>) align 32 %
+// LIN32: define{{.*}} <4 x i64> @_Z20ManglingTestRetParamDv4_DU64_(<4 x i64> %
+// WIN64: define dso_local <4 x i64> @"?ManglingTestRetParam@@YAT?$__vector@U?$_UBitInt@$0EA@@__clang@@$03@__clang@@T12@@Z"(<4 x i64> %
+// WIN32: define dso_local <4 x i64> @"?ManglingTestRetParam@@YAT?$__vector@U?$_UBitInt@$0EA@@__clang@@$03@__clang@@T12@@Z"(<4 x i64> inreg %
+  return Param;
+}
+
+typedef unsigned long long uint64_t4s __attribute__((ext_vector_type(4)));
+uint64_t4s ManglingTestRetParam(uint64_t4s Param) {
+// LIN64: define{{.*}} <4 x i64> @_Z20ManglingTestRetParamDv4_y(<4 x i64>* byval(<4 x i64>) align 32 %
+// LIN32: define{{.*}} <4 x i64> @_Z20ManglingTestRetParamDv4_y(<4 x i64> %
+// WIN64: define dso_local <4 x i64> @"?ManglingTestRetParam@@YAT?$__vector@_K$03@__clang@@T12@@Z"(<4 x i64> %
+// WIN32: define dso_local <4 x i64> @"?ManglingTestRetParam@@YAT?$__vector@_K$03@__clang@@T12@@Z"(<4 x i64> inreg %
+  return Param;
+}
+
+typedef _BitInt(32) vint32_t8 __attribute__((vector_size(32)));
+vint32_t8 ManglingTestRetParam(vint32_t8 Param) {
+// LIN64: define{{.*}} <8 x i32> @_Z20ManglingTestRetParamDv8_DB32_(<8 x i32>* byval(<8 x i32>) align 32 %
+// LIN32: define{{.*}} <8 x i32> @_Z20ManglingTestRetParamDv8_DB32_(<8 x i32> %
+// WIN64: define dso_local <8 x i32> @"?ManglingTestRetParam@@YA?AT?$__vector@U?$_BitInt@$0CA@@__clang@@$07@__clang@@T12@@Z"(<8 x i32> %
+// WIN32: define dso_local <8 x i32> @"?ManglingTestRetParam@@YA?AT?$__vector@U?$_BitInt@$0CA@@__clang@@$07@__clang@@T12@@Z"(<8 x i32> inreg %
+  return Param;
+}
+
+typedef int vint32_t8i __attribute__((vector_size(32)));
+vint32_t8i ManglingTestRetParam(vint32_t8i Param) {
+// LIN64: define{{.*}} <8 x i32> @_Z20ManglingTestRetParamDv8_i(<8 x i32>* byval(<8 x i32>) align 32 %
+// LIN32: define{{.*}} <8 x i32> @_Z20ManglingTestRetParamDv8_i(<8 x i32> %
+// WIN64: define dso_local <8 x i32> @"?ManglingTestRetParam@@YA?AT?$__vector@H$07@__clang@@T12@@Z"(<8 x i32> %
+// WIN32: define dso_local <8 x i32> @"?ManglingTestRetParam@@YA?AT?$__vector@H$07@__clang@@T12@@Z"(<8 x i32> inreg %
+  return Param;
+}
+
+typedef unsigned _BitInt(64) uvint64_t16 __attribute__((vector_size(16)));
+uvint64_t16 ManglingTestRetParam(uvint64_t16 Param) {
+// LIN64: define{{.*}} <2 x i64> @_Z20ManglingTestRetParamDv2_DU64_(<2 x i64> %
+// LIN32: define{{.*}} <2 x i64> @_Z20ManglingTestRetParamDv2_DU64_(<2 x i64> %
+// WIN64: define dso_local <2 x i64> @"?ManglingTestRetParam@@YA?AT?$__vector@U?$_UBitInt@$0EA@@__clang@@$01@__clang@@T12@@Z"(<2 x i64> %
+// WIN32: define dso_local <2 x i64> @"?ManglingTestRetParam@@YA?AT?$__vector@U?$_UBitInt@$0EA@@__clang@@$01@__clang@@T12@@Z"(<2 x i64> inreg %
+  return Param;
+}
+using uvint64_t16l = unsigned long long __attribute__((vector_size(16)));
+uvint64_t16l ManglingTestRetParam(uvint64_t16l Param) {
+// LIN64: define{{.*}} <2 x i64> @_Z20ManglingTestRetParamDv2_y(<2 x i64> %
+// LIN32: define{{.*}} <2 x i64> @_Z20ManglingTestRetParamDv2_y(<2 x i64> %
+// WIN64: define dso_local <2 x i64> @"?ManglingTestRetParam@@YAT?$__vector@_K$01@__clang@@T12@@Z"(<2 x i64> %
+// WIN32: define dso_local <2 x i64> @"?ManglingTestRetParam@@YAT?$__vector@_K$01@__clang@@T12@@Z"(<2 x i64> inreg %
+  return Param;
+}
diff --git a/clang/test/CodeGenCXX/ext-int.cpp b/clang/test/CodeGenCXX/ext-int.cpp
--- a/clang/test/CodeGenCXX/ext-int.cpp
+++ b/clang/test/CodeGenCXX/ext-int.cpp
@@ -129,6 +129,9 @@
   return 0;
 }
 
+typedef unsigned _BitInt(16) uint16_t4 __attribute__((ext_vector_type(4)));
+typedef _BitInt(32) vint32_t8 __attribute__((vector_size(32)));
+
 template<typename T>
 void ManglingTestTemplateParam(T&);
 template<_BitInt(99) T>
@@ -136,7 +139,6 @@
 template<int N>
 auto ManglingDependent() -> decltype(_BitInt(N){});
 
-
 void ManglingInstantiator() {
   // LIN: define{{.*}} void @_Z20ManglingInstantiatorv()
   // WIN: define dso_local void @"?ManglingInstantiator@@YAXXZ"()
@@ -156,6 +158,12 @@
   // LIN: call signext i4 @_Z17ManglingDependentILi4EEDTtlDBT__EEv()
   // WIN64: call i4 @"??$ManglingDependent@$03@@YAU?$_BitInt@$03@__clang@@XZ"()
   // WIN32: call signext i4 @"??$ManglingDependent@$03@@YAU?$_BitInt@$03@__clang@@XZ"()
+  uint16_t4 V;
+  ManglingTestTemplateParam(V);
+  // LIN: call void @_Z25ManglingTestTemplateParamIDv4_DU16_EvRT_(<4 x i16>*
+  // WIN64: call void @"??$ManglingTestTemplateParam@T?$__vector@U?$_UBitInt@$0BA@@__clang@@$03@__clang@@@@YAXAEAT?$__vector@U?$_UBitInt@$0BA@@__clang@@$03@__clang@@@Z"(<4 x i16>*
+  // WIN32: call void @"??$ManglingTestTemplateParam@T?$__vector@U?$_UBitInt@$0BA@@__clang@@$03@__clang@@@@YAXAAT?$__vector@U?$_UBitInt@$0BA@@__clang@@$03@__clang@@@Z"(<4 x i16>*
+
 }
 
 void TakesVarargs(int i, ...) {
@@ -272,6 +280,65 @@
   // WIN32: %[[LOADV3:.+]] = load i16, i16* %[[BC3]]
   // WIN32: store i16 %[[LOADV3]], i16*
 
+  uint16_t4 D = __builtin_va_arg(args, uint16_t4);
+  // LIN64: %[[AD4:.+]] = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %[[ARGS]]
+  // LIN64: %[[OFA_P4:.+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %[[AD4]], i32 0, i32 1
+  // LIN64: %[[GPOFFSET:.+]] = load i32, i32* %[[OFA_P4]]
+  // LIN64: %[[FITSINGP:.+]] = icmp ule i32 %[[GPOFFSET]], 160
+  // LIN64: br i1 %[[FITSINGP]]
+  // LIN64: %[[BC4:.+]] = phi <4 x i16>*
+  // LIN64: %[[LOADV4:.+]] = load <4 x i16>, <4 x i16>* %[[BC4]]
+  // LIN64: store <4 x i16> %[[LOADV4]], <4 x i16>*
+
+  // LIN32: %[[CUR4:.+]] = load i8*, i8** %[[ARGS]]
+  // LIN32: %[[NEXT4:.+]] = getelementptr inbounds i8, i8* %[[CUR4]], i32 8
+  // LIN32: store i8* %[[NEXT4]], i8** %[[ARGS]]
+  // LIN32: %[[BC4:.+]] = bitcast i8* %[[CUR4]] to <4 x i16>*
+  // LIN32: %[[LOADV4:.+]] = load <4 x i16>, <4 x i16>* %[[BC4]]
+  // LIN32: store <4 x i16> %[[LOADV4]], <4 x i16>* %
+
+  // WIN: %[[CUR4:.+]] = load i8*, i8** %[[ARGS]]
+  // WIN64: %[[NEXT4:.+]] = getelementptr inbounds i8, i8* %[[CUR4]], i64 8
+  // WIN32: %[[NEXT4:.+]] = getelementptr inbounds i8, i8* %[[CUR4]], i32 8
+  // WIN: store i8* %[[NEXT4]], i8** %[[ARGS]]
+  // WIN: %[[BC4:.+]] = bitcast i8* %[[CUR4]] to <4 x i16>*
+  // WIN: %[[LOADV4:.+]] = load <4 x i16>, <4 x i16>* %[[BC4]]
+  // WIN: store <4 x i16> %[[LOADV4]], <4 x i16>*
+
+  vint32_t8 E = __builtin_va_arg(args, vint32_t8);
+  // LIN64: %[[AD5:.+]] = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %[[ARGS]]
+  // LIN64: %[[OFAA_P4:.+]] = getelementptr inbounds %struct.__va_list_tag, %struct.__va_list_tag* %[[AD5]], i32 0, i32 2
+  // LIN64: %[[OFAA:.+]] = load i8*, i8** %[[OFAA_P4]]
+  // LIN64: %[[TOINT:.+]] = ptrtoint i8* %[[OFAA]] to i64
+  // LIN64: %[[ADD:.+]] = add i64 %[[TOINT]], 31
+  // LIN64: %[[AND:.+]] = and i64 %[[ADD]], -32
+  // LIN64: %[[OFAA_ALIGNED:.+]] = inttoptr i64 %[[AND]] to i8*
+  // LIN64: %[[BC5:.+]] = bitcast i8* %[[OFAA_ALIGNED]] to <8 x i32>*
+  // LIN64: %[[LOADV5:.+]] = load <8 x i32>, <8 x i32>* %[[BC5]]
+  // LIN64: store <8 x i32> %[[LOADV5]], <8 x i32>*
+
+  // LIN32: %[[CUR5:.+]] = load i8*, i8** %[[ARGS]]
+  // LIN32: %[[TOINT:.+]] = ptrtoint i8* %[[CUR5]] to i32
+  // LIN32: %[[ADD:.+]] = add i32 %[[TOINT]], 31
+  // LIN32: %[[AND:.+]] = and i32 %[[ADD]], -32
+  // LIN32: %[[CUR5_ALIGNED:.+]] = inttoptr i32 %[[AND]] to i8*
+  // LIN32: %[[NEXT5:.+]] = getelementptr inbounds i8, i8* %[[CUR5_ALIGNED]], i32 32
+  // LIN32: store i8* %[[NEXT5]], i8** %[[ARGS]]
+  // LIN32: %[[LOADP5:.+]] = bitcast i8* %[[CUR5_ALIGNED]] to <8 x i32>*
+  // LIN32: %[[LOADV5:.+]] = load <8 x i32>, <8 x i32>* %[[LOADP5]]
+  // LIN32: store <8 x i32> %[[LOADV5]], <8 x i32>*
+
+  // WIN: %[[CUR5:.+]] = load i8*, i8** %[[ARGS]]
+  // WIN64: %[[NEXT5:.+]] = getelementptr inbounds i8, i8* %[[CUR5]], i64 8
+  // WIN32: %[[NEXT5:.+]] = getelementptr inbounds i8, i8* %[[CUR5]], i32 32
+  // WIN: store i8* %[[NEXT5]], i8** %[[ARGS]]
+  // WIN64: %[[BC5:.+]] = bitcast i8* %[[CUR5]] to <8 x i32>**
+  // WIN64: %[[LOADP5:.+]] = load <8 x i32>*, <8 x i32>** %[[BC5]]
+  // WIN64: %[[LOADV5:.+]] = load <8 x i32>, <8 x i32>* %[[LOADP5]]
+  // WIN32: %[[BC5:.+]] = bitcast i8* %argp.cur8 to <8 x i32>*
+  // WIN32: %[[LOADV5:.+]] = load <8 x i32>, <8 x i32>* %[[BC5]]
+  // WIN: store <8 x i32> %[[LOADV5]], <8 x i32>*
+
   __builtin_va_end(args);
   // LIN64: %[[ENDAD:.+]] = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %[[ARGS]]
   // LIN64: %[[ENDAD1:.+]] = bitcast %struct.__va_list_tag* %[[ENDAD]] to i8*
@@ -318,6 +385,16 @@
   // LIN32: call void @_ZNSt9type_infoC1ERKS_(%"class.std::type_info"* {{[^,]*}} %{{.+}}, %"class.std::type_info"* nonnull align 4 dereferenceable(8) bitcast ({ i8*, i8* }* @_ZTIDB32_ to %"class.std::type_info"*))
   // WIN64: call %"class.std::type_info"* @"??0type_info@std@@QEAA@AEBV01@@Z"(%"class.std::type_info"* {{[^,]*}} %{{.+}}, %"class.std::type_info"* nonnull align 8 dereferenceable(16) bitcast (%rtti.TypeDescriptor27* @"??_R0U?$_BitInt@$0CA@@__clang@@@8" to %"class.std::type_info"*))
   // WIN32: call x86_thiscallcc %"class.std::type_info"* @"??0type_info@std@@QAE@ABV01@@Z"(%"class.std::type_info"* {{[^,]*}} %{{.+}}, %"class.std::type_info"* nonnull align 4 dereferenceable(8) bitcast (%rtti.TypeDescriptor27* @"??_R0U?$_BitInt@$0CA@@__clang@@@8" to %"class.std::type_info"*))
+  auto G = typeid(uint16_t4);
+  // LIN64: call void @_ZNSt9type_infoC1ERKS_(%"class.std::type_info"* {{[^,]*}} %{{.+}}, %"class.std::type_info"* nonnull align 8 dereferenceable(16) bitcast ({ i8*, i8* }* @_ZTIDv4_DU16_ to %"class.std::type_info"*))
+  // LIN32: call void @_ZNSt9type_infoC1ERKS_(%"class.std::type_info"* {{[^,]*}} %{{.+}}, %"class.std::type_info"* nonnull align 4 dereferenceable(8) bitcast ({ i8*, i8* }* @_ZTIDv4_DU16_ to %"class.std::type_info"*))
+  // WIN64: call %"class.std::type_info"* @"??0type_info@std@@QEAA@AEBV01@@Z"(%"class.std::type_info"* {{[^,]*}} %{{.+}}, %"class.std::type_info"* nonnull align 8 dereferenceable(16) bitcast (%rtti.TypeDescriptor53* @"??_R0T?$__vector@U?$_UBitInt@$0BA@@__clang@@$03@__clang@@@8" to %"class.std::type_info"*))
+  // WIN32: call x86_thiscallcc %"class.std::type_info"* @"??0type_info@std@@QAE@ABV01@@Z"(%"class.std::type_info"* {{[^,]*}} %{{.+}}, %"class.std::type_info"* nonnull align 4 dereferenceable(8) bitcast (%rtti.TypeDescriptor53* @"??_R0T?$__vector@U?$_UBitInt@$0BA@@__clang@@$03@__clang@@@8" to %"class.std::type_info"*))
+  auto H = typeid(vint32_t8);
+  // LIN64: call void @_ZNSt9type_infoC1ERKS_(%"class.std::type_info"* {{[^,]*}} %{{.+}}, %"class.std::type_info"* nonnull align 8 dereferenceable(16) bitcast ({ i8*, i8* }* @_ZTIDv8_DB32_ to %"class.std::type_info"*))
+  // LIN32: call void @_ZNSt9type_infoC1ERKS_(%"class.std::type_info"* {{[^,]*}} %{{.+}}, %"class.std::type_info"* nonnull align 4 dereferenceable(8) bitcast ({ i8*, i8* }* @_ZTIDv8_DB32_ to %"class.std::type_info"*))
+  // WIN64: call %"class.std::type_info"* @"??0type_info@std@@QEAA@AEBV01@@Z"(%"class.std::type_info"* {{[^,]*}} %{{.+}}, %"class.std::type_info"* nonnull align 8 dereferenceable(16) bitcast (%rtti.TypeDescriptor54* @"??_R0?AT?$__vector@U?$_BitInt@$0CA@@__clang@@$07@__clang@@@8" to %"class.std::type_info"*))
+  // WIN32: call x86_thiscallcc %"class.std::type_info"* @"??0type_info@std@@QAE@ABV01@@Z"(%"class.std::type_info"* {{[^,]*}} %{{.+}}, %"class.std::type_info"* nonnull align 4 dereferenceable(8) bitcast (%rtti.TypeDescriptor54* @"??_R0?AT?$__vector@U?$_BitInt@$0CA@@__clang@@$07@__clang@@@8" to %"class.std::type_info"*))
 }
 
 void ExplicitCasts() {
@@ -336,12 +413,19 @@
   // CHECK: %[[CONV:.+]] = trunc i33 %{{.+}} to i32
   i = b;
   // CHECK: %[[CONV:.+]] = sext i31 %{{.+}} to i32
+  uint16_t4 c;
+  c = i;
+  // CHECK: %[[CONV:.+]] = trunc i32 %{{.+}} to i16
+  // CHECK: %[[VEC:.+]] = insertelement <4 x i16> poison, i16 %[[CONV]], i32 0
+  // CHECK: %[[Splat:.+]] = shufflevector <4 x i16> %[[VEC]], <4 x i16> poison, <4 x i32> zeroinitializer
 }
 
 struct S {
   _BitInt(17) A;
   _BitInt(128) B;
   _BitInt(17) C;
+  uint16_t4 D;
+  vint32_t8 E;
 };
 
 void OffsetOfTest() {
@@ -358,6 +442,14 @@
   // LIN64: store i{{.+}} 24, i{{.+}}* %{{.+}}
   // LIN32: store i{{.+}} 20, i{{.+}}* %{{.+}}
   // WIN: store i{{.+}} 24, i{{.+}}* %{{.+}}
+  auto D = __builtin_offsetof(S,D);
+  // LIN64: store i64 32, i64* %{{.+}}
+  // LIN32: store i32 24, i32* %{{.+}}
+  // WIN: store i{{.+}} 32, i{{.+}}* %{{.+}}
+  auto E = __builtin_offsetof(S,E);
+  // LIN64: store i64 64, i64* %{{.+}}
+  // LIN32: store i32 32, i32* %{{.+}}
+  // WIN: store i{{.+}} 64, i{{.+}}* %{{.+}}
 }
 
 
@@ -379,6 +471,44 @@
   Ext >> 29;
   // CHECK: ashr i28 %{{.+}}, 29
 }
+void ShiftBitIntByConstant(uint16_t4 Ext) {
+// LIN64: define{{.*}} void @_Z21ShiftBitIntByConstantDv4_DU16_(double %
+// LIN32: define dso_local void @_Z21ShiftBitIntByConstantDv4_DU16_(i64 %
+// WIN: define dso_local void @"?ShiftBitIntByConstant@@YAXT?$__vector@U?$_UBitInt@$0BA@@__clang@@$03@__clang@@@Z"(<4 x i16>
+  Ext << 7;
+  // CHECK: shl <4 x i16> %{{.+}}, <i16 7, i16 7, i16 7, i16 7>
+  Ext >> 7;
+  // CHECK: lshr <4 x i16> %{{.+}}, <i16 7, i16 7, i16 7, i16 7>
+  Ext << -7;
+  // CHECK: shl <4 x i16> %{{.+}}, <i16 -7, i16 -7, i16 -7, i16 -7>
+  Ext >> -7;
+  // CHECK: lshr <4 x i16> %{{.+}}, <i16 -7, i16 -7, i16 -7, i16 -7>
+
+  // UB in C/C++, Defined in OpenCL.
+  Ext << 29;
+  // CHECK: shl <4 x i16> %{{.+}}, <i16 29, i16 29, i16 29, i16 29>
+  Ext >> 29;
+  // CHECK: lshr <4 x i16> %{{.+}}, <i16 29, i16 29, i16 29, i16 29>
+}
+void ShiftBitIntByConstant(vint32_t8 Ext) {
+// LIN64: define{{.*}} void @_Z21ShiftBitIntByConstantDv8_DB32_(<8 x i32>* byval(<8 x i32>) align 32 %
+// LIN32: define dso_local void @_Z21ShiftBitIntByConstantDv8_DB32_(<8 x i32> %
+// WIN: define dso_local void @"?ShiftBitIntByConstant@@YAXT?$__vector@U?$_BitInt@$0CA@@__clang@@$07@__clang@@@Z"(<8 x i32>
+  Ext << 7;
+  // CHECK: shl <8 x i32> %{{.+}}, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  Ext >> 7;
+  // CHECK: ashr <8 x i32> %{{.+}}, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  Ext << -7;
+  // CHECK: shl <8 x i32> %{{.+}}, <i32 -7, i32 -7, i32 -7, i32 -7, i32 -7, i32 -7, i32 -7, i32 -7>
+  Ext >> -7;
+  // CHECK: ashr <8 x i32> %{{.+}}, <i32 -7, i32 -7, i32 -7, i32 -7, i32 -7, i32 -7, i32 -7, i32 -7>
+
+  // UB in C/C++, Defined in OpenCL.
+  Ext << 29;
+  // CHECK: shl <8 x i32> %{{.+}}, <i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29>
+  Ext >> 29;
+  // CHECK: ashr <8 x i32> %{{.+}}, <i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29, i32 29>
+}
 
 void ConstantShiftByBitInt(_BitInt(28) Ext, _BitInt(65) LargeExt) {
   // LIN: define{{.*}} void @_Z21ConstantShiftByBitIntDB28_DB65_
@@ -461,6 +591,28 @@
   // CHECK: %[[IMAG:.+]] = add i33 %[[FIRST_IMAG_CONV]], %[[SECOND_IMAG]]
 }
 
+typedef _BitInt(64) vint64_t16 __attribute__((vector_size(16)));
+void VectorTest(vint64_t16 first, vint64_t16 second) {
+  // LIN: define{{.*}} void @_Z10VectorTestDv2_DB64_S0_(<2 x i64> %{{.+}}, <2 x i64> %{{.+}})
+  // WIN64: define dso_local void @"?VectorTest@@YAXT?$__vector@U?$_BitInt@$0EA@@__clang@@$01@__clang@@0@Z"(<2 x i64> %{{.+}}, <2 x i64> %{{.+}})
+  // WIN32: define dso_local void @"?VectorTest@@YAXT?$__vector@U?$_BitInt@$0EA@@__clang@@$01@__clang@@0@Z"(<2 x i64> inreg %{{.+}}, <2 x i64> inreg %{{.+}})
+  __builtin_shufflevector (first, first, 1, 3, 2) + __builtin_shufflevector (second, second, 1, 3, 2);
+  // CHECK: %[[Shuffle:.+]] = shufflevector <2 x i64> %{{.+}}, <2 x i64> %{{.+}}, <3 x i32> <i32 1, i32 3, i32 2>
+  // CHECK: %[[Shuffle1:.+]] = shufflevector <2 x i64> %{{.+}}, <2 x i64> %{{.+}}, <3 x i32> <i32 1, i32 3, i32 2>
+  // CHECK: %[[ADD:.+]] = add <3 x i64> %[[Shuffle]], %[[Shuffle1]]
+}
+
+void VectorTest(uint16_t4 first, uint16_t4 second) {
+  // LIN64: define{{.*}} void @_Z10VectorTestDv4_DU16_S0_(double %{{.+}}, double %{{.+}})
+  // LIN32: define{{.*}} void @_Z10VectorTestDv4_DU16_S0_(i64 %{{.+}}, i64 %{{.+}})
+  // WIN64: define dso_local void @"?VectorTest@@YAXT?$__vector@U?$_UBitInt@$0BA@@__clang@@$03@__clang@@0@Z"(<4 x i16> %{{.+}}, <4 x i16> %{{.+}})
+  // WIN32: define dso_local void @"?VectorTest@@YAXT?$__vector@U?$_UBitInt@$0BA@@__clang@@$03@__clang@@0@Z"(<4 x i16> inreg %{{.+}}, <4 x i16> inreg %{{.+}})
+  first.xzw + second.zwx;
+  // CHECK: %[[Shuffle:.+]] = shufflevector <4 x i16> %{{.+}}, <4 x i16> poison, <3 x i32> <i32 0, i32 2, i32 3>
+  // CHECK: %[[Shuffle1:.+]] = shufflevector <4 x i16> %{{.+}}, <4 x i16> poison, <3 x i32> <i32 2, i32 3, i32 0>
+  // CHECK: %[[ADD:.+]] = add <3 x i16> %[[Shuffle]], %[[Shuffle1]]
+}
+
 // Ensure that these types don't alias the normal int types.
 void TBAATest(_BitInt(sizeof(int) * 8) ExtInt,
               unsigned _BitInt(sizeof(int) * 8) ExtUInt,
diff --git a/clang/test/Sema/builtin-classify-type.c b/clang/test/Sema/builtin-classify-type.c
--- a/clang/test/Sema/builtin-classify-type.c
+++ b/clang/test/Sema/builtin-classify-type.c
@@ -29,6 +29,23 @@
   __attribute__((vector_size(16))) int vec;
   typedef __attribute__((ext_vector_type(4))) int evec_t;
   evec_t evec;
+  typedef _BitInt(8) int8_t3 __attribute__((ext_vector_type(3)));
+  int8_t3 t3;
+  typedef _BitInt(16) int16_t3 __attribute__((ext_vector_type(4)));
+  int16_t3 t4;
+  typedef _BitInt(32) int32_t3 __attribute__((ext_vector_type(5)));
+  int32_t3 t5;
+  typedef _BitInt(64) int64_t3 __attribute__((ext_vector_type(6)));
+  int64_t3 t6;
+  typedef _BitInt(8) vint8_t3 __attribute__((vector_size(3)));
+  vint8_t3 vt3;
+  typedef _BitInt(16) vint16_t3 __attribute__((vector_size(4)));
+  vint16_t3 vt4;
+  typedef _BitInt(32) vint32_t3 __attribute__((vector_size(8)));
+  vint32_t3 vt5;
+  typedef _BitInt(64) vint64_t3 __attribute__((vector_size(16)));
+  vint64_t3 vt6;
+
   _Atomic int atomic_i;
   _Atomic double atomic_d;
   _Complex int complex_i;
diff --git a/clang/test/SemaCXX/ext-int.cpp b/clang/test/SemaCXX/ext-int.cpp
--- a/clang/test/SemaCXX/ext-int.cpp
+++ b/clang/test/SemaCXX/ext-int.cpp
@@ -84,10 +84,22 @@
 };
 
 // Reject vector types:
-// expected-error@+1{{invalid vector element type '_BitInt(32)'}}
-typedef _BitInt(32) __attribute__((vector_size(16))) VecTy;
-// expected-error@+1{{invalid vector element type '_BitInt(32)'}}
-typedef _BitInt(32) __attribute__((ext_vector_type(32))) OtherVecTy;
+// expected-error@+1{{'_BitInt' vector element width must be at least as wide as 'CHAR_BIT'}}
+typedef _BitInt(2) __attribute__((vector_size(16))) VecTy;
+// expected-error@+1{{'_BitInt' vector element width must be at least as wide as 'CHAR_BIT'}}
+typedef _BitInt(2) __attribute__((ext_vector_type(32))) OtherVecTy;
+// expected-error@+1{{'_BitInt' vector element width must be at least as wide as 'CHAR_BIT'}}
+typedef _BitInt(4) __attribute__((vector_size(16))) VecTy2;
+// expected-error@+1{{'_BitInt' vector element width must be at least as wide as 'CHAR_BIT'}}
+typedef _BitInt(4) __attribute__((ext_vector_type(32))) OtherVecTy2;
+// expected-error@+1{{'_BitInt' vector element width must be at least as wide as 'CHAR_BIT'}}
+typedef _BitInt(5) __attribute__((vector_size(16))) VecTy3;
+// expected-error@+1{{'_BitInt' vector element width must be at least as wide as 'CHAR_BIT'}}
+typedef _BitInt(5) __attribute__((ext_vector_type(32))) OtherVecTy3;
+// expected-error@+1{{'_BitInt' vector element width must be a power of 2}}
+typedef _BitInt(37) __attribute__((vector_size(16))) VecTy4;
+// expected-error@+1{{'_BitInt' vector element width must be a power of 2}}
+typedef _BitInt(37) __attribute__((ext_vector_type(32))) OtherVecTy4;
 
 // Allow _Complex:
 _Complex _BitInt(3) Cmplx;
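
As a quick illustration of the behavior this patch adds (a sketch only, not part of the patch; the typedef and function names below are invented for the example): _BitInt elements are now accepted in both vector_size and ext_vector_type vectors when the element width is a power of 2 and at least 'CHAR_BIT', and rejected otherwise with the new diagnostic.

// Accepted: element width is a power of 2 and >= CHAR_BIT (8).
typedef _BitInt(32) v4bi32 __attribute__((vector_size(16)));               // 4 x _BitInt(32)
typedef unsigned _BitInt(16) ev4ubi16 __attribute__((ext_vector_type(4)));

// Rejected (uncommenting either line triggers the new diagnostic):
// typedef _BitInt(4) bad1 __attribute__((vector_size(16)));      // error: '_BitInt' vector element width must be at least as wide as 'CHAR_BIT'
// typedef _BitInt(37) bad2 __attribute__((ext_vector_type(8)));  // error: '_BitInt' vector element width must be a power of 2

// The usual vector operators apply; this is expected to lower to an add on <4 x i32>.
v4bi32 add_v4bi32(v4bi32 a, v4bi32 b) { return a + b; }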