Index: clang/lib/CodeGen/TargetInfo.cpp
===================================================================
--- clang/lib/CodeGen/TargetInfo.cpp
+++ clang/lib/CodeGen/TargetInfo.cpp
@@ -1091,6 +1091,7 @@
   bool IsWin32StructABI;
   bool IsSoftFloatABI;
   bool IsMCUABI;
+  bool IsLinuxABI;
   unsigned DefaultNumRegisterParameters;
 
   static bool isRegisterSize(unsigned Size) {
@@ -1117,11 +1118,13 @@
   ABIArgInfo getIndirectReturnResult(QualType Ty, CCState &State) const;
 
   /// Return the alignment to use for the given type on the stack.
-  unsigned getTypeStackAlignInBytes(QualType Ty, unsigned Align) const;
+  unsigned getTypeStackAlignInBytes(QualType Ty, unsigned Align,
+                                    bool isNamedArg) const;
 
   Class classify(QualType Ty) const;
   ABIArgInfo classifyReturnType(QualType RetTy, CCState &State) const;
-  ABIArgInfo classifyArgumentType(QualType RetTy, CCState &State) const;
+  ABIArgInfo classifyArgumentType(QualType RetTy, CCState &State,
+                                  bool isNamedArg) const;
 
   /// Updates the number of available free registers, returns
   /// true if any registers were allocated.
@@ -1153,9 +1156,9 @@
                 unsigned NumRegisterParameters, bool SoftFloatABI)
     : SwiftABIInfo(CGT), IsDarwinVectorABI(DarwinVectorABI),
       IsRetSmallStructInRegABI(RetSmallStructInRegABI),
-      IsWin32StructABI(Win32StructABI),
-      IsSoftFloatABI(SoftFloatABI),
+      IsWin32StructABI(Win32StructABI), IsSoftFloatABI(SoftFloatABI),
       IsMCUABI(CGT.getTarget().getTriple().isOSIAMCU()),
+      IsLinuxABI(CGT.getTarget().getTriple().isOSLinux()),
       DefaultNumRegisterParameters(NumRegisterParameters) {}
 
   bool shouldPassIndirectlyForSwift(ArrayRef<llvm::Type*> scalars,
@@ -1573,13 +1576,22 @@
   return false;
 }
 
-unsigned X86_32ABIInfo::getTypeStackAlignInBytes(QualType Ty,
-                                                 unsigned Align) const {
+unsigned X86_32ABIInfo::getTypeStackAlignInBytes(QualType Ty, unsigned Align,
+                                                 bool isNamedArg) const {
   // Otherwise, if the alignment is less than or equal to the minimum ABI
   // alignment, just use the default; the backend will handle this.
   if (Align <= MinABIStackAlignInBytes)
     return 0; // Use default alignment.
 
+  if (IsLinuxABI && !isNamedArg) {
+    // Exclude other System V OS (e.g Darwin, PS4 and FreeBSD) since we don't
+    // want to spend any effort dealing with the ramifications of ABI breaks.
+    // If the vector type is __m128/__m256/__m512, return the default alignment.
+    unsigned TypeAlign = getContext().getTypeAlign(Ty) / 8;
+    if (Ty->isVectorType() &&
+        (TypeAlign == 16 || TypeAlign == 32 || TypeAlign == 64))
+      return TypeAlign;
+  }
   // On non-Darwin, the stack type alignment is always 4.
   if (!IsDarwinVectorABI) {
     // Set explicit alignment, since we may need to realign the top.
@@ -1607,7 +1619,7 @@
 
   // Compute the byval alignment.
   unsigned TypeAlign = getContext().getTypeAlign(Ty) / 8;
-  unsigned StackAlign = getTypeStackAlignInBytes(Ty, TypeAlign);
+  unsigned StackAlign = getTypeStackAlignInBytes(Ty, TypeAlign, /*isNamedArg*/true);
   if (StackAlign == 0)
     return ABIArgInfo::getIndirect(CharUnits::fromQuantity(4), /*ByVal=*/true);
 
@@ -1738,8 +1750,8 @@
   }
 }
 
-ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty,
-                                               CCState &State) const {
+ABIArgInfo X86_32ABIInfo::classifyArgumentType(QualType Ty, CCState &State,
+                                               bool isNamedArg) const {
   // FIXME: Set alignment on indirect arguments.
   bool IsFastCall = State.CC == llvm::CallingConv::X86_FastCall;
   bool IsRegCall = State.CC == llvm::CallingConv::X86_RegCall;
@@ -1849,6 +1861,19 @@
     if (IsX86_MMXType(CGT.ConvertType(Ty)))
       return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), 64));
 
+    if (IsLinuxABI && !isNamedArg) {
+      // Exclude other System V OS (e.g Darwin, PS4 and FreeBSD) since we don't
+      // want to spend any effort dealing with the ramifications of ABI breaks.
+      // According to i386 System V ABI, if parameters of type __m256 are
+      // required to be passed on the stack, the stack pointer must be aligned
+      // to 32 byte. If parameters of type __m512 are required to be passed on
+      // the stack, the stack pointer must be aligned to 64 byte. This change
+      // will only apply to varargs function calls.
+      unsigned Align = getContext().getTypeAlign(Ty) / 8;
+      if (Align == 16 || Align == 32 || Align == 64)
+        return ABIArgInfo::getIndirect(CharUnits::fromQuantity(Align),
+                                       /*ByVal*/true);
+    }
     return ABIArgInfo::getDirect();
   }
 
@@ -1924,12 +1949,14 @@
 
   bool UsedInAlloca = false;
   MutableArrayRef<CGFunctionInfoArgInfo> Args = FI.arguments();
+  unsigned NumRequiredArgs = FI.getNumRequiredArgs();
   for (int I = 0, E = Args.size(); I < E; ++I) {
     // Skip arguments that have already been assigned.
     if (State.IsPreassigned.test(I))
       continue;
 
-    Args[I].info = classifyArgumentType(Args[I].type, State);
+    Args[I].info = classifyArgumentType(Args[I].type, State,
+                                        /*isNamedArg*/ I < NumRequiredArgs);
     UsedInAlloca |= (Args[I].info.getKind() == ABIArgInfo::InAlloca);
   }
 
@@ -2044,8 +2071,8 @@
   //
   // Just messing with TypeInfo like this works because we never pass
   // anything indirectly.
-  TypeInfo.second = CharUnits::fromQuantity(
-                getTypeStackAlignInBytes(Ty, TypeInfo.second.getQuantity()));
+  TypeInfo.second = CharUnits::fromQuantity(getTypeStackAlignInBytes(
+      Ty, TypeInfo.second.getQuantity(), /*isNamedArg*/false));
 
   return emitVoidPtrVAArg(CGF, VAListAddr, Ty, /*Indirect*/ false,
                           TypeInfo, CharUnits::fromQuantity(4),
Index: clang/test/CodeGen/x86_32-align-linux.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/x86_32-align-linux.c
@@ -0,0 +1,60 @@
+// RUN: %clang_cc1 -w -fblocks -ffreestanding -triple i386-pc-linux-gnu -emit-llvm -o %t %s || FileCheck < %t %s
+// RUN: %clang_cc1 -w -fblocks -ffreestanding -triple i386-pc-linux-gnu -target-feature +avx -emit-llvm -o %t %s || FileCheck < %t %s
+// RUN: %clang_cc1 -w -fblocks -ffreestanding -triple i386-pc-linux-gnu -target-feature +avx512f -emit-llvm -o %t %s || FileCheck < %t %s
+
+#include <immintrin.h>
+
+// CHECK-LABEL: define void @testm128
+// CHECK-LABEL: %argp.cur = load i8*, i8** %args, align 4
+// CHECK-NEXT:  %0 = ptrtoint i8* %argp.cur to i32
+// CHECK-NEXT:  %1 = add i32 %0, 15
+// CHECK-NEXT:  %2 = and i32 %1, -16
+// CHECK-NEXT:  %argp.cur.aligned = inttoptr i32 %2 to i8*
+void testm128(int argCount, ...) {
+  __m128 res;
+  __builtin_va_list args;
+  __builtin_va_start(args, argCount);
+  res = __builtin_va_arg(args, __m128);
+  __builtin_va_end(args);
+}
+
+// CHECK-LABEL: define void @testm256
+// CHECK-LABEL: %argp.cur = load i8*, i8** %args, align 4
+// CHECK-NEXT:  %0 = ptrtoint i8* %argp.cur to i32
+// CHECK-NEXT:  %1 = add i32 %0, 31
+// CHECK-NEXT:  %2 = and i32 %1, -32
+// CHECK-NEXT:  %argp.cur.aligned = inttoptr i32 %2 to i8*
+void testm256(int argCount, ...) {
+  __m256 res;
+  __builtin_va_list args;
+  __builtin_va_start(args, argCount);
+  res = __builtin_va_arg(args, __m256);
+  __builtin_va_end(args);
+}
+
+// CHECK-LABEL: define void @testm512
+// CHECK-LABEL: %argp.cur = load i8*, i8** %args, align 4
+// CHECK-NEXT:  %0 = ptrtoint i8* %argp.cur to i32
+// CHECK-NEXT:  %1 = add i32 %0, 63
+// CHECK-NEXT:  %2 = and i32 %1, -64
+// CHECK-NEXT:  %argp.cur.aligned = inttoptr i32 %2 to i8*
+void testm512(int argCount, ...) {
+  __m512 res;
+  __builtin_va_list args;
+  __builtin_va_start(args, argCount);
+  res = __builtin_va_arg(args, __m512);
+  __builtin_va_end(args);
+}
+
+// CHECK-LABEL: efine dso_local void @testPastArguments
+// CHECK: call void (i32, ...) @testm128(i32 1, <4 x float>* byval(<4 x float>) align 16
+// CHECK: call void (i32, ...) @testm256(i32 1, <8 x float>* byval(<8 x float>) align 32
+// CHECK: call void (i32, ...) @testm512(i32 1, <16 x float>* byval(<16 x float>) align 64
+void testPastArguments(void) {
+  __m128 a;
+  __m256 b;
+  __m512 c;
+  testm128(1, a);
+  testm256(1, b);
+  testm512(1, c);
+}
Index: clang/test/CodeGen/x86_32-arguments-linux.c
===================================================================
--- clang/test/CodeGen/x86_32-arguments-linux.c
+++ clang/test/CodeGen/x86_32-arguments-linux.c
@@ -14,10 +14,10 @@
 // CHECK: i32 %{{.*}}, %struct.s56_0* byval(%struct.s56_0) align 4 %{{[^ ]*}},
 // CHECK: i64 %{{[^ ]*}}, %struct.s56_1* byval(%struct.s56_1) align 4 %{{[^ ]*}},
 // CHECK: <1 x double> %{{[^ ]*}}, %struct.s56_2* byval(%struct.s56_2) align 4 %{{[^ ]*}},
-// CHECK: <4 x i32> %{{[^ ]*}}, %struct.s56_3* byval(%struct.s56_3) align 4 %{{[^ ]*}},
-// CHECK: <2 x double> %{{[^ ]*}}, %struct.s56_4* byval(%struct.s56_4) align 4 %{{[^ ]*}},
-// CHECK: <8 x i32> %{{[^ ]*}}, %struct.s56_5* byval(%struct.s56_5) align 4 %{{[^ ]*}},
-// CHECK: <4 x double> %{{[^ ]*}}, %struct.s56_6* byval(%struct.s56_6) align 4 %{{[^ ]*}})
+// CHECK: (<4 x i32>) align 16 %{{[^ ]*}}, %struct.s56_3* byval(%struct.s56_3) align 4 %{{[^ ]*}},
+// CHECK: (<2 x double>) align 16 %{{[^ ]*}}, %struct.s56_4* byval(%struct.s56_4) align 4 %{{[^ ]*}},
+// CHECK: (<8 x i32>) align 32 %{{[^ ]*}}, %struct.s56_5* byval(%struct.s56_5) align 4 %{{[^ ]*}},
+// CHECK: (<4 x double>) align 32 %{{[^ ]*}}, %struct.s56_6* byval(%struct.s56_6) align 4 %{{[^ ]*}})
 // CHECK: }
 //
 // <rdar://problem/7964854> [i386] clang misaligns long double in structures