diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -5934,7 +5934,10 @@
       return ABIArgInfo::getDirect(Ty, 0, nullptr, false);
     }
   }
-  return ABIArgInfo::getDirect(nullptr, 0, nullptr, false);
+  bool NeedsStackAlignment = getContext().getTypeAlignInChars(Ty) !=
+                             getContext().getTypeAlignInChars(Base);
+  return ABIArgInfo::getDirect(nullptr, /*Offset=*/0, /*Padding=*/nullptr,
+                               /*CanBeFlattened=*/false, NeedsStackAlignment);
 }
 
 ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, bool isVariadic,
@@ -6000,9 +6003,13 @@
     uint64_t Members = 0;
     if (isHomogeneousAggregate(Ty, Base, Members)) {
       assert(Base && Members <= 4 && "unexpected homogeneous aggregate");
+      bool NeedsStackAlignment = getContext().getTypeAlignInChars(Ty) !=
+                                 getContext().getTypeAlignInChars(Base);
       llvm::Type *Ty =
         llvm::ArrayType::get(CGT.ConvertType(QualType(Base, 0)), Members);
-      return ABIArgInfo::getDirect(Ty, 0, nullptr, false);
+      return ABIArgInfo::getDirect(Ty, /*Offset=*/0, /*Padding=*/nullptr,
+                                   /*CanBeFlattened=*/false,
+                                   NeedsStackAlignment);
     }
   }
 
diff --git a/clang/test/CodeGen/arm-aapcs-vfp.c b/clang/test/CodeGen/arm-aapcs-vfp.c
--- a/clang/test/CodeGen/arm-aapcs-vfp.c
+++ b/clang/test/CodeGen/arm-aapcs-vfp.c
@@ -147,3 +147,17 @@
 // is passed ByVal (due to being > 64 bytes), so the backend handles this instead.
 void test_vfp_stack_gpr_split_6(double a, double b, double c, double d, double e, double f, double g, double h, double i, int j, struct_seventeen_ints k) {}
 // CHECK: define arm_aapcs_vfpcc void @test_vfp_stack_gpr_split_6(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i, i32 %j, %struct.struct_seventeen_ints* byval(%struct.struct_seventeen_ints) align 4 %k)
+
+// Make sure over-alignment of an HFA is propagated to the backend as alignstack.
+typedef struct {
+  __attribute__((__aligned__(8))) float v[2];
+} hfa_align;
+// CHECK: define arm_aapcs_vfpcc float @test_hfa_align_arg(%struct.hfa_align alignstack(8) %h1.coerce)
+float test_hfa_align_arg(hfa_align h1) {
+  return h1.v[0];
+}
+// CHECK: %call = call arm_aapcs_vfpcc float @test_hfa_align_arg(%struct.hfa_align alignstack(8) %{{.*}})
+float test_hfa_align_call() {
+  hfa_align h = {1.0, 2.0};
+  return test_hfa_align_arg(h);
+}
diff --git a/llvm/lib/Target/ARM/ARMCallingConv.cpp b/llvm/lib/Target/ARM/ARMCallingConv.cpp
--- a/llvm/lib/Target/ARM/ARMCallingConv.cpp
+++ b/llvm/lib/Target/ARM/ARMCallingConv.cpp
@@ -266,7 +266,10 @@
   // possible. (E.g. an incoming i64 would have starting Align of 8, but we'll
   // be allocating a bunch of i32 slots).
   unsigned RestAlign = std::min(Align, Size);
-
+  if (ArgFlags.getStackAlign()) {
+    const llvm::Align ArgStackAlign(ArgFlags.getStackAlign());
+    Align = std::max(Align, unsigned(ArgStackAlign.value()));
+  }
   for (auto &It : PendingMembers) {
     It.convertToMem(State.AllocateStack(Size, Align));
     State.addLoc(It);
diff --git a/llvm/test/CodeGen/ARM/aapcs-hfa-code.ll b/llvm/test/CodeGen/ARM/aapcs-hfa-code.ll
--- a/llvm/test/CodeGen/ARM/aapcs-hfa-code.ll
+++ b/llvm/test/CodeGen/ARM/aapcs-hfa-code.ll
@@ -3,6 +3,8 @@
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
 
+%struct.hfa_align = type { [2 x float] }
+
 define arm_aapcs_vfpcc void @test_1float({ float } %a) {
   call arm_aapcs_vfpcc void @test_1float({ float } { float 1.0 })
   ret void
@@ -104,3 +106,73 @@
 
   ret void
 }
+
+; Over-aligned HFA argument passed in registers - one element per register.
+define arm_aapcs_vfpcc float @test_hfa_align_reg(%struct.hfa_align alignstack(8) %h1.coerce) local_unnamed_addr #3 {
+entry:
+; CHECK-LABEL: test_hfa_align_reg:
+; CHECK-DAG: bx lr
+
+; CHECK-M4F-LABEL: test_hfa_align_reg:
+; CHECK-M4F-DAG: bx lr
+
+  %h1.coerce.fca.0.0.extract = extractvalue %struct.hfa_align %h1.coerce, 0, 0
+  ret float %h1.coerce.fca.0.0.extract
+}
+
+; Call with an over-aligned HFA argument passed in registers - one element per register.
+define arm_aapcs_vfpcc float @test_hfa_align_reg_call() local_unnamed_addr #3 {
+entry:
+; CHECK-LABEL: test_hfa_align_reg_call:
+; CHECK-DAG: vmov.f32	s0, #1.000000e+00
+; CHECK-DAG: vmov.f32	s1, #2.000000e+00
+; CHECK-DAG: bl	test_hfa_align_reg
+
+; CHECK-M4F-LABEL: test_hfa_align_reg_call:
+; CHECK-M4F-DAG: vmov.f32	s0, #1.000000e+00
+; CHECK-M4F-DAG: vmov.f32	s1, #2.000000e+00
+; CHECK-M4F-DAG: bl	test_hfa_align_reg
+
+  %call = call arm_aapcs_vfpcc float @test_hfa_align_reg(%struct.hfa_align alignstack(8) { [2 x float] [float 1.000000e+00, float 2.000000e+00] }) #5
+  ret float %call
+}
+
+; Over-aligned HFA argument placed on the stack - the stack offset is rounded up to the alignment.
+define arm_aapcs_vfpcc float @test_hfa_align_stack(double %d0, double %d1, double %d2, double %d3, double %d4, double %d5, double %d6, double %d7, float %f1, %struct.hfa_align alignstack(8) %h1.coerce) local_unnamed_addr #3 {
+entry:
+; CHECK-LABEL: test_hfa_align_stack:
+; CHECK-DAG: vldr	s0, [sp, #8]
+; CHECK-DAG: bx	lr
+
+; CHECK-M4F-LABEL: test_hfa_align_stack:
+; CHECK-M4F-DAG: vldr	s0, [sp, #8]
+; CHECK-M4F-DAG: bx	lr
+
+  %h1.coerce.fca.0.0.extract = extractvalue %struct.hfa_align %h1.coerce, 0, 0
+  ret float %h1.coerce.fca.0.0.extract
+}
+
+; Call with an over-aligned HFA argument placed on the stack - the stack offset is rounded up to the alignment.
+define arm_aapcs_vfpcc float @test_hfa_align_stack_call() local_unnamed_addr #3 {
+entry:
+; CHECK-LABEL: test_hfa_align_stack_call:
+; CHECK-DAG: sub	sp, sp, #16
+; CHECK-DAG: mov	r0, #1073741824
+; CHECK-DAG: mov	r1, #1065353216
+; CHECK-DAG: str	r1, [sp, #8]
+; CHECK-DAG: str	r0, [sp, #12]
+; CHECK-DAG: bl	test_hfa_align_stack
+; CHECK-DAG: add	sp, sp, #16
+
+; CHECK-M4F-LABEL: test_hfa_align_stack_call:
+; CHECK-M4F-DAG: sub	sp, #16
+; CHECK-M4F-DAG: mov.w	r0, #1073741824
+; CHECK-M4F-DAG: mov.w	r1, #1065353216
+; CHECK-M4F-DAG: strd	r1, r0, [sp, #8]
+; CHECK-M4F-DAG: bl	test_hfa_align_stack
+; CHECK-M4F-DAG: add	sp, #16
+
+  %call = call arm_aapcs_vfpcc float @test_hfa_align_stack(double undef, double undef, double undef, double undef, double undef, double undef, double undef, double undef, float undef, %struct.hfa_align alignstack(8) { [2 x float] [float 1.000000e+00, float 2.000000e+00] }) #5
+  ret float %call
+}
+