diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -432,7 +432,8 @@ /// global value is specified, and if that global has an explicit alignment /// requested, it will override the alignment request if required for /// correctness. - void emitAlignment(Align Alignment, const GlobalObject *GV = nullptr) const; + void emitAlignment(Align Alignment, const GlobalObject *GV = nullptr, + unsigned MaxBytesToEmit = 0) const; /// Lower the specified LLVM Constant to an MCExpr. virtual const MCExpr *lowerConstant(const Constant *CV); diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h --- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h +++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h @@ -136,6 +136,12 @@ /// Alignment of the basic block. One if the basic block does not need to be /// aligned. Align Alignment; + /// Maximum amount of bytes that can be added to align the basic block. If the + /// alignment cannot be reached in this many bytes, no bytes are emitted. + /// Zero to represent no maximum. + /// Values greater than 1 << Alignment are understood to be functionally + /// useless, and are equivalent to a value of Zero. + uint8_t MaxBytesForAlignment = 0; /// Indicate that this basic block is entered via an exception handler. bool IsEHPad = false; @@ -521,6 +527,14 @@ /// Set alignment of the basic block. void setAlignment(Align A) { Alignment = A; } + /// Return the maximum amount of padding allowed for aligning the basic block + unsigned getMaxBytesForAlignment() const { return MaxBytesForAlignment; } + + /// Set the maximum amount of padding allowed for aligning the basic block + void setMaxBytesForAlignment(unsigned MaxBytes) { + MaxBytesForAlignment = MaxBytes; + } + /// Returns true if the block is a landing pad. 
That is this basic block is /// entered via an exception handler. bool isEHPad() const { return IsEHPad; } diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1771,11 +1771,14 @@ /// Return the preferred loop alignment. virtual Align getPrefLoopAlignment(MachineLoop *ML = nullptr) const; + /// Return the maximum amount of bytes allowed to be emitted when padding for + /// alignment + virtual unsigned + getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const; + /// Should loops be aligned even when the function is marked OptSize (but not /// MinSize). - virtual bool alignLoopsWithOptSize() const { - return false; - } + virtual bool alignLoopsWithOptSize() const { return false; } /// If the target has a standard location for the stack protector guard, /// returns the address of that location. Otherwise, returns nullptr. diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2447,7 +2447,8 @@ // two boundary. If a global value is specified, and if that global has // an explicit alignment requested, it will override the alignment request // if required for correctness. 
-void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV) const { +void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV, + unsigned MaxBytesToEmit) const { if (GV) Alignment = getGVAlignment(GV, GV->getParent()->getDataLayout(), Alignment); @@ -2460,9 +2461,9 @@ STI = &getSubtargetInfo(); else STI = TM.getMCSubtargetInfo(); - OutStreamer->emitCodeAlignment(Alignment.value(), STI); + OutStreamer->emitCodeAlignment(Alignment.value(), STI, MaxBytesToEmit); } else - OutStreamer->emitValueToAlignment(Alignment.value()); + OutStreamer->emitValueToAlignment(Alignment.value(), 0, 1, MaxBytesToEmit); } //===----------------------------------------------------------------------===// @@ -3253,7 +3254,7 @@ // Emit an alignment directive for this block, if needed. const Align Alignment = MBB.getAlignment(); if (Alignment != Align(1)) - emitAlignment(Alignment); + emitAlignment(Alignment, nullptr, MBB.getMaxBytesForAlignment()); // Switch to a new section if this basic block must begin a section. The // entry block is always placed in the function section and is handled diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -95,6 +95,12 @@ "format (e.g 4 means align on 16B boundaries)."), cl::init(0), cl::Hidden); +static cl::opt<unsigned> MaxBytesForAlignmentOverride( + "max-bytes-for-alignment", + cl::desc("Forces the maximum bytes allowed to be emitted when padding for " + "alignment"), + cl::init(0), cl::Hidden); + // FIXME: Find a good default for this flag and remove the flag. static cl::opt<unsigned> ExitBlockBias( "block-placement-exit-block-bias", @@ -2914,6 +2920,14 @@ MachineBasicBlock *LayoutPred = &*std::prev(MachineFunction::iterator(ChainBB)); + // Set the maximum bytes allowed to be emitted for alignment. 
+ unsigned MaxBytes; + if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0) + MaxBytes = MaxBytesForAlignmentOverride; + else + MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB); + ChainBB->setMaxBytesForAlignment(MaxBytes); + // Force alignment if all the predecessors are jumps. We already checked // that the block isn't cold above. if (!LayoutPred->isSuccessor(ChainBB)) { diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2026,6 +2026,11 @@ return PrefLoopAlignment; } +unsigned TargetLoweringBase::getMaxPermittedBytesForAlignment( + MachineBasicBlock *MBB) const { + return 0; +} + //===----------------------------------------------------------------------===// // Reciprocal Estimates //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll b/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll @@ -0,0 +1,75 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -max-bytes-for-alignment=8 --align-loops=32 < %s -o -| FileCheck %s + +define i32 @a(i32 %x, i32* nocapture readonly %y, i32* nocapture readonly %z) { +; CHECK-LABEL: a: +; CHECK: .p2align 5, 0x0, 8 +; CHECK-NEXT: .LBB0_5: // %vector.body +; CHECK: .p2align 5, 0x0, 8 +; CHECK-NEXT: .LBB0_8: // %for.body +entry: + %cmp10 = icmp sgt i32 %x, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %x to i64 + %min.iters.check = icmp ult i32 %x, 8 + br i1 %min.iters.check, label %for.body.preheader17, label %vector.ph + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i64 %wide.trip.count, 4294967288 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 
[ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %10, %vector.body ] + %vec.phi13 = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ] + %0 = getelementptr inbounds i32, i32* %y, i64 %index + %1 = bitcast i32* %0 to <4 x i32>* + %wide.load = load <4 x i32>, <4 x i32>* %1, align 4 + %2 = getelementptr inbounds i32, i32* %0, i64 4 + %3 = bitcast i32* %2 to <4 x i32>* + %wide.load14 = load <4 x i32>, <4 x i32>* %3, align 4 + %4 = getelementptr inbounds i32, i32* %z, i64 %index + %5 = bitcast i32* %4 to <4 x i32>* + %wide.load15 = load <4 x i32>, <4 x i32>* %5, align 4 + %6 = getelementptr inbounds i32, i32* %4, i64 4 + %7 = bitcast i32* %6 to <4 x i32>* + %wide.load16 = load <4 x i32>, <4 x i32>* %7, align 4 + %8 = add <4 x i32> %wide.load, %vec.phi + %9 = add <4 x i32> %wide.load14, %vec.phi13 + %10 = add <4 x i32> %8, %wide.load15 + %11 = add <4 x i32> %9, %wide.load16 + %index.next = add nuw i64 %index, 8 + %12 = icmp eq i64 %index.next, %n.vec + br i1 %12, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %bin.rdx = add <4 x i32> %11, %10 + %13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx) + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader17 + +for.body.preheader17: ; preds = %for.body.preheader, %middle.block + %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + %b.011.ph = phi i32 [ 0, %for.body.preheader ], [ %13, %middle.block ] + br label %for.body + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + %b.0.lcssa = phi i32 [ 0, %entry ], [ %13, %middle.block ], [ %add3, %for.body ] + ret i32 %b.0.lcssa + +for.body: ; preds = %for.body.preheader17, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader17 ] + %b.011 = phi i32 [ %add3, %for.body ], [ %b.011.ph, %for.body.preheader17 ] + 
%arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvars.iv + %14 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %z, i64 %indvars.iv + %15 = load i32, i32* %arrayidx2, align 4 + %add = add i32 %14, %b.011 + %add3 = add i32 %add, %15 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} + +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)