diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h
 b/llvm/include/llvm/CodeGen/AsmPrinter.h
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -432,7 +432,10 @@
   /// global value is specified, and if that global has an explicit alignment
   /// requested, it will override the alignment request if required for
   /// correctness.
-  void emitAlignment(Align Alignment, const GlobalObject *GV = nullptr) const;
+  /// If \p MaxBytesToEmit is non-zero, no padding is emitted at all when the
+  /// alignment cannot be reached within that many bytes.
+  void emitAlignment(Align Alignment, const GlobalObject *GV = nullptr,
+                     unsigned MaxBytesToEmit = 0) const;
 
   /// Lower the specified LLVM Constant to an MCExpr.
   virtual const MCExpr *lowerConstant(const Constant *CV);
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
 b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -136,6 +136,10 @@
   /// Alignment of the basic block. One if the basic block does not need to be
   /// aligned.
   Align Alignment;
+  /// Maximum number of bytes that can be added to align the basic block. If
+  /// the alignment cannot be reached in this many bytes, no bytes are emitted.
+  /// Zero to represent no maximum.
+  unsigned MaxBytesForAlignment = 0;
 
   /// Indicate that this basic block is entered via an exception handler.
   bool IsEHPad = false;
@@ -521,6 +525,21 @@
   /// Set alignment of the basic block.
   void setAlignment(Align A) { Alignment = A; }
 
+  /// Set the alignment of the basic block together with the maximum number of
+  /// bytes allowed to be emitted as padding to reach it (zero for no maximum).
+  void setAlignment(Align A, unsigned MaxBytes) {
+    setAlignment(A);
+    setMaxBytesForAlignment(MaxBytes);
+  }
+
+  /// Return the maximum amount of padding allowed for aligning the basic block
+  unsigned getMaxBytesForAlignment() const { return MaxBytesForAlignment; }
+
+  /// Set the maximum amount of padding allowed for aligning the basic block
+  void setMaxBytesForAlignment(unsigned MaxBytes) {
+    MaxBytesForAlignment = MaxBytes;
+  }
+
   /// Returns true if the block is a landing pad. That is this basic block is
   /// entered via an exception handler.
   bool isEHPad() const { return IsEHPad; }
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h
 b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1771,11 +1771,14 @@
   /// Return the preferred loop alignment.
   virtual Align getPrefLoopAlignment(MachineLoop *ML = nullptr) const;
 
+  /// Return the maximum number of bytes allowed to be emitted when padding
+  /// for alignment.
+  virtual unsigned
+  getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const;
+
   /// Should loops be aligned even when the function is marked OptSize (but not
   /// MinSize).
-  virtual bool alignLoopsWithOptSize() const {
-    return false;
-  }
+  virtual bool alignLoopsWithOptSize() const { return false; }
 
   /// If the target has a standard location for the stack protector guard,
   /// returns the address of that location. Otherwise, returns nullptr.
@@ -2312,6 +2315,11 @@
   /// means the target does not care about loop alignment. The target may also
   /// override getPrefLoopAlignment to provide per-loop values.
   void setPrefLoopAlignment(Align Alignment) { PrefLoopAlignment = Alignment; }
+  /// Set the maximum number of bytes allowed to be emitted when padding for
+  /// alignment; zero means no maximum.
+  void setMaxBytesForAlignment(unsigned MaxBytes) {
+    MaxBytesForAlignment = MaxBytes;
+  }
 
   /// Set the minimum stack alignment of an argument.
   void setMinStackArgumentAlignment(Align Alignment) {
@@ -2951,6 +2959,8 @@
 
   /// The preferred loop alignment (in log2 bot in bytes).
   Align PrefLoopAlignment;
+  /// Maximum bytes permitted as padding for alignment; zero means no maximum.
+  unsigned MaxBytesForAlignment = 0;
 
   /// Size in bits of the maximum atomics size the backend supports.
   /// Accesses larger than this will be expanded by AtomicExpandPass.
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
 b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2447,7 +2447,10 @@
 // two boundary.  If a global value is specified, and if that global has
 // an explicit alignment requested, it will override the alignment request
 // if required for correctness.
-void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV) const {
+// If MaxBytesToEmit is non-zero, no padding is emitted at all when the
+// alignment cannot be reached within that many bytes.
+void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV,
+                               unsigned MaxBytesToEmit) const {
   if (GV)
     Alignment = getGVAlignment(GV, GV->getParent()->getDataLayout(), Alignment);
 
@@ -2460,9 +2463,9 @@
      STI = &getSubtargetInfo();
    else
      STI = TM.getMCSubtargetInfo();
-    OutStreamer->emitCodeAlignment(Alignment.value(), STI);
+    OutStreamer->emitCodeAlignment(Alignment.value(), STI, MaxBytesToEmit);
  } else
-    OutStreamer->emitValueToAlignment(Alignment.value());
+    OutStreamer->emitValueToAlignment(Alignment.value(), 0, 1, MaxBytesToEmit);
 }
 
//===----------------------------------------------------------------------===//
@@ -3253,7 +3256,7 @@
   // Emit an alignment directive for this block, if needed.
   const Align Alignment = MBB.getAlignment();
   if (Alignment != Align(1))
-    emitAlignment(Alignment);
+    emitAlignment(Alignment, nullptr, MBB.getMaxBytesForAlignment());
 
   // Switch to a new section if this basic block must begin a section. The
   // entry block is always placed in the function section and is handled
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
 b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -95,6 +95,12 @@
             "format (e.g 4 means align on 16B boundaries)."),
    cl::init(0), cl::Hidden);
 
+static cl::opt<unsigned> MaxBytesForAlignmentOverride(
+    "max-bytes-for-alignment",
+    cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
+             "alignment"),
+    cl::init(0), cl::Hidden);
+
 // FIXME: Find a good default for this flag and remove the flag.
 static cl::opt<unsigned> ExitBlockBias(
    "block-placement-exit-block-bias",
@@ -2914,10 +2920,21 @@
    MachineBasicBlock *LayoutPred =
        &*std::prev(MachineFunction::iterator(ChainBB));
 
+    auto DetermineMaxAlignmentPadding = [&]() {
+      // Set the maximum bytes allowed to be emitted for alignment.
+      unsigned MaxBytes;
+      if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
+        MaxBytes = MaxBytesForAlignmentOverride;
+      else
+        MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
+      ChainBB->setMaxBytesForAlignment(MaxBytes);
+    };
+
    // Force alignment if all the predecessors are jumps. We already checked
    // that the block isn't cold above.
    if (!LayoutPred->isSuccessor(ChainBB)) {
      ChainBB->setAlignment(Align);
+      DetermineMaxAlignmentPadding();
      continue;
    }
 
@@ -2928,8 +2945,10 @@
    BranchProbability LayoutProb =
        MBPI->getEdgeProbability(LayoutPred, ChainBB);
    BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
-    if (LayoutEdgeFreq <= (Freq * ColdProb))
+    if (LayoutEdgeFreq <= (Freq * ColdProb)) {
      ChainBB->setAlignment(Align);
+      DetermineMaxAlignmentPadding();
+    }
  }
 }
 
@@ -3394,17 +3413,30 @@
  ComputedEdges.clear();
  ChainAllocator.DestroyAll();
 
+  bool HasMaxBytesOverride =
+      MaxBytesForAlignmentOverride.getNumOccurrences() > 0;
+
  if (AlignAllBlock)
    // Align all of the blocks in the function to a specific alignment.
-    for (MachineBasicBlock &MBB : MF)
-      MBB.setAlignment(Align(1ULL << AlignAllBlock));
+    for (MachineBasicBlock &MBB : MF) {
+      if (HasMaxBytesOverride)
+        MBB.setAlignment(Align(1ULL << AlignAllBlock),
+                         MaxBytesForAlignmentOverride);
+      else
+        MBB.setAlignment(Align(1ULL << AlignAllBlock));
+    }
  else if (AlignAllNonFallThruBlocks) {
    // Align all of the blocks that have no fall-through predecessors to a
    // specific alignment.
    for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
      auto LayoutPred = std::prev(MBI);
-      if (!LayoutPred->isSuccessor(&*MBI))
-        MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
+      if (!LayoutPred->isSuccessor(&*MBI)) {
+        if (HasMaxBytesOverride)
+          MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks),
+                            MaxBytesForAlignmentOverride);
+        else
+          MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
+      }
    }
  }
  if (ViewBlockLayoutWithBFI != GVDT_None &&
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp
 b/llvm/lib/CodeGen/TargetLoweringBase.cpp
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2026,6 +2026,14 @@
  return PrefLoopAlignment;
 }
 
+// Return the maximum number of bytes that may be emitted when padding for
+// alignment; the base implementation returns the value configured via
+// setMaxBytesForAlignment, ignoring the specific block.
+unsigned TargetLoweringBase::getMaxPermittedBytesForAlignment(
+    MachineBasicBlock *MBB) const {
+  return MaxBytesForAlignment;
+}
+
 //===----------------------------------------------------------------------===//
 //  Reciprocal Estimates
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll b/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll
@@ -0,0 +1,93 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -max-bytes-for-alignment=8 --align-loops=32 < %s -o -| FileCheck %s --check-prefixes=CHECK,CHECK-EXPLICIT
+; RUN: llc -mtriple=aarch64-none-linux-gnu --align-loops=32 < %s -o -| FileCheck %s --check-prefixes=CHECK,CHECK-IMPLICIT
+; RUN: llc -mtriple=aarch64-none-linux-gnu --align-loops=32 < %s -o - --filetype=obj | llvm-objdump --arch=aarch64  -d -| FileCheck %s --check-prefixes=CHECK-OBJ,CHECK-OBJ-IMPLICIT
+; RUN: llc -mtriple=aarch64-none-linux-gnu -max-bytes-for-alignment=8 --align-loops=32 < %s -o - --filetype=obj | llvm-objdump --arch=aarch64  -d -| FileCheck %s --check-prefixes=CHECK-OBJ,CHECK-OBJ-EXPLICIT
+
+; This test checks that the correct operands to .p2align are emitted, and that
+; the resulting object is padded as expected. The key interest in the
+; CHECK-OBJ-* checks is the size of the padding region (the nops), not the
+; exact instructions on either side of them (but the last instruction of the
+; EXPLICIT and IMPLICIT checks should be the same, at different locations)
+define i32 @a(i32 %x, i32* nocapture readonly %y, i32* nocapture readonly %z) {
+; CHECK-LABEL: a:
+; CHECK-EXPLICIT:    .p2align 5, 0x0, 8
+; CHECK-IMPLICIT:    .p2align 5
+; CHECK-NEXT:  .LBB0_5: // %vector.body
+; CHECK-EXPLICIT:    .p2align 5, 0x0, 8
+; CHECK-IMPLICIT:    .p2align 5
+; CHECK-NEXT:  .LBB0_8: // %for.body
+; CHECK-OBJ: Disassembly of section .text:
+; CHECK-OBJ:               88: 2a 00 0a 8b   add
+; CHECK-OBJ-IMPLICIT-NEXT: 8c: 1f 20 03 d5   nop
+; CHECK-OBJ-IMPLICIT-NEXT: 90: 1f 20 03 d5   nop
+; CHECK-OBJ-IMPLICIT-NEXT: 94: 1f 20 03 d5   nop
+; CHECK-OBJ-IMPLICIT-NEXT: 98: 1f 20 03 d5   nop
+; CHECK-OBJ-IMPLICIT-NEXT: 9c: 1f 20 03 d5   nop
+; CHECK-OBJ-IMPLICIT-NEXT: a0: 4b 45 40 b8   ldr
+; CHECK-OBJ-EXPLICIT-NEXT: 8c: 4b 45 40 b8   ldr
+entry:
+  %cmp10 = icmp sgt i32 %x, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %x to i64
+  %min.iters.check = icmp ult i32 %x, 8
+  br i1 %min.iters.check, label %for.body.preheader17, label %vector.ph
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i64 %wide.trip.count, 4294967288
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %10, %vector.body ]
+  %vec.phi13 = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %y, i64 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = getelementptr inbounds i32, i32* %0, i64 4
+  %3 = bitcast i32* %2 to <4 x i32>*
+  %wide.load14 = load <4 x i32>, <4 x i32>* %3, align 4
+  %4 = getelementptr inbounds i32, i32* %z, i64 %index
+  %5 = bitcast i32* %4 to <4 x i32>*
+  %wide.load15 = load <4 x i32>, <4 x i32>* %5, align 4
+  %6 = getelementptr inbounds i32, i32* %4, i64 4
+  %7 = bitcast i32* %6 to <4 x i32>*
+  %wide.load16 = load <4 x i32>, <4 x i32>* %7, align 4
+  %8 = add <4 x i32> %wide.load, %vec.phi
+  %9 = add <4 x i32> %wide.load14, %vec.phi13
+  %10 = add <4 x i32> %8, %wide.load15
+  %11 = add <4 x i32> %9, %wide.load16
+  %index.next = add nuw i64 %index, 8
+  %12 = icmp eq i64 %index.next, %n.vec
+  br i1 %12, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %bin.rdx = add <4 x i32> %11, %10
+  %13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader17
+
+for.body.preheader17:                             ; preds = %for.body.preheader, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
+  %b.011.ph = phi i32 [ 0, %for.body.preheader ], [ %13, %middle.block ]
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  %b.0.lcssa = phi i32 [ 0, %entry ], [ %13, %middle.block ], [ %add3, %for.body ]
+  ret i32 %b.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader17, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader17 ]
+  %b.011 = phi i32 [ %add3, %for.body ], [ %b.011.ph, %for.body.preheader17 ]
+  %arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %14 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %z, i64 %indvars.iv
+  %15 = load i32, i32* %arrayidx2, align 4
+  %add = add i32 %14, %b.011
+  %add3 = add i32 %add, %15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)