Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -113,6 +113,9 @@
 static cl::opt<bool> DisableSCO("disable-ppc-sco",
 cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
 
+static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
+cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
+
 static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision",
 cl::desc("enable quad precision float support on ppc"), cl::Hidden);
 
@@ -13843,6 +13846,15 @@
     if (!ML)
       break;
 
+    if (!DisableInnermostLoopAlign32) {
+      // If the nested loop is an innermost loop, prefer a 32-byte alignment
+      // so that we can decrease cache misses and branch-prediction misses.
+      // The actual alignment of the loop will depend on the hotness check and
+      // other logic in alignBlocks.
+      if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty()) 
+        return 5;
+    }
+
     const PPCInstrInfo *TII = Subtarget.getInstrInfo();
 
     // For small loops (between 5 and 8 instructions), align to a 32-byte
Index: llvm/test/CodeGen/PowerPC/code-align.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/code-align.ll
+++ llvm/test/CodeGen/PowerPC/code-align.ll
@@ -87,6 +87,56 @@
   ret void
 }
 
+; Test the innermost loop alignment without PGO profile data.
+define void @nested_loop(i32* nocapture %s, i32 signext %m, i32 signext %n) {
+entry:
+  %h = alloca i32, align 4
+  %h.0.h.0..sroa_cast = bitcast i32* %h to i8*
+  br label %do.body
+
+; CHECK-LABEL: @nested_loop
+; CHECK: mtctr
+; GENERIC-NOT: .p2align
+; BASIC: .p2align  4
+; PWR: .p2align  5
+; CHECK: bdnz
+
+do.body:                                          ; preds = %while.end, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ -1, %while.end ]
+  %m.addr.0 = phi i32 [ %m, %entry ], [ %dec2, %while.end ]
+  %tobool12 = icmp eq i32 %n.addr.0, 0
+  br i1 %tobool12, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:                                 ; preds = %do.body
+  %idxprom = sext i32 %n.addr.0 to i64
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.lr.ph, %while.body
+  %n.addr.113 = phi i32 [ %n.addr.0, %while.body.lr.ph ], [ %dec, %while.body ]
+  %dec = add nsw i32 %n.addr.113, -1
+  store volatile i32 %n.addr.0, i32* %h, align 4
+  %h.0.h.0. = load volatile i32, i32* %h, align 4
+  %dec1 = add nsw i32 %h.0.h.0., -1
+  store volatile i32 %dec1, i32* %h, align 4
+  %h.0.h.0.5 = load volatile i32, i32* %h, align 4
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.cond.while.end_crit_edge, label %while.body
+
+while.cond.while.end_crit_edge:                   ; preds = %while.body
+  %arrayidx = getelementptr inbounds i32, i32* %s, i64 %idxprom
+  %add = add nsw i32 %h.0.h.0.5, %n.addr.0
+  store i32 %add, i32* %arrayidx, align 4
+  br label %while.end
+
+while.end:                                        ; preds = %while.cond.while.end_crit_edge, %do.body
+  %dec2 = add nsw i32 %m.addr.0, -1
+  %tobool3 = icmp eq i32 %m.addr.0, 0
+  br i1 %tobool3, label %do.end, label %do.body
+
+do.end:                                           ; preds = %while.end
+  ret void
+}
+
 ; Function Attrs: nounwind
 define void @test_minsize(i32 signext %x, i32* nocapture %a) #2 {
 entry:
Index: llvm/test/CodeGen/PowerPC/loop-align-pgo.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/PowerPC/loop-align-pgo.ll
@@ -0,0 +1,252 @@
+; Test the loop alignment with PGO profile data.
+; RUN: llc -verify-machineinstrs -mcpu=a2 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,GENERIC
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
+
+; Test the loop alignment with PGO profile data and the option -disable-ppc-innermost-loop-align32.
+; RUN: llc -verify-machineinstrs -mcpu=a2 -disable-ppc-innermost-loop-align32 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -disable-ppc-innermost-loop-align32 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -disable-ppc-innermost-loop-align32 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -disable-ppc-innermost-loop-align32 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -disable-ppc-innermost-loop-align32 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
+
+
+%struct.parm = type { i32*, i32, i32 }
+
+; Test the loop alignment when the innermost hot loop has more than 8 instructions.
+define void @big_loop(%struct.parm* %arg) !prof !29 {
+entry:
+  %localArg.sroa.0.0..sroa_idx = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 0
+  %localArg.sroa.0.0.copyload = load i32*, i32** %localArg.sroa.0.0..sroa_idx, align 8
+  %localArg.sroa.4.0..sroa_idx56 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 1
+  %localArg.sroa.4.0.copyload = load i32, i32* %localArg.sroa.4.0..sroa_idx56, align 8
+  %localArg.sroa.5.0..sroa_idx58 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 2
+  %localArg.sroa.5.0.copyload = load i32, i32* %localArg.sroa.5.0..sroa_idx58, align 4
+  %0 = sext i32 %localArg.sroa.5.0.copyload to i64
+  br label %do.body
+
+do.body:                                          ; preds = %do.end, %entry
+  %m.0 = phi i32 [ %localArg.sroa.4.0.copyload, %entry ], [ %dec24, %do.end ]
+  br label %do.body3
+
+do.body3:                                         ; preds = %do.body3, %do.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %do.body3 ], [ %0, %do.body ]
+  %1 = add nsw i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %1
+  %2 = add nsw i64 %indvars.iv, 3
+  %3 = trunc i64 %1 to i32
+  %4 = add nsw i64 %indvars.iv, 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %2
+  %5 = trunc i64 %2 to i32
+  store i32 %5, i32* %arrayidx10, align 4, !tbaa !30
+  %arrayidx12 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %4
+  %6 = trunc i64 %4 to i32
+  store i32 %6, i32* %arrayidx12, align 4, !tbaa !30
+  store i32 %3, i32* %arrayidx, align 4, !tbaa !30
+  %arrayidx21 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %indvars.iv
+  %7 = trunc i64 %indvars.iv to i32
+  %8 = add i32 %7, 1
+  store i32 %8, i32* %arrayidx21, align 4, !tbaa !30
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %9 = icmp eq i64 %indvars.iv, 0
+  br i1 %9, label %do.end, label %do.body3, !prof !34
+
+do.end:                                           ; preds = %do.body3
+  %dec24 = add nsw i32 %m.0, -1
+  %tobool25 = icmp eq i32 %m.0, 0
+  br i1 %tobool25, label %do.end26, label %do.body, !prof !35
+
+do.end26:                                         ; preds = %do.end
+  %arrayidx28 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %0
+  store i32 0, i32* %arrayidx28, align 4, !tbaa !30
+  ret void
+
+
+; CHECK-LABEL: @big_loop
+; CHECK: mtctr 
+; GENERIC: .p2align  4
+; PWR: .p2align  5
+; GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align  4
+; PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align  4  
+; CHECK: bdnz 
+}
+
+; Test the loop alignment when the innermost hot loop has 5-8 instructions.
+define void @general_loop(i32* %s, i64 %m) !prof !29 {
+entry:
+  %tobool40 = icmp eq i64 %m, 0
+  br i1 %tobool40, label %while.end18, label %while.body3.lr.ph, !prof !36
+
+while.cond.loopexit:                              ; preds = %while.body3
+  %tobool = icmp eq i64 %dec, 0
+  br i1 %tobool, label %while.end18, label %while.body3.lr.ph, !prof !36
+
+while.body3.lr.ph:                                ; preds = %entry, %while.cond.loopexit
+  %m.addr.041 = phi i64 [ %dec, %while.cond.loopexit ], [ %m, %entry ]
+  %dec = add nsw i64 %m.addr.041, -1
+  %conv = trunc i64 %m.addr.041 to i32
+  %conv11 = trunc i64 %dec to i32
+  br label %while.body3
+
+while.body3:                                      ; preds = %while.body3.lr.ph, %while.body3
+  %n.039 = phi i64 [ %m.addr.041, %while.body3.lr.ph ], [ %dec16, %while.body3 ]
+  %inc = add nsw i64 %n.039, 1
+  %arrayidx = getelementptr inbounds i32, i32* %s, i64 %n.039
+  %inc5 = add nsw i64 %n.039, 2
+  %arrayidx6 = getelementptr inbounds i32, i32* %s, i64 %inc
+  %sub = sub nsw i64 %dec, %inc5
+  %conv7 = trunc i64 %sub to i32
+  %arrayidx9 = getelementptr inbounds i32, i32* %s, i64 %inc5
+  store i32 %conv7, i32* %arrayidx9, align 4, !tbaa !30
+  store i32 %conv11, i32* %arrayidx6, align 4, !tbaa !30
+  store i32 %conv, i32* %arrayidx, align 4, !tbaa !30
+  %dec16 = add nsw i64 %n.039, -1
+  %tobool2 = icmp eq i64 %dec16, 0
+  br i1 %tobool2, label %while.cond.loopexit, label %while.body3, !prof !37
+
+while.end18:                                      ; preds = %while.cond.loopexit, %entry
+  ret void
+
+
+; CHECK-LABEL: @general_loop
+; CHECK: mtctr 
+; GENERIC: .p2align  4
+; PWR: .p2align  5
+; GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align  4
+; PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align  5  
+; CHECK: bdnz
+}
+
+; Test the small loop alignment when the innermost hot loop has fewer than 4 instructions.
+define void @small_loop(i64 %m) !prof !29 {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.end, %entry
+  %m.addr.0 = phi i64 [ %m, %entry ], [ %1, %do.end ]
+  br label %do.body1
+
+do.body1:                                         ; preds = %do.body1, %do.body
+  %n.0 = phi i64 [ %m.addr.0, %do.body ], [ %0, %do.body1 ]
+  %0 = tail call i64 asm "subi     $0,$0,1", "=r,0"(i64 %n.0) #4, !srcloc !38
+  %tobool = icmp eq i64 %0, 0
+  br i1 %tobool, label %do.end, label %do.body1, !prof !39
+
+do.end:                                           ; preds = %do.body1
+  %1 = tail call i64 asm "subi     $1,$1,1", "=r,0"(i64 %m.addr.0) #4, !srcloc !40
+  %tobool3 = icmp eq i64 %1, 0
+  br i1 %tobool3, label %do.end4, label %do.body, !prof !41
+
+do.end4:                                          ; preds = %do.end
+  ret void
+
+
+; CHECK-LABEL: @small_loop
+; CHECK: beqlr 
+; GENERIC: .p2align  4
+; PWR: .p2align  5
+; GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align  4
+; PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align  4  
+; CHECK: bne
+}
+
+; Test the loop alignment when the innermost cold loop has more than 8 instructions.
+define void @big_loop_cold_innerloop(%struct.parm* %arg) !prof !29 {
+entry:
+  %localArg.sroa.0.0..sroa_idx = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 0
+  %localArg.sroa.0.0.copyload = load i32*, i32** %localArg.sroa.0.0..sroa_idx, align 8
+  %localArg.sroa.4.0..sroa_idx56 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 1
+  %localArg.sroa.4.0.copyload = load i32, i32* %localArg.sroa.4.0..sroa_idx56, align 8
+  %localArg.sroa.5.0..sroa_idx58 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 2
+  %localArg.sroa.5.0.copyload = load i32, i32* %localArg.sroa.5.0..sroa_idx58, align 4
+  %0 = sext i32 %localArg.sroa.5.0.copyload to i64
+  br label %do.body
+
+do.body:                                          ; preds = %do.end, %entry
+  %m.0 = phi i32 [ %localArg.sroa.4.0.copyload, %entry ], [ %dec24, %do.end ]
+  br label %do.body3
+
+do.body3:                                         ; preds = %do.body3, %do.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %do.body3 ], [ %0, %do.body ]
+  %1 = add nsw i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %1
+  %2 = add nsw i64 %indvars.iv, 3
+  %3 = trunc i64 %1 to i32
+  %4 = add nsw i64 %indvars.iv, 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %2
+  %5 = trunc i64 %2 to i32
+  store i32 %5, i32* %arrayidx10, align 4, !tbaa !30
+  %arrayidx12 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %4
+  %6 = trunc i64 %4 to i32
+  store i32 %6, i32* %arrayidx12, align 4, !tbaa !30
+  store i32 %3, i32* %arrayidx, align 4, !tbaa !30
+  %arrayidx21 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %indvars.iv
+  %7 = trunc i64 %indvars.iv to i32
+  %8 = add i32 %7, 1
+  store i32 %8, i32* %arrayidx21, align 4, !tbaa !30
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %9 = icmp eq i64 %indvars.iv, 0
+  br i1 %9, label %do.end, label %do.body3, !prof !42
+
+do.end:                                           ; preds = %do.body3
+  %dec24 = add nsw i32 %m.0, -1
+  %tobool25 = icmp eq i32 %m.0, 0
+  br i1 %tobool25, label %do.end26, label %do.body, !prof !43
+
+do.end26:                                         ; preds = %do.end
+  %arrayidx28 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %0
+  store i32 0, i32* %arrayidx28, align 4, !tbaa !30
+  ret void
+
+
+; CHECK-LABEL: @big_loop_cold_innerloop
+; CHECK: mtctr 
+; CHECK-NOT: .p2align
+; CHECK: bdnz 
+}
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 2018205}
+!5 = !{!"MaxCount", i64 1999000}
+!6 = !{!"MaxInternalCount", i64 5050}
+!7 = !{!"MaxFunctionCount", i64 1999000}
+!8 = !{!"NumCounts", i64 13}
+!9 = !{!"NumFunctions", i64 5}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27}
+!12 = !{i32 10000, i64 1999000, i32 1}
+!13 = !{i32 100000, i64 1999000, i32 1}
+!14 = !{i32 200000, i64 1999000, i32 1}
+!15 = !{i32 300000, i64 1999000, i32 1}
+!16 = !{i32 400000, i64 1999000, i32 1}
+!17 = !{i32 500000, i64 1999000, i32 1}
+!18 = !{i32 600000, i64 1999000, i32 1}
+!19 = !{i32 700000, i64 1999000, i32 1}
+!20 = !{i32 800000, i64 1999000, i32 1}
+!21 = !{i32 900000, i64 1999000, i32 1}
+!22 = !{i32 950000, i64 1999000, i32 1}
+!23 = !{i32 990000, i64 1999000, i32 1}
+!24 = !{i32 999000, i64 2000, i32 5}
+!25 = !{i32 999900, i64 2000, i32 5}
+!26 = !{i32 999990, i64 33, i32 7}
+!27 = !{i32 999999, i64 1, i32 13}
+!28 = !{!"clang version 9.0.0 "}
+!29 = !{!"function_entry_count", i64 1}
+!30 = !{!31, !31, i64 0}
+!31 = !{!"int", !32, i64 0}
+!32 = !{!"omnipotent char", !33, i64 0}
+!33 = !{!"Simple C/C++ TBAA"}
+!34 = !{!"branch_weights", i32 2001, i32 10005}
+!35 = !{!"branch_weights", i32 1, i32 2000}
+!36 = !{!"branch_weights", i32 1, i32 100}
+!37 = !{!"branch_weights", i32 100, i32 5050}
+!38 = !{i32 782}
+!39 = !{!"branch_weights", i32 2000, i32 1999000}
+!40 = !{i32 837}
+!41 = !{!"branch_weights", i32 1, i32 1999}
+!42 = !{!"branch_weights", i32 11, i32 33}
+!43 = !{!"branch_weights", i32 1, i32 10}