Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -113,6 +113,11 @@
 static cl::opt<bool> DisableSCO("disable-ppc-sco",
 cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
 
+static cl::opt<bool> DisableInnermostLoopAlign32(
+    "disable-ppc-innermost-loop-align32",
+    cl::desc("don't always align innermost loop to 32 bytes on ppc"),
+    cl::Hidden);
+
 static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision",
 cl::desc("enable quad precision float support on ppc"), cl::Hidden);
 
@@ -13843,6 +13848,15 @@
     if (!ML)
       break;
 
+    if (!DisableInnermostLoopAlign32) {
+      // If the nested loop is an innermost loop, prefer a 32-byte alignment,
+      // so that we can decrease cache misses and branch-prediction misses.
+      // Actual alignment of the loop will depend on the hotness check and other
+      // logic in alignBlocks.
+      if (ML->getLoopDepth() > 1 && ML->getSubLoops().empty())
+        return 5;
+    }
+
     const PPCInstrInfo *TII = Subtarget.getInstrInfo();
 
     // For small loops (between 5 and 8 instructions), align to a 32-byte
Index: llvm/test/CodeGen/PowerPC/code-align.ll
===================================================================
--- llvm/test/CodeGen/PowerPC/code-align.ll
+++ llvm/test/CodeGen/PowerPC/code-align.ll
@@ -87,6 +87,56 @@
   ret void
 }
 
+; Test the innermost loop alignment without PGO profile data.
+define void @nested_loop(i32* nocapture %s, i32 signext %m, i32 signext %n) {
+entry:
+  %h = alloca i32, align 4
+  %h.0.h.0..sroa_cast = bitcast i32* %h to i8*
+  br label %do.body
+
+; CHECK-LABEL: @nested_loop
+; CHECK: mtctr
+; GENERIC-NOT: .p2align
+; BASIC: .p2align 4
+; PWR: .p2align 5
+; CHECK: bdnz
+
+do.body:                                          ; preds = %while.end, %entry
+  %n.addr.0 = phi i32 [ %n, %entry ], [ -1, %while.end ]
+  %m.addr.0 = phi i32 [ %m, %entry ], [ %dec2, %while.end ]
+  %tobool12 = icmp eq i32 %n.addr.0, 0
+  br i1 %tobool12, label %while.end, label %while.body.lr.ph
+
+while.body.lr.ph:                                 ; preds = %do.body
+  %idxprom = sext i32 %n.addr.0 to i64
+  br label %while.body
+
+while.body:                                       ; preds = %while.body.lr.ph, %while.body
+  %n.addr.113 = phi i32 [ %n.addr.0, %while.body.lr.ph ], [ %dec, %while.body ]
+  %dec = add nsw i32 %n.addr.113, -1
+  store volatile i32 %n.addr.0, i32* %h, align 4
+  %h.0.h.0. = load volatile i32, i32* %h, align 4
+  %dec1 = add nsw i32 %h.0.h.0., -1
+  store volatile i32 %dec1, i32* %h, align 4
+  %h.0.h.0.5 = load volatile i32, i32* %h, align 4
+  %tobool = icmp eq i32 %dec, 0
+  br i1 %tobool, label %while.cond.while.end_crit_edge, label %while.body
+
+while.cond.while.end_crit_edge:                   ; preds = %while.body
+  %arrayidx = getelementptr inbounds i32, i32* %s, i64 %idxprom
+  %add = add nsw i32 %h.0.h.0.5, %n.addr.0
+  store i32 %add, i32* %arrayidx, align 4
+  br label %while.end
+
+while.end:                                        ; preds = %while.cond.while.end_crit_edge, %do.body
+  %dec2 = add nsw i32 %m.addr.0, -1
+  %tobool3 = icmp eq i32 %m.addr.0, 0
+  br i1 %tobool3, label %do.end, label %do.body
+
+do.end:                                           ; preds = %while.end
+  ret void
+}
+
 ; Function Attrs: nounwind
 define void @test_minsize(i32 signext %x, i32* nocapture %a) #2 {
 entry:
Index: llvm/test/CodeGen/PowerPC/loop-align-pgo.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/PowerPC/loop-align-pgo.ll
@@ -0,0 +1,252 @@
+; Test the innermost loop alignment with PGO profile data.
+; RUN: llc -verify-machineinstrs -mcpu=a2 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,GENERIC
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR
+
+; Test the loop alignment with PGO profile data and the option -disable-ppc-innermost-loop-align32.
+; RUN: llc -verify-machineinstrs -mcpu=a2 -disable-ppc-innermost-loop-align32 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -disable-ppc-innermost-loop-align32 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -disable-ppc-innermost-loop-align32 -mtriple powerpc64le-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -disable-ppc-innermost-loop-align32 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -disable-ppc-innermost-loop-align32 -mtriple powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefixes=CHECK,PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32
+
+
+%struct.parm = type { i32*, i32, i32 }
+
+; Test the loop alignment when the innermost hot loop has more than 8 instructions.
+define void @big_loop(%struct.parm* %arg) !prof !29 {
+entry:
+  %localArg.sroa.0.0..sroa_idx = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 0
+  %localArg.sroa.0.0.copyload = load i32*, i32** %localArg.sroa.0.0..sroa_idx, align 8
+  %localArg.sroa.4.0..sroa_idx56 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 1
+  %localArg.sroa.4.0.copyload = load i32, i32* %localArg.sroa.4.0..sroa_idx56, align 8
+  %localArg.sroa.5.0..sroa_idx58 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 2
+  %localArg.sroa.5.0.copyload = load i32, i32* %localArg.sroa.5.0..sroa_idx58, align 4
+  %0 = sext i32 %localArg.sroa.5.0.copyload to i64
+  br label %do.body
+
+do.body:                                          ; preds = %do.end, %entry
+  %m.0 = phi i32 [ %localArg.sroa.4.0.copyload, %entry ], [ %dec24, %do.end ]
+  br label %do.body3
+
+do.body3:                                         ; preds = %do.body3, %do.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %do.body3 ], [ %0, %do.body ]
+  %1 = add nsw i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %1
+  %2 = add nsw i64 %indvars.iv, 3
+  %3 = trunc i64 %1 to i32
+  %4 = add nsw i64 %indvars.iv, 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %2
+  %5 = trunc i64 %2 to i32
+  store i32 %5, i32* %arrayidx10, align 4, !tbaa !30
+  %arrayidx12 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %4
+  %6 = trunc i64 %4 to i32
+  store i32 %6, i32* %arrayidx12, align 4, !tbaa !30
+  store i32 %3, i32* %arrayidx, align 4, !tbaa !30
+  %arrayidx21 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %indvars.iv
+  %7 = trunc i64 %indvars.iv to i32
+  %8 = add i32 %7, 1
+  store i32 %8, i32* %arrayidx21, align 4, !tbaa !30
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %9 = icmp eq i64 %indvars.iv, 0
+  br i1 %9, label %do.end, label %do.body3, !prof !34
+
+do.end:                                           ; preds = %do.body3
+  %dec24 = add nsw i32 %m.0, -1
+  %tobool25 = icmp eq i32 %m.0, 0
+  br i1 %tobool25, label %do.end26, label %do.body, !prof !35
+
+do.end26:                                         ; preds = %do.end
+  %arrayidx28 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %0
+  store i32 0, i32* %arrayidx28, align 4, !tbaa !30
+  ret void
+
+
+; CHECK-LABEL: @big_loop
+; CHECK: mtctr
+; GENERIC: .p2align 4
+; PWR: .p2align 5
+; GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align 4
+; PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align 4
+; CHECK: bdnz
+}
+
+; Test the loop alignment when the innermost hot loop has 5-8 instructions.
+define void @general_loop(i32* %s, i64 %m) !prof !29 {
+entry:
+  %tobool40 = icmp eq i64 %m, 0
+  br i1 %tobool40, label %while.end18, label %while.body3.lr.ph, !prof !36
+
+while.cond.loopexit:                              ; preds = %while.body3
+  %tobool = icmp eq i64 %dec, 0
+  br i1 %tobool, label %while.end18, label %while.body3.lr.ph, !prof !36
+
+while.body3.lr.ph:                                ; preds = %entry, %while.cond.loopexit
+  %m.addr.041 = phi i64 [ %dec, %while.cond.loopexit ], [ %m, %entry ]
+  %dec = add nsw i64 %m.addr.041, -1
+  %conv = trunc i64 %m.addr.041 to i32
+  %conv11 = trunc i64 %dec to i32
+  br label %while.body3
+
+while.body3:                                      ; preds = %while.body3.lr.ph, %while.body3
+  %n.039 = phi i64 [ %m.addr.041, %while.body3.lr.ph ], [ %dec16, %while.body3 ]
+  %inc = add nsw i64 %n.039, 1
+  %arrayidx = getelementptr inbounds i32, i32* %s, i64 %n.039
+  %inc5 = add nsw i64 %n.039, 2
+  %arrayidx6 = getelementptr inbounds i32, i32* %s, i64 %inc
+  %sub = sub nsw i64 %dec, %inc5
+  %conv7 = trunc i64 %sub to i32
+  %arrayidx9 = getelementptr inbounds i32, i32* %s, i64 %inc5
+  store i32 %conv7, i32* %arrayidx9, align 4, !tbaa !30
+  store i32 %conv11, i32* %arrayidx6, align 4, !tbaa !30
+  store i32 %conv, i32* %arrayidx, align 4, !tbaa !30
+  %dec16 = add nsw i64 %n.039, -1
+  %tobool2 = icmp eq i64 %dec16, 0
+  br i1 %tobool2, label %while.cond.loopexit, label %while.body3, !prof !37
+
+while.end18:                                      ; preds = %while.cond.loopexit, %entry
+  ret void
+
+
+; CHECK-LABEL: @general_loop
+; CHECK: mtctr
+; GENERIC: .p2align 4
+; PWR: .p2align 5
+; GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align 4
+; PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align 5
+; CHECK: bdnz
+}
+
+; Test the small loop alignment when the innermost hot loop has fewer than 4 instructions.
+define void @small_loop(i64 %m) !prof !29 {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.end, %entry
+  %m.addr.0 = phi i64 [ %m, %entry ], [ %1, %do.end ]
+  br label %do.body1
+
+do.body1:                                         ; preds = %do.body1, %do.body
+  %n.0 = phi i64 [ %m.addr.0, %do.body ], [ %0, %do.body1 ]
+  %0 = tail call i64 asm "subi $0,$0,1", "=r,0"(i64 %n.0) #4, !srcloc !38
+  %tobool = icmp eq i64 %0, 0
+  br i1 %tobool, label %do.end, label %do.body1, !prof !39
+
+do.end:                                           ; preds = %do.body1
+  %1 = tail call i64 asm "subi $1,$1,1", "=r,0"(i64 %m.addr.0) #4, !srcloc !40
+  %tobool3 = icmp eq i64 %1, 0
+  br i1 %tobool3, label %do.end4, label %do.body, !prof !41
+
+do.end4:                                          ; preds = %do.end
+  ret void
+
+
+; CHECK-LABEL: @small_loop
+; CHECK: beqlr
+; GENERIC: .p2align 4
+; PWR: .p2align 5
+; GENERIC-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align 4
+; PWR-DISABLE-PPC-INNERMOST-LOOP-ALIGN32: .p2align 4
+; CHECK: bne
+}
+
+; Test the loop alignment when the innermost cold loop has more than 8 instructions.
+define void @big_loop_cold_innerloop(%struct.parm* %arg) !prof !29 {
+entry:
+  %localArg.sroa.0.0..sroa_idx = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 0
+  %localArg.sroa.0.0.copyload = load i32*, i32** %localArg.sroa.0.0..sroa_idx, align 8
+  %localArg.sroa.4.0..sroa_idx56 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 1
+  %localArg.sroa.4.0.copyload = load i32, i32* %localArg.sroa.4.0..sroa_idx56, align 8
+  %localArg.sroa.5.0..sroa_idx58 = getelementptr inbounds %struct.parm, %struct.parm* %arg, i64 0, i32 2
+  %localArg.sroa.5.0.copyload = load i32, i32* %localArg.sroa.5.0..sroa_idx58, align 4
+  %0 = sext i32 %localArg.sroa.5.0.copyload to i64
+  br label %do.body
+
+do.body:                                          ; preds = %do.end, %entry
+  %m.0 = phi i32 [ %localArg.sroa.4.0.copyload, %entry ], [ %dec24, %do.end ]
+  br label %do.body3
+
+do.body3:                                         ; preds = %do.body3, %do.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %do.body3 ], [ %0, %do.body ]
+  %1 = add nsw i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %1
+  %2 = add nsw i64 %indvars.iv, 3
+  %3 = trunc i64 %1 to i32
+  %4 = add nsw i64 %indvars.iv, 4
+  %arrayidx10 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %2
+  %5 = trunc i64 %2 to i32
+  store i32 %5, i32* %arrayidx10, align 4, !tbaa !30
+  %arrayidx12 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %4
+  %6 = trunc i64 %4 to i32
+  store i32 %6, i32* %arrayidx12, align 4, !tbaa !30
+  store i32 %3, i32* %arrayidx, align 4, !tbaa !30
+  %arrayidx21 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %indvars.iv
+  %7 = trunc i64 %indvars.iv to i32
+  %8 = add i32 %7, 1
+  store i32 %8, i32* %arrayidx21, align 4, !tbaa !30
+  %indvars.iv.next = add nsw i64 %indvars.iv, -1
+  %9 = icmp eq i64 %indvars.iv, 0
+  br i1 %9, label %do.end, label %do.body3, !prof !42
+
+do.end:                                           ; preds = %do.body3
+  %dec24 = add nsw i32 %m.0, -1
+  %tobool25 = icmp eq i32 %m.0, 0
+  br i1 %tobool25, label %do.end26, label %do.body, !prof !43
+
+do.end26:                                         ; preds = %do.end
+  %arrayidx28 = getelementptr inbounds i32, i32* %localArg.sroa.0.0.copyload, i64 %0
+  store i32 0, i32* %arrayidx28, align 4, !tbaa !30
+  ret void
+
+
+; CHECK-LABEL: @big_loop_cold_innerloop
+; CHECK: mtctr
+; CHECK-NOT: .p2align
+; CHECK: bdnz
+}
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 2018205}
+!5 = !{!"MaxCount", i64 1999000}
+!6 = !{!"MaxInternalCount", i64 5050}
+!7 = !{!"MaxFunctionCount", i64 1999000}
+!8 = !{!"NumCounts", i64 13}
+!9 = !{!"NumFunctions", i64 5}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27}
+!12 = !{i32 10000, i64 1999000, i32 1}
+!13 = !{i32 100000, i64 1999000, i32 1}
+!14 = !{i32 200000, i64 1999000, i32 1}
+!15 = !{i32 300000, i64 1999000, i32 1}
+!16 = !{i32 400000, i64 1999000, i32 1}
+!17 = !{i32 500000, i64 1999000, i32 1}
+!18 = !{i32 600000, i64 1999000, i32 1}
+!19 = !{i32 700000, i64 1999000, i32 1}
+!20 = !{i32 800000, i64 1999000, i32 1}
+!21 = !{i32 900000, i64 1999000, i32 1}
+!22 = !{i32 950000, i64 1999000, i32 1}
+!23 = !{i32 990000, i64 1999000, i32 1}
+!24 = !{i32 999000, i64 2000, i32 5}
+!25 = !{i32 999900, i64 2000, i32 5}
+!26 = !{i32 999990, i64 33, i32 7}
+!27 = !{i32 999999, i64 1, i32 13}
+!28 = !{!"clang version 9.0.0 "}
+!29 = !{!"function_entry_count", i64 1}
+!30 = !{!31, !31, i64 0}
+!31 = !{!"int", !32, i64 0}
+!32 = !{!"omnipotent char", !33, i64 0}
+!33 = !{!"Simple C/C++ TBAA"}
+!34 = !{!"branch_weights", i32 2001, i32 10005}
+!35 = !{!"branch_weights", i32 1, i32 2000}
+!36 = !{!"branch_weights", i32 1, i32 100}
+!37 = !{!"branch_weights", i32 100, i32 5050}
+!38 = !{i32 782}
+!39 = !{!"branch_weights", i32 2000, i32 1999000}
+!40 = !{i32 837}
+!41 = !{!"branch_weights", i32 1, i32 1999}
+!42 = !{!"branch_weights", i32 11, i32 33}
+!43 = !{!"branch_weights", i32 1, i32 10}