Index: llvm/include/llvm/CodeGen/TargetLowering.h =================================================================== --- llvm/include/llvm/CodeGen/TargetLowering.h +++ llvm/include/llvm/CodeGen/TargetLowering.h @@ -1894,7 +1894,7 @@ /// Return the maximum amount of bytes allowed to be emitted when padding for /// alignment virtual unsigned - getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const; + getMaxPermittedBytesForAlignment() const; /// Should loops be aligned even when the function is marked OptSize (but not /// MinSize). Index: llvm/lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -399,6 +399,8 @@ ProfileSummaryInfo *PSI = nullptr; + unsigned MaxBytesForAlignment = 0; + /// Duplicator used to duplicate tails during placement. /// /// Placement decisions can open up new tail duplication opportunities, but @@ -2945,21 +2947,14 @@ MachineBasicBlock *LayoutPred = &*std::prev(MachineFunction::iterator(ChainBB)); - auto DetermineMaxAlignmentPadding = [&]() { - // Set the maximum bytes allowed to be emitted for alignment. - unsigned MaxBytes; - if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0) - MaxBytes = MaxBytesForAlignmentOverride; - else - MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB); - ChainBB->setMaxBytesForAlignment(MaxBytes); - }; - // Force alignment if all the predecessors are jumps. We already checked // that the block isn't cold above. if (!LayoutPred->isSuccessor(ChainBB)) { ChainBB->setAlignment(Align); - DetermineMaxAlignmentPadding(); + if (ChainBB == LoopHeader || MLI->getLoopFor(LayoutPred) != L) + ChainBB->setMaxBytesForAlignment(0); + else + ChainBB->setMaxBytesForAlignment(MaxBytesForAlignment); continue; } @@ -2970,10 +2965,8 @@ BranchProbability LayoutProb = MBPI->getEdgeProbability(LayoutPred, ChainBB); BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb; - if (LayoutEdgeFreq <= (Freq * ColdProb)) { - ChainBB->setAlignment(Align); - DetermineMaxAlignmentPadding(); - } + if (LayoutEdgeFreq <= (Freq * ColdProb)) + ChainBB->setAlignment(Align, MaxBytesForAlignment); } } @@ -3356,6 +3349,12 @@ MPDT = nullptr; PSI = &getAnalysis().getPSI(); + // Set the maximum bytes allowed to be emitted for alignment. + if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0) + MaxBytesForAlignment = MaxBytesForAlignmentOverride; + else + MaxBytesForAlignment = TLI->getMaxPermittedBytesForAlignment(); + initDupThreshold(); // Initialize PreferredLoopExit to nullptr here since it may never be set if @@ -3448,32 +3447,23 @@ ComputedEdges.clear(); ChainAllocator.DestroyAll(); - bool HasMaxBytesOverride = - MaxBytesForAlignmentOverride.getNumOccurrences() > 0; - - if (AlignAllBlock) + if (AlignAllBlock) { // Align all of the blocks in the function to a specific alignment. - for (MachineBasicBlock &MBB : MF) { - if (HasMaxBytesOverride) - MBB.setAlignment(Align(1ULL << AlignAllBlock), - MaxBytesForAlignmentOverride); - else - MBB.setAlignment(Align(1ULL << AlignAllBlock)); - } - else if (AlignAllNonFallThruBlocks) { + for (MachineBasicBlock &MBB : MF) + MBB.setAlignment(Align(1ULL << AlignAllBlock), MaxBytesForAlignment); + + } else if (AlignAllNonFallThruBlocks) { // Align all of the blocks that have no fall-through predecessors to a // specific alignment. for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) { auto LayoutPred = std::prev(MBI); - if (!LayoutPred->isSuccessor(&*MBI)) { - if (HasMaxBytesOverride) - MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks), - MaxBytesForAlignmentOverride); - else - MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks)); - } + if (LayoutPred->isSuccessor(&*MBI)) + continue; + MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks), + MaxBytesForAlignment); } } + if (ViewBlockLayoutWithBFI != GVDT_None && (ViewBlockFreqFuncName.empty() || F->getFunction().getName().equals(ViewBlockFreqFuncName))) { Index: llvm/lib/CodeGen/TargetLoweringBase.cpp =================================================================== --- llvm/lib/CodeGen/TargetLoweringBase.cpp +++ llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -2058,8 +2058,7 @@ return PrefLoopAlignment; } -unsigned TargetLoweringBase::getMaxPermittedBytesForAlignment( - MachineBasicBlock *MBB) const { +unsigned TargetLoweringBase::getMaxPermittedBytesForAlignment() const { return MaxBytesForAlignment; } Index: llvm/test/CodeGen/AArch64/loop-align-limit.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/loop-align-limit.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s | FileCheck %s +target triple = "aarch64-linux" + +declare i1 @cond(i64, i64) +declare i32 @h(i32) + +define i32 @g(ptr %a, i64 %n, i32 %d) "tune-cpu"="neoverse-v1" { +; CHECK-LABEL: g: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill +; CHECK-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w21, -24 +; CHECK-NEXT: .cfi_offset w22, -32 +; CHECK-NEXT: .cfi_offset w23, -40 +; CHECK-NEXT: .cfi_offset w30, -48 +; CHECK-NEXT: mov w19, w2 +; CHECK-NEXT: mov x20, x1 +; CHECK-NEXT: mov x21, x0 +; CHECK-NEXT: mov x23, xzr +; CHECK-NEXT: mov w22, wzr +; CHECK-NEXT: b .LBB0_2 +; CHECK-NEXT: .p2align 5 +; CHECK-NEXT: .LBB0_1: // %if.end +; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: add w22, w22, w0 +; CHECK-NEXT: add x23, x23, #1 +; CHECK-NEXT: .LBB0_2: // %loop +; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: cmp x23, x20 +; CHECK-NEXT: b.ge .LBB0_5 +; CHECK-NEXT: // %bb.3: // %loop.body +; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: ldr w0, [x21, x23, lsl #2] +; CHECK-NEXT: cmp w19, #0 +; CHECK-NEXT: b.gt .LBB0_1 +; CHECK-NEXT: // %bb.4: // %if.then +; CHECK-NEXT: // in Loop: Header=BB0_2 Depth=1 +; CHECK-NEXT: bl h +; CHECK-NEXT: b .LBB0_1 +; CHECK-NEXT: .LBB0_5: // %exit +; CHECK-NEXT: mov w0, w22 +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload +; CHECK-NEXT: ret +entry: + br label %loop + +loop: + %i = phi i64 [0, %entry], [%i.next, %if.end] + %s = phi i32 [0, %entry], [%s.next, %if.end] + %c = icmp slt i64 %i, %n + br i1 %c, label %loop.body, label %exit + +loop.body: + %p = getelementptr i32, ptr %a, i64 %i + %v = load i32, ptr %p + %c1 = icmp slt i32 %d, 1 + br i1 %c1, label %if.then, label %if.end + +if.then: + %v0 = call i32 @h(i32 %v) + br label %if.end + +if.end: + %w = phi i32 [%v0, %if.then], [%v, %loop.body] + %s.next = add i32 %s, %w + %i.next = add i64 %i, 1 + br label %loop + + +exit: + ret i32 %s +} + Index: llvm/test/CodeGen/AArch64/merge-store-dependency.ll =================================================================== --- llvm/test/CodeGen/AArch64/merge-store-dependency.ll +++ llvm/test/CodeGen/AArch64/merge-store-dependency.ll @@ -19,12 +19,12 @@ ; A53-NEXT: mov x8, x0 ; A53-NEXT: mov x19, x8 ; A53-NEXT: mov w0, w1 -; A53-NEXT: mov w9, #256 +; A53-NEXT: mov w9, #256 // =0x100 ; A53-NEXT: stp x2, x3, [x8, #32] ; A53-NEXT: mov x2, x8 ; A53-NEXT: str q0, [x19, #16]! ; A53-NEXT: str w1, [x19] -; A53-NEXT: mov w1, #4 +; A53-NEXT: mov w1, #4 // =0x4 ; A53-NEXT: str q0, [x8] ; A53-NEXT: strh w9, [x8, #24] ; A53-NEXT: str wzr, [x8, #20] @@ -51,7 +51,7 @@ ; A53-NEXT: .cfi_restore w19 ; A53-NEXT: .cfi_restore w30 ; A53-NEXT: ret -; A53-NEXT: .p2align 4, , 8 +; A53-NEXT: .p2align 4 ; A53-NEXT: .LBB0_4: // %while.body.i.split ; A53-NEXT: // =>This Inner Loop Header: Depth=1 ; A53-NEXT: .cfi_restore_state