[CodeGen] Add a per-basic-block cap on alignment padding.

Each MachineBasicBlock gains MaxBytesForAlignment: the maximum number of
padding bytes that may be emitted to reach the block's alignment (zero means
no maximum). MachineBlockPlacement sets it on every block it aligns, taking
the value from -max-bytes-for-alignment when that flag was given and from
TargetLoweringBase::getMaxPermittedBytesForAlignment otherwise (the base
implementation returns 0). AsmPrinter::emitAlignment forwards the cap to
emitCodeAlignment / emitValueToAlignment.

In the test, the *-EXPLICIT prefixes belong to the runs that pass
-max-bytes-for-alignment=8 and the *-IMPLICIT prefixes to the runs that rely
on the target hook.

NOTE(review): CHECK-IMPLICIT expects ".p2align 5, 0x0, 31" although the base
hook in this patch returns 0; presumably an AArch64 override outside this
diff supplies the 31 -- confirm before landing.

diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -432,7 +432,8 @@
   /// global value is specified, and if that global has an explicit alignment
   /// requested, it will override the alignment request if required for
   /// correctness.
-  void emitAlignment(Align Alignment, const GlobalObject *GV = nullptr) const;
+  void emitAlignment(Align Alignment, const GlobalObject *GV = nullptr,
+                     unsigned MaxBytesToEmit = 0) const;
 
   /// Lower the specified LLVM Constant to an MCExpr.
   virtual const MCExpr *lowerConstant(const Constant *CV);
diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -136,6 +136,10 @@
   /// Alignment of the basic block. One if the basic block does not need to be
   /// aligned.
   Align Alignment;
+  /// Maximum amount of bytes that can be added to align the basic block. If the
+  /// alignment cannot be reached in this many bytes, no bytes are emitted.
+  /// Zero to represent no maximum.
+  unsigned MaxBytesForAlignment = 0;
 
   /// Indicate that this basic block is entered via an exception handler.
   bool IsEHPad = false;
@@ -521,6 +525,14 @@
   /// Set alignment of the basic block.
   void setAlignment(Align A) { Alignment = A; }
 
+  /// Return the maximum amount of padding allowed for aligning the basic block
+  unsigned getMaxBytesForAlignment() const { return MaxBytesForAlignment; }
+
+  /// Set the maximum amount of padding allowed for aligning the basic block
+  void setMaxBytesForAlignment(unsigned MaxBytes) {
+    MaxBytesForAlignment = MaxBytes;
+  }
+
   /// Returns true if the block is a landing pad. That is this basic block is
   /// entered via an exception handler.
   bool isEHPad() const { return IsEHPad; }
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1771,11 +1771,14 @@
   /// Return the preferred loop alignment.
   virtual Align getPrefLoopAlignment(MachineLoop *ML = nullptr) const;
 
+  /// Return the maximum amount of bytes allowed to be emitted when padding for
+  /// alignment
+  virtual unsigned
+  getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const;
+
   /// Should loops be aligned even when the function is marked OptSize (but not
   /// MinSize).
-  virtual bool alignLoopsWithOptSize() const {
-    return false;
-  }
+  virtual bool alignLoopsWithOptSize() const { return false; }
 
   /// If the target has a standard location for the stack protector guard,
   /// returns the address of that location. Otherwise, returns nullptr.
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2447,7 +2447,8 @@
 // two boundary. If a global value is specified, and if that global has
 // an explicit alignment requested, it will override the alignment request
 // if required for correctness.
-void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV) const {
+void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV,
+                               unsigned MaxBytesToEmit) const {
   if (GV)
     Alignment = getGVAlignment(GV, GV->getParent()->getDataLayout(), Alignment);
 
@@ -2460,9 +2461,9 @@
       STI = &getSubtargetInfo();
     else
       STI = TM.getMCSubtargetInfo();
-    OutStreamer->emitCodeAlignment(Alignment.value(), STI);
+    OutStreamer->emitCodeAlignment(Alignment.value(), STI, MaxBytesToEmit);
   } else
-    OutStreamer->emitValueToAlignment(Alignment.value());
+    OutStreamer->emitValueToAlignment(Alignment.value(), 0, 1, MaxBytesToEmit);
 }
 
 //===----------------------------------------------------------------------===//
@@ -3253,7 +3254,7 @@
   // Emit an alignment directive for this block, if needed.
   const Align Alignment = MBB.getAlignment();
   if (Alignment != Align(1))
-    emitAlignment(Alignment);
+    emitAlignment(Alignment, nullptr, MBB.getMaxBytesForAlignment());
 
   // Switch to a new section if this basic block must begin a section. The
   // entry block is always placed in the function section and is handled
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -95,6 +95,12 @@
              "format (e.g 4 means align on 16B boundaries)."),
     cl::init(0), cl::Hidden);
 
+static cl::opt<unsigned> MaxBytesForAlignmentOverride(
+    "max-bytes-for-alignment",
+    cl::desc("Forces the maximum bytes allowed to be emitted when padding for "
+             "alignment"),
+    cl::init(0), cl::Hidden);
+
 // FIXME: Find a good default for this flag and remove the flag.
 static cl::opt<unsigned> ExitBlockBias(
     "block-placement-exit-block-bias",
@@ -2914,10 +2920,21 @@
     MachineBasicBlock *LayoutPred =
         &*std::prev(MachineFunction::iterator(ChainBB));
 
+    auto DetermineMaxAlignmentPadding = [&]() {
+      // Set the maximum bytes allowed to be emitted for alignment.
+      unsigned MaxBytes;
+      if (MaxBytesForAlignmentOverride.getNumOccurrences() > 0)
+        MaxBytes = MaxBytesForAlignmentOverride;
+      else
+        MaxBytes = TLI->getMaxPermittedBytesForAlignment(ChainBB);
+      ChainBB->setMaxBytesForAlignment(MaxBytes);
+    };
+
     // Force alignment if all the predecessors are jumps. We already checked
     // that the block isn't cold above.
     if (!LayoutPred->isSuccessor(ChainBB)) {
       ChainBB->setAlignment(Align);
+      DetermineMaxAlignmentPadding();
       continue;
     }
 
@@ -2928,8 +2945,10 @@
     BranchProbability LayoutProb =
         MBPI->getEdgeProbability(LayoutPred, ChainBB);
     BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
-    if (LayoutEdgeFreq <= (Freq * ColdProb))
+    if (LayoutEdgeFreq <= (Freq * ColdProb)) {
       ChainBB->setAlignment(Align);
+      DetermineMaxAlignmentPadding();
+    }
   }
 }
 
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -2026,6 +2026,11 @@
   return PrefLoopAlignment;
 }
 
+unsigned TargetLoweringBase::getMaxPermittedBytesForAlignment(
+    MachineBasicBlock *MBB) const {
+  return 0;
+}
+
 //===----------------------------------------------------------------------===//
 //  Reciprocal Estimates
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll b/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-p2align-max-bytes.ll
@@ -0,0 +1,89 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -max-bytes-for-alignment=8 --align-loops=32 < %s -o -| FileCheck %s --check-prefixes=CHECK,CHECK-EXPLICIT
+; RUN: llc -mtriple=aarch64-none-linux-gnu --align-loops=32 < %s -o -| FileCheck %s --check-prefixes=CHECK,CHECK-IMPLICIT
+; RUN: llc -mtriple=aarch64-none-linux-gnu --align-loops=32 < %s -o - --filetype=obj | llvm-objdump --arch=aarch64 -d -| FileCheck %s --check-prefixes=CHECK-OBJ,CHECK-OBJ-IMPLICIT
+; RUN: llc -mtriple=aarch64-none-linux-gnu -max-bytes-for-alignment=8 --align-loops=32 < %s -o - --filetype=obj | llvm-objdump --arch=aarch64 -d -| FileCheck %s --check-prefixes=CHECK-OBJ,CHECK-OBJ-EXPLICIT
+
+define i32 @a(i32 %x, i32* nocapture readonly %y, i32* nocapture readonly %z) {
+; CHECK-LABEL: a:
+; CHECK-EXPLICIT: .p2align 5, 0x0, 8
+; CHECK-IMPLICIT: .p2align 5, 0x0, 31
+; CHECK-NEXT: .LBB0_5: // %vector.body
+; CHECK-EXPLICIT: .p2align 5, 0x0, 8
+; CHECK-IMPLICIT: .p2align 5, 0x0, 31
+; CHECK-NEXT: .LBB0_8: // %for.body
+; CHECK-OBJ: Disassembly of section .text:
+; CHECK-OBJ: 88: 2a 00 0a 8b add
+; CHECK-OBJ-IMPLICIT-NEXT: 8c: 1f 20 03 d5 nop
+; CHECK-OBJ-IMPLICIT-NEXT: 90: 1f 20 03 d5 nop
+; CHECK-OBJ-IMPLICIT-NEXT: 94: 1f 20 03 d5 nop
+; CHECK-OBJ-IMPLICIT-NEXT: 98: 1f 20 03 d5 nop
+; CHECK-OBJ-IMPLICIT-NEXT: 9c: 1f 20 03 d5 nop
+; CHECK-OBJ-IMPLICIT-NEXT: a0: 4b 45 40 b8 ldr
+; CHECK-OBJ-EXPLICIT-NEXT: 8c: 4b 45 40 b8 ldr
+entry:
+  %cmp10 = icmp sgt i32 %x, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %x to i64
+  %min.iters.check = icmp ult i32 %x, 8
+  br i1 %min.iters.check, label %for.body.preheader17, label %vector.ph
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i64 %wide.trip.count, 4294967288
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %10, %vector.body ]
+  %vec.phi13 = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %11, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %y, i64 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
+  %2 = getelementptr inbounds i32, i32* %0, i64 4
+  %3 = bitcast i32* %2 to <4 x i32>*
+  %wide.load14 = load <4 x i32>, <4 x i32>* %3, align 4
+  %4 = getelementptr inbounds i32, i32* %z, i64 %index
+  %5 = bitcast i32* %4 to <4 x i32>*
+  %wide.load15 = load <4 x i32>, <4 x i32>* %5, align 4
+  %6 = getelementptr inbounds i32, i32* %4, i64 4
+  %7 = bitcast i32* %6 to <4 x i32>*
+  %wide.load16 = load <4 x i32>, <4 x i32>* %7, align 4
+  %8 = add <4 x i32> %wide.load, %vec.phi
+  %9 = add <4 x i32> %wide.load14, %vec.phi13
+  %10 = add <4 x i32> %8, %wide.load15
+  %11 = add <4 x i32> %9, %wide.load16
+  %index.next = add nuw i64 %index, 8
+  %12 = icmp eq i64 %index.next, %n.vec
+  br i1 %12, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %bin.rdx = add <4 x i32> %11, %10
+  %13 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)
+  %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader17
+
+for.body.preheader17:                             ; preds = %for.body.preheader, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
+  %b.011.ph = phi i32 [ 0, %for.body.preheader ], [ %13, %middle.block ]
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  %b.0.lcssa = phi i32 [ 0, %entry ], [ %13, %middle.block ], [ %add3, %for.body ]
+  ret i32 %b.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader17, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader17 ]
+  %b.011 = phi i32 [ %add3, %for.body ], [ %b.011.ph, %for.body.preheader17 ]
+  %arrayidx = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %14 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %z, i64 %indvars.iv
+  %15 = load i32, i32* %arrayidx2, align 4
+  %add = add i32 %14, %b.011
+  %add3 = add i32 %add, %15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)