Index: llvm/include/llvm/MC/MCStreamer.h =================================================================== --- llvm/include/llvm/MC/MCStreamer.h +++ llvm/include/llvm/MC/MCStreamer.h @@ -223,13 +223,18 @@ bool UseAssemblerInfoForParsing; - /// Is the assembler allowed to insert padding automatically? For - /// correctness reasons, we sometimes need to ensure instructions aren't - /// seperated in unexpected ways. At the moment, this feature is only - /// useable from an integrated assembler, but assembly syntax is under - /// discussion for future inclusion. + /// Is the assembler allowed to insert padding automatically? This is set + /// just after construction and is effectively a constant property of the + /// streamer. At the moment, this feature is only useable from an integrated + /// assembler, but assembly syntax is under discussion for future inclusion. bool AllowAutoPadding = false; + /// If autopadding is allowed, is it currently active? For correctness + /// reasons, we sometimes need to ensure instructions aren't seperated in + /// unexpected ways. In addition to legality, it can also be suppressed for + /// code size regions. + bool AutoPaddingEnabled = false; + protected: MCStreamer(MCContext &Ctx); @@ -277,6 +282,9 @@ void setAllowAutoPadding(bool v) { AllowAutoPadding = v; } bool getAllowAutoPadding() const { return AllowAutoPadding; } + void setAutoPaddingEnabled(bool v) { AutoPaddingEnabled = v; } + bool getAutoPaddingEnabled() const { return AutoPaddingEnabled; } + /// When emitting an object file, create and emit a real label. When emitting /// textual assembly, this should do nothing to avoid polluting our output. virtual MCSymbol *EmitCFILabel(); Index: llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp =================================================================== --- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -282,6 +282,13 @@ // Allow the target to emit any magic that it wants at the start of the file. EmitStartOfAsmFile(M); + // For an autopadding assembly, default to enabling autopadding. This will + // be selective disabled for cold blocks. + if (OutStreamer->getAllowAutoPadding()) { + OutStreamer->setAutoPaddingEnabled(true); + OutStreamer->emitRawComment("autopadding"); + } + // Very minimal debug info. It is ignored if we emit actual debug info. If we // don't, this at least helps the user find where a global came from. if (MAI->hasSingleParameterDotFile()) { @@ -2929,6 +2936,18 @@ PrintChildLoopComment(OS, Loop, AP.getFunctionNumber()); } +/// Return true if this basic block is definitely a cold block per provided +/// profiling information. Note that a lack of profile is considered not-cold. +static bool isColdBlock(const MachineBasicBlock *MBB, + ProfileSummaryInfo *PSI, + const MachineBlockFrequencyInfo *MBFI) { + if (!PSI || !MBFI) + return false; + auto Count = MBFI->getBlockProfileCount(MBB); + return Count && PSI->isColdCount(*Count); +} + + /// EmitBasicBlockStart - This method prints the label for the specified /// MachineBasicBlock, an alignment (if present) and a comment describing /// it if appropriate. @@ -2990,6 +3009,20 @@ OutStreamer->AddComment("Label of block must be emitted"); OutStreamer->EmitLabel(MBB.getSymbol()); } + + // If we're using an autopadding assembler, disable autopadding in cold + // blocks. We don't want to burn codesize in locations which don't + // contribute to performance. + if (OutStreamer->getAllowAutoPadding()) { + const bool Enable = !isColdBlock(&MBB, PSI, MBFI); + if (Enable != OutStreamer->getAutoPaddingEnabled()) { + OutStreamer->setAutoPaddingEnabled(Enable); + if (Enable) + OutStreamer->emitRawComment("autopadding"); + else + OutStreamer->emitRawComment("noautopadding"); + } + } } void AsmPrinter::EmitBasicBlockEnd(const MachineBasicBlock &MBB) {} Index: llvm/lib/MC/MCParser/AsmParser.cpp =================================================================== --- llvm/lib/MC/MCParser/AsmParser.cpp +++ llvm/lib/MC/MCParser/AsmParser.cpp @@ -890,6 +890,11 @@ // Prime the lexer. Lex(); + // If we're assembling an .s file w/autopadding explicitly enabled, enable + // autopadding from the beginning. + if (getStreamer().getAllowAutoPadding()) + getStreamer().setAutoPaddingEnabled(true); + HadError = false; AsmCond StartingCondState = TheCondState; SmallVector AsmStrRewrites; Index: llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp =================================================================== --- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -343,6 +343,9 @@ return false; assert(allowAutoPadding() && "incorrect initialization!"); + if (!OS.getAutoPaddingEnabled()) + return false; + MCAssembler &Assembler = OS.getAssembler(); MCSection *Sec = OS.getCurrentSectionOnly(); // To be Done: Currently don't deal with Bundle cases. Index: llvm/lib/Target/X86/X86MCInstLower.cpp =================================================================== --- llvm/lib/Target/X86/X86MCInstLower.cpp +++ llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1165,18 +1165,20 @@ /// padding added between them for correctness. struct NoAutoPaddingScope { MCStreamer &OS; - const bool OldAllowAutoPadding; + const bool OldAutoPadding; NoAutoPaddingScope(MCStreamer &OS) - : OS(OS), OldAllowAutoPadding(OS.getAllowAutoPadding()) { + : OS(OS), OldAutoPadding(OS.getAutoPaddingEnabled()) { changeAndComment(false); } ~NoAutoPaddingScope() { - changeAndComment(OldAllowAutoPadding); + changeAndComment(OldAutoPadding); } void changeAndComment(bool b) { - if (b == OS.getAllowAutoPadding()) + if (!OS.getAllowAutoPadding()) return; - OS.setAllowAutoPadding(b); + if (b == OS.getAutoPaddingEnabled()) + return; + OS.setAutoPaddingEnabled(b); if (b) OS.emitRawComment("autopadding"); else Index: llvm/test/CodeGen/X86/align-branch-boundary-pgo.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/align-branch-boundary-pgo.ll @@ -0,0 +1,95 @@ +; RUN: llc -verify-machineinstrs -O3 -mcpu=skylake -x86-align-branch-boundary=32 -x86-align-branch=call+jmp+indirect+ret+jcc+fused -filetype=obj < %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s +; RUN: llc -verify-machineinstrs -O3 -mcpu=skylake -x86-align-branch-boundary=32 -x86-align-branch=call+jmp+indirect+ret+jcc+fused < %s | FileCheck --check-prefix=ASM %s + + +; Test exists to domonstrate that we suppress autopadding along cold paths + +target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-pc-linux-gnu" + +define void @test(i1 %c) !prof !{!"function_entry_count", i64 99999} { +; CHECK: 0: pushq +; CHECK-NEXT: 1: movl +; CHECK-NEXT: 3: callq +; CHECK-NEXT: 8: callq +; CHECK-NEXT: d: callq +; CHECK-NEXT: 12: callq +; CHECK-NEXT: 17: callq +; Entry is hot (and thus needs padding) +; CHECK-NEXT: 1c: nopl +; CHECK-NEXT: 20: testb +; CHECK-NEXT: 23: je +; CHECK-NEXT: 25: callq +; CHECK-NEXT: 2a: callq +; CHECK-NEXT: 2f: callq +; CHECK-NEXT: 34: callq +; CHECK-NEXT: 39: callq +; Taken is hot (and thus needs padding) +; CHECK-NEXT: 3e: nop +; CHECK-NEXT: 40: callq +; CHECK-NEXT: 45: popq +; CHECK-NEXT: 46: retq +; CHECK-NEXT: 47: callq +; CHECK-NEXT: 4c: callq +; CHECK-NEXT: 51: callq +; CHECK-NEXT: 56: callq +; Note the *lack* of padding on the untaken path +; CHECK-NEXT: 5b: callq +; CHECK-NEXT: 60: popq +; CHECK-NEXT: 61: retq + +; ASM: .text +; ASM-NEXT: #autopadding +; ASM-NEXT: .file "" +; ASM-NOT: autopadding +; ASM: .LBB0_2: # %untaken +; ASM-NEXT: #noautopadding + +entry: + call void @foo() + call void @foo() + call void @foo() + call void @foo() + call void @foo() + br i1 %c, label %taken, label %untaken, !prof !{!"branch_weights", i32 99999, i32 0} + +taken: + call void @foo() + call void @foo() + call void @foo() + call void @foo() + call void @foo() + call void @baz() + ret void +untaken: + call void @foo() + call void @foo() + call void @foo() + call void @foo() + call void @bar() + ret void +} + +declare void @foo() +declare void @bar() +declare void @baz() + +; Note: This is complete garbage copied from another test file. It doesn't +; appear to be documented in LangRef, so I have no idea what it means. It +; seems to be required to get a meaningful ProfileSummaryInfo object? +; TODO: revisit the design on how to compute a cold block! +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"ProfileSummary", !1} +!1 = !{!2, !3, !4, !5, !6, !7, !8, !9} +!2 = !{!"ProfileFormat", !"InstrProf"} +!3 = !{!"TotalCount", i64 10000} +!4 = !{!"MaxCount", i64 10} +!5 = !{!"MaxInternalCount", i64 1} +!6 = !{!"MaxFunctionCount", i64 1000} +!7 = !{!"NumCounts", i64 3} +!8 = !{!"NumFunctions", i64 3} +!9 = !{!"DetailedSummary", !10} +!10 = !{!11, !12, !13} +!11 = !{i32 10000, i64 100, i32 1} +!12 = !{i32 999000, i64 100, i32 1} +!13 = !{i32 999999, i64 1, i32 2}