Index: llvm/include/llvm/MC/MCAsmBackend.h =================================================================== --- llvm/include/llvm/MC/MCAsmBackend.h +++ llvm/include/llvm/MC/MCAsmBackend.h @@ -185,6 +185,13 @@ virtual bool isMicroMips(const MCSymbol *Sym) const { return false; } + + /// Given the target mask from a .push_align_branch_boundary directive, + /// return the string suitable an assembly file. + virtual std::string getAlignBranchBoundaryMaskStr(unsigned Mask) const { + llvm_unreachable("unsupported target for .push_align_branch_boundary?"); + return ""; + } }; } // end namespace llvm Index: llvm/include/llvm/MC/MCStreamer.h =================================================================== --- llvm/include/llvm/MC/MCStreamer.h +++ llvm/include/llvm/MC/MCStreamer.h @@ -222,6 +222,10 @@ bool UseAssemblerInfoForParsing; + /// This is stack of align_branch_boundary directives. Data is pair of + /// alignment, and target specific instruction kind mask. + SmallVector, 4> AlignBranchBoundaryStack; + protected: MCStreamer(MCContext &Ctx); @@ -1008,6 +1012,28 @@ /// Ends a bundle-locked group. virtual void EmitBundleUnlock(); + /// Enter a new .align_branch_boundary region which allows alignment for + /// performance of instruction described by the target specific Mask to be + /// aligned so as no to cross or end at an alignment boundary of 2^Align. + virtual void EmitPushAlignBranchBoundary(unsigned Align, unsigned Mask) { + AlignBranchBoundaryStack.push_back(std::make_pair(Align, Mask)); + } + + /// Ends a .align_branch_boundary region. + virtual void EmitPopAlignBranchBoundary() { + assert(!AlignBranchBoundaryStack.empty()); + AlignBranchBoundaryStack.pop_back(); + } + + /// Return information on the current .align_branch_boundary region if any, + /// or None. 
Optional<std::pair<unsigned, unsigned>>
#include "llvm/Support/raw_ostream.h" @@ -39,6 +40,15 @@ using namespace llvm; + +cl::opt AllowLLVMInternalSyntax( + "parse-llvm-internal-testing-asm-directives", cl::init(false), + cl::ReallyHidden, + cl::desc( + "Support additional assembly directives which exist for LLVM's " + "internal testing purposes only. These are not stable and no " + "compatibility will be provided in any form.")); + static bool checkScale(unsigned Scale, StringRef &ErrMsg) { if (Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) { ErrMsg = "scale factor in address must be 1, 2, 4 or 8"; @@ -922,6 +932,9 @@ bool parseDirectiveSEHSaveXMM(SMLoc); bool parseDirectiveSEHPushFrame(SMLoc); + /// .push_align_branch_boundary + bool parseDirectivePushAlignBranchBoundary(SMLoc); + unsigned checkTargetMatchPredicate(MCInst &Inst) override; bool validateInstruction(MCInst &Inst, const OperandVector &Ops); @@ -3677,6 +3690,21 @@ return parseDirectiveSEHSaveXMM(DirectiveID.getLoc()); else if (IDVal == ".seh_pushframe") return parseDirectiveSEHPushFrame(DirectiveID.getLoc()); + else if (AllowLLVMInternalSyntax && + IDVal == ".llvm_internal_push_align_branch_boundary") + return parseDirectivePushAlignBranchBoundary(DirectiveID.getLoc()); + else if (AllowLLVMInternalSyntax && + IDVal == ".llvm_internal_pop_align_branch_boundary") { + if (None == getStreamer().getCurrentAlignBranchBoundary()) + report_fatal_error("Mismatched .pop_align_branch_boundary"); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + getParser().Lex(); + + getStreamer().EmitPopAlignBranchBoundary(); + return false; + } return true; } @@ -3947,6 +3975,37 @@ return false; } +bool X86AsmParser::parseDirectivePushAlignBranchBoundary(SMLoc Loc) { + int64_t BoundaryLog2; + if (getParser().parseAbsoluteExpression(BoundaryLog2)) + return TokError("expected power-of-2 aligment expression"); + + unsigned Mask = 0; + if (getLexer().isNot(AsmToken::EndOfStatement)) { + do { + StringRef 
Name; + if (getParser().parseIdentifier(Name)) + return TokError("Missing instruction type"); + unsigned MaskBit = StringSwitch(Name) + .Case("fused", X86::AlignBranchFused) + .Case("jcc", X86::AlignBranchJcc) + .Case("jmp", X86::AlignBranchJmp) + .Case("call", X86::AlignBranchCall) + .Case("ret", X86::AlignBranchRet) + .Case("indirect", X86::AlignBranchIndirect) + .Default(-1); + if (MaskBit == -1U) + return TokError("unrecognized instruction type in directive"); + Mask |= MaskBit; + } while (parseOptionalToken(AsmToken::Comma)); + } + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in directive"); + getParser().Lex(); + getStreamer().EmitPushAlignBranchBoundary(BoundaryLog2, Mask); + return false; +} + // Force static initialization. extern "C" void LLVMInitializeX86AsmParser() { RegisterMCAsmParser X(getTheX86_32Target()); Index: llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp =================================================================== --- llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -8,6 +8,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86FixupKinds.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/BinaryFormat/MachO.h" @@ -240,6 +241,8 @@ MCInst &Res) const override; bool writeNopData(raw_ostream &OS, uint64_t Count) const override; + + std::string getAlignBranchBoundaryMaskStr(unsigned Mask) const override; }; } // end anonymous namespace @@ -666,6 +669,24 @@ return true; } +std::string X86AsmBackend::getAlignBranchBoundaryMaskStr(unsigned Mask) const { + SmallVector Strings; + if (Mask & X86::AlignBranchFused) + Strings.push_back("fused"); + if (Mask & X86::AlignBranchJcc) + Strings.push_back("jcc"); + if (Mask & X86::AlignBranchJmp) + Strings.push_back("jmp"); + if (Mask & X86::AlignBranchCall) + Strings.push_back("call"); + if (Mask & X86::AlignBranchRet) 
+ Strings.push_back("ret"); + if (Mask & X86::AlignBranchIndirect) + Strings.push_back("indirect"); + + return join(Strings, ", "); +} + /* *** */ namespace { Index: llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h =================================================================== --- llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -345,6 +345,19 @@ } llvm_unreachable("unknown fusion type"); } + + /// Defines the value used for the target specific mask in a scope for + /// align_branch_boundary + enum AlignBranchBoundaryKinds : uint8_t { + AlignBranchNone = 0, + AlignBranchFused = 1U << 0, + AlignBranchJcc = 1U << 1, + AlignBranchJmp = 1U << 2, + AlignBranchCall = 1U << 3, + AlignBranchRet = 1U << 4, + AlignBranchIndirect = 1U << 5 + }; + } // end namespace X86; /// X86II - This namespace holds all of the target specific flags that Index: llvm/lib/Target/X86/X86AsmPrinter.cpp =================================================================== --- llvm/lib/Target/X86/X86AsmPrinter.cpp +++ llvm/lib/Target/X86/X86AsmPrinter.cpp @@ -75,9 +75,26 @@ OutStreamer->EndCOFFSymbolDef(); } + // Some Intel processors require additional alignment padding to mitigate an + // performance issue introduced microcode update fix for a correctness issue + // which negative impacted uop caching of branches which cross a 32 byte + // boundary. For the moment, CLFLUSHOPT is being used a proxy for which + // machines need the mitigation since this was also introduced in the + // affected generation (skylake), but this needs split out into it's own flag + // (FIXME BEFORE COMMIT). + const bool NeedsJCCAlignMitigation = Subtarget->hasCLFLUSHOPT(); + if (NeedsJCCAlignMitigation) { + const uint8_t DefaultAligned = + (X86::AlignBranchFused | X86::AlignBranchJcc | X86::AlignBranchJmp); + OutStreamer->EmitPushAlignBranchBoundary(5, DefaultAligned); + } + // Emit the rest of the function body. 
EmitFunctionBody(); + if (NeedsJCCAlignMitigation) + OutStreamer->EmitPopAlignBranchBoundary(); + // Emit the XRay table for this function. emitXRayTable(); Index: llvm/lib/Target/X86/X86MCInstLower.cpp =================================================================== --- llvm/lib/Target/X86/X86MCInstLower.cpp +++ llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1142,10 +1142,33 @@ } } +/// A RAII helper which defines a region of instructions which can't have +/// padding added between them for correctness. This is a nop unless we're in +/// a branch_boundary_align region, in which case it pushes a new scope with +/// all padding kinds disabled, and then pops it at end of scope. +struct NoAutoPaddingScope { + MCStreamer &OS; + bool EmitPop = false; + NoAutoPaddingScope(MCStreamer &OS) + : OS(OS) { + if (auto AlignInfo = OS.getCurrentAlignBranchBoundary()) { + // If active, push a new scope with all padding disabled + OS.EmitPushAlignBranchBoundary(AlignInfo->first, 0); + EmitPop = true; + } + } + ~NoAutoPaddingScope() { + if (EmitPop) + OS.EmitPopAlignBranchBoundary(); + } +}; + void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL) { assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64"); + NoAutoPaddingScope NoPadScope(*OutStreamer); + StatepointOpers SOpers(&MI); if (unsigned PatchBytes = SOpers.getNumPatchBytes()) { EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(), @@ -1207,6 +1230,8 @@ // FAULTING_LOAD_OP , , , // , + NoAutoPaddingScope NoPadScope(*OutStreamer); + Register DefRegister = FaultingMI.getOperand(0).getReg(); FaultMaps::FaultKind FK = static_cast(FaultingMI.getOperand(1).getImm()); @@ -1253,6 +1278,8 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, X86MCInstLower &MCIL) { // PATCHABLE_OP minsize, opcode, operands + + NoAutoPaddingScope NoPadScope(*OutStreamer); unsigned MinSize = MI.getOperand(0).getImm(); unsigned Opcode = MI.getOperand(1).getImm(); @@ -1292,6 +1319,8 @@ void 
X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) { SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); + NoAutoPaddingScope NoPadScope(*OutStreamer); + auto &Ctx = OutStreamer->getContext(); MCSymbol *MILabel = Ctx.createTempSymbol(); OutStreamer->EmitLabel(MILabel); @@ -1309,6 +1338,8 @@ SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); + NoAutoPaddingScope NoPadScope(*OutStreamer); + auto &Ctx = OutStreamer->getContext(); MCSymbol *MILabel = Ctx.createTempSymbol(); OutStreamer->EmitLabel(MILabel); @@ -1368,6 +1399,8 @@ X86MCInstLower &MCIL) { assert(Subtarget->is64Bit() && "XRay custom events only supports X86-64"); + NoAutoPaddingScope NoPadScope(*OutStreamer); + // We want to emit the following pattern, which follows the x86 calling // convention to prepare for the trampoline call to be patched in. // @@ -1462,6 +1495,8 @@ X86MCInstLower &MCIL) { assert(Subtarget->is64Bit() && "XRay typed events only supports X86-64"); + NoAutoPaddingScope NoPadScope(*OutStreamer); + // We want to emit the following pattern, which follows the x86 calling // convention to prepare for the trampoline call to be patched in. // @@ -1559,6 +1594,9 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI, X86MCInstLower &MCIL) { + + NoAutoPaddingScope NoPadScope(*OutStreamer); + // We want to emit the following pattern: // // .p2align 1, ... @@ -1586,6 +1624,8 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL) { + NoAutoPaddingScope NoPadScope(*OutStreamer); + // Since PATCHABLE_RET takes the opcode of the return statement as an // argument, we use that to emit the correct form of the RET that we want. // i.e. 
when we see this: @@ -1616,6 +1656,8 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL) { + NoAutoPaddingScope NoPadScope(*OutStreamer); + // Like PATCHABLE_RET, we have the actual instruction in the operands to this // instruction so we lower that particular instruction and its operands. // Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how Index: llvm/test/CodeGen/X86/implicit-null-check-noautopad.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/X86/implicit-null-check-noautopad.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -O3 -mtriple=x86_64-apple-macosx -enable-implicit-null-checks -mcpu=skylake < %s | FileCheck %s + +; If we have autopadding enabled, make sure the label isn't separated from +; the mov. + +define i32 @test(i32* %x) { +; CHECK-LABEL: test: +; CHECK: ## %bb.0: ## %entry +; CHECK-NEXT: .llvm_internal_push_align_branch_boundary 5 +; CHECK-NEXT: Ltmp0: +; CHECK-NEXT: movl (%rdi), %eax ## on-fault: LBB0_1 +; CHECK-NEXT: .llvm_internal_pop_align_branch_boundary +; CHECK-NEXT: ## %bb.2: ## %not_null +; CHECK-NEXT: retq +; CHECK-NEXT: LBB0_1: ## %is_null +; CHECK-NEXT: movl $42, %eax +; CHECK-NEXT: retq + + entry: + %c = icmp eq i32* %x, null + br i1 %c, label %is_null, label %not_null, !make.implicit !0 + + is_null: + ret i32 42 + + not_null: + %t = load atomic i32, i32* %x unordered, align 4 + ret i32 %t +} + +!0 = !{} Index: llvm/test/CodeGen/X86/statepoint-no-realign-stack.ll =================================================================== --- llvm/test/CodeGen/X86/statepoint-no-realign-stack.ll +++ llvm/test/CodeGen/X86/statepoint-no-realign-stack.ll @@ -24,8 +24,10 @@ ; CHECK-NEXT: vmovaps (%rdi), %ymm0 ; CHECK-NEXT: vmovaps %ymm0, (%rsp) ; CHECK-NEXT: vzeroupper +; CHECK-NEXT: .llvm_internal_push_align_branch_boundary 5 ; CHECK-NEXT: 
callq foo ; CHECK-NEXT: .Ltmp0: +; CHECK-NEXT: .llvm_internal_pop_align_branch_boundary ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: .cfi_def_cfa %rsp, 8 @@ -43,8 +45,10 @@ ; CHECK-NEXT: vmovaps (%rdi), %ymm0 ; CHECK-NEXT: vmovups %ymm0, (%rsp) ; CHECK-NEXT: vzeroupper +; CHECK-NEXT: .llvm_internal_push_align_branch_boundary 5 ; CHECK-NEXT: callq foo ; CHECK-NEXT: .Ltmp1: +; CHECK-NEXT: .llvm_internal_pop_align_branch_boundary ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq @@ -68,8 +72,10 @@ ; CHECK-NEXT: subq $64, %rsp ; CHECK-NEXT: vmovaps %ymm0, (%rsp) ; CHECK-NEXT: vzeroupper +; CHECK-NEXT: .llvm_internal_push_align_branch_boundary 5 ; CHECK-NEXT: callq do_safepoint ; CHECK-NEXT: .Ltmp2: +; CHECK-NEXT: .llvm_internal_pop_align_branch_boundary ; CHECK-NEXT: vmovaps (%rsp), %ymm0 ; CHECK-NEXT: movq %rbp, %rsp ; CHECK-NEXT: popq %rbp @@ -88,8 +94,10 @@ ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: vmovups %ymm0, (%rsp) ; CHECK-NEXT: vzeroupper +; CHECK-NEXT: .llvm_internal_push_align_branch_boundary 5 ; CHECK-NEXT: callq do_safepoint ; CHECK-NEXT: .Ltmp3: +; CHECK-NEXT: .llvm_internal_pop_align_branch_boundary ; CHECK-NEXT: vmovups (%rsp), %ymm0 ; CHECK-NEXT: addq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 8 Index: llvm/test/MC/X86/align-branch-boundary.s =================================================================== --- /dev/null +++ llvm/test/MC/X86/align-branch-boundary.s @@ -0,0 +1,32 @@ + # RUN: llvm-mc -triple x86_64-pc-linux-gnu -parse-llvm-internal-testing-asm-directives %s | FileCheck %s + + # basic sanity check for round trip serialization of the proposed + # assembler extension. 
Many more tests needed of course + + # CHECK: test1: + # CHECK-NEXT: .llvm_internal_push_align_branch_boundary 7 + # CHECK-NEXT: .llvm_internal_push_align_branch_boundary 5 jmp, ret, indirect + # CHECK-NEXT: .llvm_internal_push_align_branch_boundary 5 fused, jcc, call + # CHECK-NEXT: callq bar + # CHECK-NEXT: .llvm_internal_pop_align_branch_boundary + # CHECK-NEXT: .llvm_internal_pop_align_branch_boundary + # CHECK-NEXT: .llvm_internal_pop_align_branch_boundary + .text + .globl test1 + .p2align 5 +test1: + .llvm_internal_push_align_branch_boundary 7 #no instruction type + .llvm_internal_push_align_branch_boundary 5 jmp, indirect, ret + .llvm_internal_push_align_branch_boundary 5 jcc, fused, call + callq bar + .llvm_internal_pop_align_branch_boundary + .llvm_internal_pop_align_branch_boundary + .llvm_internal_pop_align_branch_boundary +bar: + retq + + # a far target (4 byte imm) + .section "unknown" + .type baz,@function +baz: + retq