Index: lib/MC/MCCodePadder.cpp
===================================================================
--- lib/MC/MCCodePadder.cpp
+++ lib/MC/MCCodePadder.cpp
@@ -39,10 +39,12 @@
   ArePoliciesActive = usePoliciesForBasicBlock(Context);
   bool InsertionPoint = basicBlockRequiresInsertionPoint(Context);
-  assert((!InsertionPoint ||
-          OS->getCurrentFragment()->getKind() != MCFragment::FT_Align) &&
-         "Cannot insert padding nops right after an alignment fragment as it "
-         "will ruin the alignment");
+  bool BasicBlockHasAlignment =
+      OS->getCurrentFragment() == nullptr ||
+      OS->getCurrentFragment()->getKind() == MCFragment::FT_Align;
+  assert((!InsertionPoint || !BasicBlockHasAlignment) &&
+         "Cannot insert padding nops right after a basic block that has "
+         "alignment");
   uint64_t PoliciesMask = MCPaddingFragment::PFK_None;
   if (ArePoliciesActive) {
Index: lib/Target/X86/MCTargetDesc/CMakeLists.txt
===================================================================
--- lib/Target/X86/MCTargetDesc/CMakeLists.txt
+++ lib/Target/X86/MCTargetDesc/CMakeLists.txt
@@ -3,6 +3,7 @@
   X86MCTargetDesc.cpp
   X86MCAsmInfo.cpp
   X86MCCodeEmitter.cpp
+  X86MCCodePadder.cpp
   X86MachObjectWriter.cpp
   X86ELFObjectWriter.cpp
   X86WinCOFFObjectWriter.cpp
Index: lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
===================================================================
--- lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86MCCodePadder.h"
 #include "MCTargetDesc/X86FixupKinds.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/BinaryFormat/ELF.h"
@@ -73,10 +74,12 @@
   const StringRef CPU;
   bool HasNopl;
   const uint64_t MaxNopLength;
+
 public:
   X86AsmBackend(const Target &T, StringRef CPU)
-      : MCAsmBackend(), CPU(CPU),
-        MaxNopLength((CPU == "slm") ? 7 : 15) {
+      : MCAsmBackend(std::move(
+            std::unique_ptr<MCCodePadder>(new X86::X86MCCodePadder(CPU)))),
+        CPU(CPU), MaxNopLength((CPU == "slm") ? 7 : 15) {
     HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
               CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
               CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
Index: lib/Target/X86/MCTargetDesc/X86MCCodePadder.h
===================================================================
--- /dev/null
+++ lib/Target/X86/MCTargetDesc/X86MCCodePadder.h
@@ -0,0 +1,114 @@
+//===-- X86MCCodePadder.h - X86 Specific Code Padding Handling --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCCODEPADDER_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCCODEPADDER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCCodePadder.h"
+
+namespace llvm {
+
+class MCPaddingFragment;
+class MCAsmLayout;
+
+namespace X86 {
+
+/// The X86-specific class in charge of all code padding decisions for the X86
+/// target.
+class X86MCCodePadder : public MCCodePadder {
+  X86MCCodePadder() = delete;
+  X86MCCodePadder(const X86MCCodePadder &) = delete;
+  void operator=(const X86MCCodePadder &) = delete;
+
+protected:
+  bool basicBlockRequiresInsertionPoint(
+      const MCCodePaddingContext &Context) override;
+
+  bool usePoliciesForBasicBlock(const MCCodePaddingContext &Context) override;
+
+public:
+  X86MCCodePadder(StringRef CPU);
+  virtual ~X86MCCodePadder() {}
+};
+
+/// A padding policy that handles branch instructions (all types of jmps and
+/// calls) and the first instruction after a branch (i.e. first instruction in a
+/// basic block reachable by branch).
+/// This policy tries to enforce that:
+/// 1. Branch instructions and first instructions in basic blocks won't cross a
+///    16B aligned window.
+/// 2. Branch instructions will end at a 0mod16 address.
+///
+/// Note that this is also the order of importance implemented in the policy.
+///
+/// This policy essentially implements part of rule 12 of section 3.4.1.5 ("Code
+/// alignment") of Intel's Architectures Optimization Reference Manual:
+/// "When executing code from the legacy decode pipeline, direct branches that
+/// are mostly taken should have all their instruction bytes in a 16B aligned
+/// chunk of memory and nearer the end of that 16B aligned chunk."
+class BranchInstructionAndTargetAlignmentPolicy : public MCCodePaddingPolicy {
+  BranchInstructionAndTargetAlignmentPolicy(
+      const BranchInstructionAndTargetAlignmentPolicy &) = delete;
+  void operator=(const BranchInstructionAndTargetAlignmentPolicy &) = delete;
+
+protected:
+  /// Computes the penalty weight caused by having a branch instruction or the
+  /// instruction after a branch (i.e. first instruction in a basic block
+  /// reachable by branch) being split over more than one instruction window,
+  /// and a branch instruction not being adjacent to the end of its 16B code
+  /// chunk.
+  ///
+  /// \param Window The instruction window.
+  /// \param Offset The offset of the parent section.
+  /// \param Layout Code layout information.
+  ///
+  /// \returns the penalty weight caused by having a branch instruction or
+  /// instruction after a branch being split over more than one instruction
+  /// window, and a branch instruction not being adjacent to the end of its 16B
+  /// code chunk.
+  double computeWindowPenaltyWeight(const MCPFRange &Window, uint64_t Offset,
+                                    MCAsmLayout &Layout) const override;
+
+public:
+  BranchInstructionAndTargetAlignmentPolicy();
+  virtual ~BranchInstructionAndTargetAlignmentPolicy() {}
+
+  /// Determines if a basic block may cause the case of first instruction after
+  /// a branch (i.e. first instruction in a basic block reachable by branch)
+  /// being split over more than one instruction window.
+  ///
+  /// A basic block will be considered hazardous by this policy if it is
+  /// reachable by a branch (and not only via fallthrough).
+  ///
+  /// \param Context the context of the padding; embeds the basic block's
+  /// parameters.
+  ///
+  /// \returns true iff \p Context indicates that the basic block is reachable
+  /// via branch.
+  bool basicBlockRequiresPaddingFragment(
+      const MCCodePaddingContext &Context) const override;
+
+  /// Determines if an instruction may cause the case of a branch instruction
+  /// being split over more than one instruction window or a branch not
+  /// being adjacent to the end of its 16B code chunk.
+  ///
+  /// An instruction will be considered hazardous by this policy if it is
+  /// a branch (all types of jmps and calls).
+  ///
+  /// \param Inst Instruction to examine.
+  ///
+  /// \returns true iff \p Inst is a branch (all types of jmps and calls).
+  bool instructionRequiresPaddingFragment(const MCInst &Inst) const override;
+};
+
+} // namespace X86
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCCODEPADDER_H
Index: lib/Target/X86/MCTargetDesc/X86MCCodePadder.cpp
===================================================================
--- /dev/null
+++ lib/Target/X86/MCTargetDesc/X86MCCodePadder.cpp
@@ -0,0 +1,169 @@
+//===-- X86MCCodePadder.cpp - X86 Specific Code Padding Handling -*- C++-*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/X86MCCodePadder.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+namespace X86 {
+
+enum PerfNopFragmentKind {
+  BranchInstructionAndTargetAlignment =
+      MCPaddingFragment::FirstTargetPerfNopFragmentKind
+};
+
+//---------------------------------------------------------------------------
+// X86MCCodePadder
+//
+
+X86MCCodePadder::X86MCCodePadder(StringRef CPU) {
+
+  if (CPU != "sandybridge" && CPU != "corei7-avx" && CPU != "ivybridge" &&
+      CPU != "core-avx-i" && CPU != "haswell" && CPU != "core-avx2" &&
+      CPU != "broadwell" && CPU != "skylake")
+    return;
+
+  addPolicy(new BranchInstructionAndTargetAlignmentPolicy());
+}
+
+bool X86MCCodePadder::basicBlockRequiresInsertionPoint(
+    const MCCodePaddingContext &Context) {
+  // Insertion points are places that, if they contain padding, then this
+  // padding will never be executed (unreachable code).
+  bool BasicBlockHasAlignment =
+      OS->getCurrentFragment() == nullptr ||
+      OS->getCurrentFragment()->getKind() == MCFragment::FT_Align;
+  return MCCodePadder::basicBlockRequiresInsertionPoint(Context) ||
+         (!Context.IsBasicBlockReachableViaFallthrough &&
+          !BasicBlockHasAlignment);
+}
+
+bool X86MCCodePadder::usePoliciesForBasicBlock(
+    const MCCodePaddingContext &Context) {
+  return MCCodePadder::usePoliciesForBasicBlock(Context) &&
+         Context.IsBasicBlockInsideInnermostLoop;
+}
+
+//---------------------------------------------------------------------------
+// Utility functions
+//
+
+static bool isConditionalJump(const MCInst &Inst) {
+  unsigned int opcode = Inst.getOpcode();
+  return
+      // Immediate jmps
+      opcode == JAE_1 || opcode == JAE_2 || opcode == JAE_4 || opcode == JA_1 ||
+      opcode == JA_2 || opcode == JA_4 || opcode == JBE_1 || opcode == JBE_2 ||
+      opcode == JBE_4 || opcode == JB_1 || opcode == JB_2 || opcode == JB_4 ||
+      opcode == JCXZ || opcode == JECXZ || opcode == JE_1 || opcode == JE_2 ||
+      opcode == JE_4 || opcode == JGE_1 || opcode == JGE_2 || opcode == JGE_4 ||
+      opcode == JG_1 || opcode == JG_2 || opcode == JG_4 || opcode == JLE_1 ||
+      opcode == JLE_2 || opcode == JLE_4 || opcode == JL_1 || opcode == JL_2 ||
+      opcode == JL_4 || opcode == JNE_1 || opcode == JNE_2 || opcode == JNE_4 ||
+      opcode == JNO_1 || opcode == JNO_2 || opcode == JNO_4 ||
+      opcode == JNP_1 || opcode == JNP_2 || opcode == JNP_4 ||
+      opcode == JNS_1 || opcode == JNS_2 || opcode == JNS_4 || opcode == JO_1 ||
+      opcode == JO_2 || opcode == JO_4 || opcode == JP_1 || opcode == JP_2 ||
+      opcode == JP_4 || opcode == JRCXZ || opcode == JS_1 || opcode == JS_2 ||
+      opcode == JS_4;
+}
+
+static bool isFarOrIndirectUnconditionalJump(const MCInst &Inst) {
+  unsigned int opcode = Inst.getOpcode();
+  return
+      // Far jmps
+      opcode == FARJMP16i || opcode == FARJMP16m || opcode == FARJMP32i ||
+      opcode == FARJMP32m || opcode == FARJMP64 ||
+      // Memory and register jmps
+      opcode == JMP16m || opcode == JMP16r || opcode == JMP32m ||
+      opcode == JMP32r || opcode == JMP64m || opcode == JMP64r;
+}
+
+static bool isUnconditionalJump(const MCInst &Inst) {
+  unsigned int opcode = Inst.getOpcode();
+  return isFarOrIndirectUnconditionalJump(Inst) || opcode == JMP_1 ||
+         opcode == JMP_2 || opcode == JMP_4;
+}
+
+static bool isJump(const MCInst &Inst) {
+  return isConditionalJump(Inst) || isUnconditionalJump(Inst);
+}
+
+static bool isCall(const MCInst &Inst) {
+  unsigned int opcode = Inst.getOpcode();
+  return
+      // PC relative calls
+      opcode == CALL64pcrel32 || opcode == CALLpcrel16 ||
+      opcode == CALLpcrel32 ||
+      // Far calls
+      opcode == FARCALL16i || opcode == FARCALL16m || opcode == FARCALL32i ||
+      opcode == FARCALL32m || opcode == FARCALL64 ||
+      // Memory and register calls
+      opcode == CALL16m || opcode == CALL16r || opcode == CALL32m ||
+      opcode == CALL32r || opcode == CALL64m || opcode == CALL64r;
+}
+
+//---------------------------------------------------------------------------
+// BranchInstructionAndTargetAlignmentPolicy
+//
+
+BranchInstructionAndTargetAlignmentPolicy::
+    BranchInstructionAndTargetAlignmentPolicy()
+    : MCCodePaddingPolicy(BranchInstructionAndTargetAlignment, UINT64_C(16),
+                          false) {}
+
+bool BranchInstructionAndTargetAlignmentPolicy::
+    basicBlockRequiresPaddingFragment(
+        const MCCodePaddingContext &Context) const {
+  return Context.IsBasicBlockReachableViaBranch;
+}
+
+bool BranchInstructionAndTargetAlignmentPolicy::
+    instructionRequiresPaddingFragment(const MCInst &Inst) const {
+  return isJump(Inst) || isCall(Inst);
+}
+
+double BranchInstructionAndTargetAlignmentPolicy::computeWindowPenaltyWeight(
+    const MCPFRange &Window, uint64_t Offset, MCAsmLayout &Layout) const {
+
+  static const double SPLIT_INST_WEIGHT = 10.0;
+  static const double BRANCH_NOT_AT_CHUNK_END_WEIGHT = 1.0;
+
+  double Weight = 0.0;
+  for (const MCPaddingFragment *Fragment : Window) {
+    if (!Fragment->isInstructionInitialized())
+      continue;
+    uint64_t InstructionStartAddress = getNextFragmentOffset(Fragment, Layout);
+    uint64_t InstructionSecondByteAddress =
+        InstructionStartAddress + UINT64_C(1);
+    uint64_t InstructionEndAddress =
+        InstructionStartAddress + Fragment->getInstSize();
+    // Checking if the instruction pointed to by the fragment splits over more
+    // than one window.
+    if (alignTo(InstructionSecondByteAddress, WindowSize) !=
+        alignTo(InstructionEndAddress, WindowSize))
+      Weight += SPLIT_INST_WEIGHT;
+    if (instructionRequiresPaddingFragment(Fragment->getInst()) &&
+        (InstructionEndAddress & UINT64_C(0xF)) != UINT64_C(0))
+      Weight += BRANCH_NOT_AT_CHUNK_END_WEIGHT;
+  }
+  return Weight;
+}
+
+} // namespace X86
+} // namespace llvm
Index: test/CodeGen/X86/branch-instructions-end-of-16B-chunk-perf-nops.mir
===================================================================
--- /dev/null
+++ test/CodeGen/X86/branch-instructions-end-of-16B-chunk-perf-nops.mir
@@ -0,0 +1,163 @@
+# RUN: llc -mcpu=haswell -filetype=obj -start-before stack-protector -O2 %s -o - | llvm-objdump -d - | FileCheck %s
+
+# Source C code:
+# volatile int y;
+# volatile int x;
+#
+# int perfNopsInsertion(int z, int w) {
+#   int result = 0;
+#   while (x > 0 && y < 0) {
+#     switch(z) {
+#       case 0:
+#         result+=result*5;break;
+#       case 1:
+#         result--; break;
+#       case 2:
+#         result *= result; break;
+#       case 3:
+#         result <<= 7; break;
+#       case 4:
+#         result >>= 7; break;
+#       case 5:
+#         result = result * 16 | ~result; break;
+#     }
+#   }
+#   return result;
+# }
+
+# The test: Branch instructions should be pushed to the end of their 16B code
+# chunks.
+# Expect insertion of nops in unreachable code space to make this
+# happen.
+# CHECK: 49: eb 65 jmp 101 +# CHECK-NEXT: 4b: 0f 1f 80 00 00 00 00 nopl (%rax) +# CHECK: 5e: eb 50 jmp 80 +# CHECK-NEXT: 60: 66 90 nop +# CHECK: 6e: eb 40 jmp 64 +# CHECK-NEXT: 70: 66 90 nop +# CHECK: 7e: eb 30 jmp 48 +# CHECK-NEXT: 80: 66 90 nop +# CHECK: 8e: eb 20 jmp 32 +# CHECK-NEXT: 90: 66 2e 0f 1f 84 00 00 00 00 00 nopw %cs:(%rax,%rax) +# CHECK: aa: 0f 8f 70 ff ff ff jg -144 +# CHECK-NEXT: b0: +--- | + ; ModuleID = 'branch_instruction_and_target_split_perf_nops.c' + source_filename = "branch_instruction_and_target_split_perf_nops.c" + target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128" + target triple = "x86_64-haswell-linux-gnu" + + + @x = common global i32 0, align 4 + @y = common global i32 0, align 4 + + ; Function Attrs: norecurse nounwind uwtable + define i32 @perfNopsInsertion(i32 %z, i32 %w) local_unnamed_addr #0 { + entry: + %0 = load volatile i32, i32* @x, align 4, !tbaa !3 + %cmp19 = icmp sgt i32 %0, 0 + br i1 %cmp19, label %land.rhs.preheader, label %while.end + + land.rhs.preheader: ; preds = %entry + br label %land.rhs + + land.rhs: ; preds = %land.rhs.preheader, %sw.epilog + %result.020 = phi i32 [ %result.1, %sw.epilog ], [ 0, %land.rhs.preheader ] + %1 = load volatile i32, i32* @y, align 4, !tbaa !3 + %cmp1 = icmp slt i32 %1, 0 + br i1 %cmp1, label %while.body, label %while.end + + while.body: ; preds = %land.rhs + switch i32 %z, label %sw.epilog [ + i32 0, label %sw.bb + i32 1, label %sw.bb2 + i32 2, label %sw.bb3 + i32 3, label %sw.bb5 + i32 4, label %sw.bb6 + i32 5, label %sw.bb7 + ] + + sw.bb: ; preds = %while.body + %add = mul nsw i32 %result.020, 6 + br label %sw.epilog + + sw.bb2: ; preds = %while.body + %dec = add nsw i32 %result.020, -1 + br label %sw.epilog + + sw.bb3: ; preds = %while.body + %mul4 = mul nsw i32 %result.020, %result.020 + br label %sw.epilog + + sw.bb5: ; preds = %while.body + %shl = shl i32 %result.020, 7 + br label %sw.epilog + + sw.bb6: ; preds = %while.body + %shr = ashr i32 %result.020, 7 + br label 
%sw.epilog + + sw.bb7: ; preds = %while.body + %mul8 = shl nsw i32 %result.020, 4 + %neg = xor i32 %result.020, -1 + %or = or i32 %mul8, %neg + br label %sw.epilog + + sw.epilog: ; preds = %while.body, %sw.bb7, %sw.bb6, %sw.bb5, %sw.bb3, %sw.bb2, %sw.bb + %result.1 = phi i32 [ %result.020, %while.body ], [ %or, %sw.bb7 ], [ %shr, %sw.bb6 ], [ %shl, %sw.bb5 ], [ %mul4, %sw.bb3 ], [ %dec, %sw.bb2 ], [ %add, %sw.bb ] + %2 = load volatile i32, i32* @x, align 4, !tbaa !3 + %cmp = icmp sgt i32 %2, 0 + br i1 %cmp, label %land.rhs, label %while.end + + while.end: ; preds = %land.rhs, %sw.epilog, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %result.1, %sw.epilog ], [ %result.020, %land.rhs ] + ret i32 %result.0.lcssa + } + + attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="haswell" "target-features"="+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+rdrnd,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } + + !llvm.module.flags = !{!0, !1} + !llvm.ident = !{!2} + + !0 = !{i32 1, !"wchar_size", i32 2} + !1 = !{i32 7, !"PIC Level", i32 2} + !2 = !{!"clang version 6.0.0 (ssh://git-amr-1.devtools.intel.com:29418/dpd_icl-llvm_clang_worldread 3789ad4283ec09df1ed8411abbb227d76e7ef8cb) (ssh://git-amr-1.devtools.intel.com:29418/dpd_icl-llvm_llvm_worldread 4dc3dc453fd737d29001434b891a613f5fc22638)"} + !3 = !{!4, !4, i64 0} + !4 = !{!"int", !5, i64 0} + !5 = !{!"omnipotent char", !6, i64 0} + !6 = !{!"Simple C/C++ TBAA"} +... 
+--- +name: perfNopsInsertion +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: +liveins: +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 4294967295 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + savePoint: '' + restorePoint: '' +fixedStack: +stack: +constants: + +body: | + +...