Index: include/llvm/Target/TargetInstrInfo.h =================================================================== --- include/llvm/Target/TargetInstrInfo.h +++ include/llvm/Target/TargetInstrInfo.h @@ -85,6 +85,13 @@ isReallyTriviallyReMaterializableGeneric(MI, AA))); } + /// Determines the size of the specified MachineInstr. + /// \returns -1 if the target was unable to determine the size of the + /// instruction. + virtual unsigned GetInstSizeInBytes(const MachineInstr *MI) const { + return ~0x0; + } + protected: /// For instructions with opcodes for which the M_REMATERIALIZABLE flag is /// set, this hook lets the target specify whether the instruction is actually Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -56,6 +56,10 @@ "Potential frequency of taking conditional branches"); STATISTIC(UncondBranchTakenFreq, "Potential frequency of taking unconditional branches"); +STATISTIC(NumAlignedBlocks, "Number of aligned blocks"); +STATISTIC( + NumAlignedBlocksWithoutNops, + "Number of aligned blocks that don't insert nops in instruction stream"); static cl::opt<unsigned> AlignAllBlock("align-all-blocks", cl::desc("Force the alignment of all " @@ -218,6 +222,10 @@ namespace { class MachineBlockPlacement : public MachineFunctionPass { + /// \brief Did the MBP pass set the alignment of any MBBs? If so, optimize + /// nop placement, if possible. + bool SetAlignmentOfBlocks; + /// \brief A typedef for a block filter set. 
typedef SmallPtrSet<MachineBasicBlock *, 16> BlockFilterSet; @@ -299,9 +307,12 @@ const BlockFilterSet &LoopBlockSet); void buildCFGChains(MachineFunction &F); + void optimizeAlignmentNops(MachineFunction &F); + public: static char ID; // Pass identification, replacement for typeid - MachineBlockPlacement() : MachineFunctionPass(ID) { + MachineBlockPlacement() + : MachineFunctionPass(ID), SetAlignmentOfBlocks(false) { initializeMachineBlockPlacementPass(*PassRegistry::getPassRegistry()); } @@ -1358,6 +1369,8 @@ // Force alignment if all the predecessors are jumps. We already checked // that the block isn't cold above. if (!LayoutPred->isSuccessor(ChainBB)) { + ++NumAlignedBlocks; + SetAlignmentOfBlocks = true; ChainBB->setAlignment(Align); continue; } @@ -1369,8 +1382,112 @@ BranchProbability LayoutProb = MBPI->getEdgeProbability(LayoutPred, ChainBB); BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb; - if (LayoutEdgeFreq <= (Freq * ColdProb)) + if (LayoutEdgeFreq <= (Freq * ColdProb)) { + ++NumAlignedBlocks; + SetAlignmentOfBlocks = true; ChainBB->setAlignment(Align); + } + } +} + +namespace { + +typedef struct AlignmentRegion { + // Alignment. + unsigned Alignment; + + // Total bytes in the region. + unsigned NumBytes; + + // Track blocks that have no fall-through predecessors. Aligning these blocks + // does not insert nops into the dynamic instruction stream. + SmallVector<std::pair<MachineBasicBlock *, unsigned>, 4> NonFallThruBlocks; + + AlignmentRegion(unsigned A) : Alignment(A), NumBytes(0) {} +} AlignmentRegion; + +} // namespace + +void MachineBlockPlacement::optimizeAlignmentNops(MachineFunction &F) { + bool EntryBlock = true; + AlignmentRegion *CurrRegion = nullptr; + SmallVector<AlignmentRegion, 4> Regions; + + for (auto MBBI = F.begin(), MBBE = F.end(); MBBI != MBBE; ++MBBI) { + unsigned Alignment = MBBI->getAlignment(); + MachineBasicBlock *MBB = &*MBBI; + + // If this block is aligned (or is the entry block), create a new region. 
+ if (Alignment || EntryBlock) { + unsigned FnAlignment = F.getAlignment(); + if (EntryBlock && !Alignment) + Alignment = FnAlignment; + Regions.push_back(Alignment); + CurrRegion = &Regions.back(); + } else { + assert(!EntryBlock && "Didn't expect the entry block."); + // Determine if this is a non fall-through block. + auto LayoutPred = std::prev(MBBI); + if (!LayoutPred->isSuccessor(MBB)) + CurrRegion->NonFallThruBlocks.push_back( + std::make_pair(MBB, CurrRegion->NumBytes)); + } + EntryBlock = false; + + // Count the number of bytes in the block. + for (auto &I : *MBB) { + // Conservatively assume we don't know the size of inline assembly and + // bail. + if (I.isInlineAsm()) + return; + + unsigned NumBytes = TII->GetInstSizeInBytes(&I); + if (NumBytes == (unsigned)~0x0) + return; + + CurrRegion->NumBytes += NumBytes; + } + } + // If we have a single region we don't have any aligned block in the function. + if (Regions.size() == 1) + return; + + // See if we can align the function to minimize nops in the first region. + bool AlignedFunction = false; + if (F.getAlignment() < Regions[1].Alignment) { + unsigned PrefAlignment = Regions[1].Alignment; + unsigned Mask = (1 << PrefAlignment) - 1; + unsigned UnalignedBytes = Regions[0].NumBytes & Mask; + if (!UnalignedBytes) { + AlignedFunction = true; + ++NumAlignedBlocksWithoutNops; + F.setAlignment(PrefAlignment); + } + } + + // Align non-fall through blocks to minimize nops in the dynamic instruction + // stream. + for (unsigned i = AlignedFunction ? 1 : 0, ie = Regions.size() - 1; i != ie; + ++i) { + // Get the preferred alignment for the next region. + unsigned PrefAlignment = Regions[i + 1].Alignment; + unsigned Mask = (1 << PrefAlignment) - 1; + unsigned RegionBytes = Regions[i].NumBytes; + // Look for a NFTB that can be aligned to remove all nops. 
+ bool Success = false; + for (unsigned j = 0, je = Regions[i].NonFallThruBlocks.size(); j != je; + ++j) { + unsigned NFTBBytes = Regions[i].NonFallThruBlocks[j].second; + unsigned NFTBUnalignedBytes = (RegionBytes - NFTBBytes) & Mask; + if (!NFTBUnalignedBytes) { + MachineBasicBlock *MBB = Regions[i].NonFallThruBlocks[j].first; + assert(!MBB->getAlignment() && "Aligned blocks start new regions."); + ++NumAlignedBlocksWithoutNops; + MBB->setAlignment(PrefAlignment); + Success = true; + break; + } + } } } @@ -1395,6 +1512,10 @@ BlockToChain.clear(); ChainAllocator.DestroyAll(); + // If we aligned any basic blocks, optimize the placement of the nops. + if (SetAlignmentOfBlocks) + optimizeAlignmentNops(F); + if (AlignAllBlock) // Align all of the blocks in the function to a specific alignment. for (MachineBasicBlock &MBB : F) Index: lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.h +++ lib/Target/AArch64/AArch64InstrInfo.h @@ -45,7 +45,7 @@ /// always be able to get register info as well (through this method). const AArch64RegisterInfo &getRegisterInfo() const { return RI; } - unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + unsigned GetInstSizeInBytes(const MachineInstr *MI) const override; bool isAsCheapAsAMove(const MachineInstr *MI) const override; Index: lib/Target/ARM/ARMBaseInstrInfo.h =================================================================== --- lib/Target/ARM/ARMBaseInstrInfo.h +++ lib/Target/ARM/ARMBaseInstrInfo.h @@ -156,7 +156,7 @@ /// GetInstSize - Returns the size of the specified MachineInstr. 
/// - virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const; + virtual unsigned GetInstSizeInBytes(const MachineInstr *MI) const override; unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const override; Index: lib/Target/MSP430/MSP430InstrInfo.h =================================================================== --- lib/Target/MSP430/MSP430InstrInfo.h +++ lib/Target/MSP430/MSP430InstrInfo.h @@ -69,7 +69,7 @@ const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; - unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + unsigned GetInstSizeInBytes(const MachineInstr *MI) const override; // Branch folding goodness bool Index: lib/Target/Mips/MipsInstrInfo.h =================================================================== --- lib/Target/Mips/MipsInstrInfo.h +++ lib/Target/Mips/MipsInstrInfo.h @@ -92,7 +92,7 @@ virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0; /// Return the number of bytes of code the specified instruction may be. - unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + unsigned GetInstSizeInBytes(const MachineInstr *MI) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Index: lib/Target/PowerPC/PPCInstrInfo.h =================================================================== --- lib/Target/PowerPC/PPCInstrInfo.h +++ lib/Target/PowerPC/PPCInstrInfo.h @@ -260,7 +260,7 @@ /// GetInstSize - Return the number of bytes of code the specified /// instruction may be. This returns the maximum number of bytes. /// - unsigned GetInstSizeInBytes(const MachineInstr *MI) const; + unsigned GetInstSizeInBytes(const MachineInstr *MI) const override; void getNoopForMachoTarget(MCInst &NopInst) const override;