Index: llvm/trunk/include/llvm/CodeGen/MachineBasicBlock.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/MachineBasicBlock.h +++ llvm/trunk/include/llvm/CodeGen/MachineBasicBlock.h @@ -477,6 +477,11 @@ /// probabilities may need to be normalized. void copySuccessor(MachineBasicBlock *Orig, succ_iterator I); + /// Split the old successor into old plus new and updates the probability + /// info. + void splitSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New, + bool NormalizeSuccProbs = false); + /// Transfers all the successors from MBB to this machine basic block (i.e., /// copies all the successors FromMBB and remove all the successors from /// FromMBB). Index: llvm/trunk/lib/CodeGen/MachineBasicBlock.cpp =================================================================== --- llvm/trunk/lib/CodeGen/MachineBasicBlock.cpp +++ llvm/trunk/lib/CodeGen/MachineBasicBlock.cpp @@ -659,6 +659,25 @@ Succ->addPredecessor(this); } +void MachineBasicBlock::splitSuccessor(MachineBasicBlock *Old, + MachineBasicBlock *New, + bool NormalizeSuccProbs) { + succ_iterator OldI = llvm::find(successors(), Old); + assert(OldI != succ_end() && "Old is not a successor of this block!"); + assert(llvm::find(successors(), New) == succ_end() && + "New is already a successor of this block!"); + + // Add a new successor with equal probability as the original one. Note + // that we directly copy the probability using the iterator rather than + // getting a potentially synthetic probability computed when unknown. This + // preserves the probabilities as-is and then we can renormalize them and + // query them effectively afterward. + addSuccessor(New, Probs.empty() ? BranchProbability::getUnknown() + : *getProbabilityIterator(OldI)); + if (NormalizeSuccProbs) + normalizeSuccProbs(); +} + void MachineBasicBlock::removeSuccessor(MachineBasicBlock *Succ, bool NormalizeSuccProbs) { succ_iterator I = find(Successors, Succ); Index: llvm/trunk/lib/Target/X86/CMakeLists.txt =================================================================== --- llvm/trunk/lib/Target/X86/CMakeLists.txt +++ llvm/trunk/lib/Target/X86/CMakeLists.txt @@ -57,6 +57,7 @@ X86RetpolineThunks.cpp X86SelectionDAGInfo.cpp X86ShuffleDecodeConstantPool.cpp + X86SpeculativeLoadHardening.cpp X86Subtarget.cpp X86TargetMachine.cpp X86TargetObjectFile.cpp Index: llvm/trunk/lib/Target/X86/X86.h =================================================================== --- llvm/trunk/lib/Target/X86/X86.h +++ llvm/trunk/lib/Target/X86/X86.h @@ -127,6 +127,8 @@ void initializeEvexToVexInstPassPass(PassRegistry &); +FunctionPass *createX86SpeculativeLoadHardeningPass(); + } // End llvm namespace #endif Index: llvm/trunk/lib/Target/X86/X86SpeculativeLoadHardening.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86SpeculativeLoadHardening.cpp +++ llvm/trunk/lib/Target/X86/X86SpeculativeLoadHardening.cpp @@ -0,0 +1,1667 @@ +//====- X86SpeculativeLoadHardening.cpp - A Spectre v1 mitigation ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
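A minimal usage sketch for the new splitSuccessor() hook (not part of the patch; MF is an assumed MachineFunction, MBB an assumed block, and Old one of its existing successors). It mirrors what splitEdge() in the new pass does further below:

  MachineBasicBlock *New = MF.CreateMachineBasicBlock();
  MF.insert(std::next(MachineFunction::iterator(&MBB)), New);
  // New inherits Old's edge probability verbatim; pass true to renormalize afterwards.
  MBB.splitSuccessor(&Old, New, /*NormalizeSuccProbs=*/true);
  New->addSuccessor(&Old);

Because the probability is copied from the old edge rather than synthesized, a caller that splits many edges can skip per-call normalization and renormalize each block once at the end, which is what the hardening pass below does.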
+// +//===----------------------------------------------------------------------===// +/// \file +/// +/// Provide a pass which mitigates speculative execution attacks which operate +/// by speculating incorrectly past some predicate (a type check, bounds check, +/// or other condition) to reach a load with invalid inputs and leak the data +/// accessed by that load using a side channel out of the speculative domain. +/// +/// For details on the attacks, see the first variant in both the Project Zero +/// writeup and the Spectre paper: +/// https://googleprojectzero.blogspot.com/2018/01/reading-privileged-memory-with-side.html +/// https://spectreattack.com/spectre.pdf +/// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrBuilder.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseBitVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineConstantPool.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineSSAUpdater.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include + +using namespace llvm; + +#define PASS_KEY "x86-speculative-load-hardening" +#define DEBUG_TYPE PASS_KEY + +STATISTIC(NumCondBranchesTraced, "Number of conditional branches traced"); +STATISTIC(NumBranchesUntraced, "Number of branches unable to trace"); +STATISTIC(NumAddrRegsHardened, + "Number of address mode used registers hardaned"); +STATISTIC(NumPostLoadRegsHardened, + "Number of post-load register values hardened"); +STATISTIC(NumInstsInserted, "Number of instructions inserted"); +STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted"); + +static cl::opt HardenEdgesWithLFENCE( + PASS_KEY "-lfence", + cl::desc( + "Use LFENCE along each conditional edge to harden against speculative " + "loads rather than conditional movs and poisoned pointers."), + cl::init(false), cl::Hidden); + +static cl::opt EnablePostLoadHardening( + PASS_KEY "-post-load", + cl::desc("Harden the value loaded *after* it is loaded by " + "flushing the loaded bits to 1. 
This is hard to do " + "in general but can be done easily for GPRs."), + cl::init(true), cl::Hidden); + +static cl::opt FenceCallAndRet( + PASS_KEY "-fence-call-and-ret", + cl::desc("Use a full speculation fence to harden both call and ret edges " + "rather than a lighter weight mitigation."), + cl::init(false), cl::Hidden); + +static cl::opt HardenInterprocedurally( + PASS_KEY "-ip", + cl::desc("Harden interprocedurally by passing our state in and out of " + "functions in the high bits of the stack pointer."), + cl::init(true), cl::Hidden); + +static cl::opt + HardenLoads(PASS_KEY "-loads", + cl::desc("Sanitize loads from memory. When disable, no " + "significant security is provided."), + cl::init(true), cl::Hidden); + +namespace llvm { + +void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &); + +} // end namespace llvm + +namespace { + +class X86SpeculativeLoadHardeningPass : public MachineFunctionPass { +public: + X86SpeculativeLoadHardeningPass() : MachineFunctionPass(ID) { + initializeX86SpeculativeLoadHardeningPassPass( + *PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "X86 speculative load hardening"; + } + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + /// Pass identification, replacement for typeid. + static char ID; + +private: + /// The information about a block's conditional terminators needed to trace + /// our predicate state through the exiting edges. + struct BlockCondInfo { + MachineBasicBlock *MBB; + + // We mostly have one conditional branch, and in extremely rare cases have + // two. Three and more are so rare as to be unimportant for compile time. + SmallVector CondBrs; + + MachineInstr *UncondBr; + }; + + const X86Subtarget *Subtarget; + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + const TargetRegisterInfo *TRI; + const TargetRegisterClass *PredStateRC; + + void hardenEdgesWithLFENCE(MachineFunction &MF); + + SmallVector collectBlockCondInfo(MachineFunction &MF); + + void checkAllLoads(MachineFunction &MF, MachineSSAUpdater &PredStateSSA); + + unsigned saveEFLAGS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, DebugLoc Loc); + void restoreEFLAGS(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, DebugLoc Loc, + unsigned OFReg); + + void mergePredStateIntoSP(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, DebugLoc Loc, + unsigned PredStateReg); + unsigned extractPredStateFromSP(MachineBasicBlock &MBB, + MachineBasicBlock::iterator InsertPt, + DebugLoc Loc); + + void + hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO, + MachineOperand &IndexMO, MachineSSAUpdater &PredStateSSA, + SmallDenseMap &AddrRegToHardenedReg); + MachineInstr * + sinkPostLoadHardenedInst(MachineInstr &MI, + SmallPtrSetImpl &HardenedLoads); + void hardenPostLoad(MachineInstr &MI, MachineSSAUpdater &PredStateSSA); + void checkReturnInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA); + void checkCallInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA); +}; + +} // end anonymous namespace + +char X86SpeculativeLoadHardeningPass::ID = 0; + +void X86SpeculativeLoadHardeningPass::getAnalysisUsage( + AnalysisUsage &AU) const { + MachineFunctionPass::getAnalysisUsage(AU); +} + +static MachineBasicBlock &splitEdge(MachineBasicBlock &MBB, + MachineBasicBlock &Succ, int SuccCount, + MachineInstr *Br, MachineInstr *&UncondBr, + const X86InstrInfo &TII) { + assert(!Succ.isEHPad() && "Shouldn't get edges to EH 
pads!"); + + MachineFunction &MF = *MBB.getParent(); + + MachineBasicBlock &NewMBB = *MF.CreateMachineBasicBlock(); + + // We have to insert the new block immediately after the current one as we + // don't know what layout-successor relationships the successor has and we + // may not be able to (and generally don't want to) try to fix those up. + MF.insert(std::next(MachineFunction::iterator(&MBB)), &NewMBB); + + // Update the branch instruction if necessary. + if (Br) { + assert(Br->getOperand(0).getMBB() == &Succ && + "Didn't start with the right target!"); + Br->getOperand(0).setMBB(&NewMBB); + + // If this successor was reached through a branch rather than fallthrough, + // we might have *broken* fallthrough and so need to inject a new + // unconditional branch. + if (!UncondBr) { + MachineBasicBlock &OldLayoutSucc = + *std::next(MachineFunction::iterator(&NewMBB)); + assert(MBB.isSuccessor(&OldLayoutSucc) && + "Without an unconditional branch, the old layout successor should " + "be an actual successor!"); + auto BrBuilder = + BuildMI(&MBB, DebugLoc(), TII.get(X86::JMP_1)).addMBB(&OldLayoutSucc); + // Update the unconditional branch now that we've added one. + UncondBr = &*BrBuilder; + } + + // Insert unconditional "jump Succ" instruction in the new block if + // necessary. + if (!NewMBB.isLayoutSuccessor(&Succ)) { + SmallVector Cond; + TII.insertBranch(NewMBB, &Succ, nullptr, Cond, Br->getDebugLoc()); + } + } else { + assert(!UncondBr && + "Cannot have a branchless successor and an unconditional branch!"); + assert(NewMBB.isLayoutSuccessor(&Succ) && + "A non-branch successor must have been a layout successor before " + "and now is a layout successor of the new block."); + } + + // If this is the only edge to the successor, we can just replace it in the + // CFG. Otherwise we need to add a new entry in the CFG for the new + // successor. + if (SuccCount == 1) { + MBB.replaceSuccessor(&Succ, &NewMBB); + } else { + MBB.splitSuccessor(&Succ, &NewMBB); + } + + // Hook up the edge from the new basic block to the old successor in the CFG. + NewMBB.addSuccessor(&Succ); + + // Fix PHI nodes in Succ so they refer to NewMBB instead of MBB. + for (MachineInstr &MI : Succ) { + if (!MI.isPHI()) + break; + for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps; + OpIdx += 2) { + MachineOperand &OpV = MI.getOperand(OpIdx); + MachineOperand &OpMBB = MI.getOperand(OpIdx + 1); + assert(OpMBB.isMBB() && "Block operand to a PHI is not a block!"); + if (OpMBB.getMBB() != &MBB) + continue; + + // If this is the last edge to the succesor, just replace MBB in the PHI + if (SuccCount == 1) { + OpMBB.setMBB(&NewMBB); + break; + } + + // Otherwise, append a new pair of operands for the new incoming edge. + MI.addOperand(MF, OpV); + MI.addOperand(MF, MachineOperand::CreateMBB(&NewMBB)); + break; + } + } + + // Inherit live-ins from the successor + for (auto &LI : Succ.liveins()) + NewMBB.addLiveIn(LI); + + LLVM_DEBUG(dbgs() << " Split edge from '" << MBB.getName() << "' to '" + << Succ.getName() << "'.\n"); + return NewMBB; +} + +bool X86SpeculativeLoadHardeningPass::runOnMachineFunction( + MachineFunction &MF) { + LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() + << " **********\n"); + + Subtarget = &MF.getSubtarget(); + MRI = &MF.getRegInfo(); + TII = Subtarget->getInstrInfo(); + TRI = Subtarget->getRegisterInfo(); + // FIXME: Support for 32-bit. + PredStateRC = &X86::GR64_NOSPRegClass; + + if (MF.begin() == MF.end()) + // Nothing to do for a degenerate empty function... 
+ return false; + + // We support an alternative hardening technique based on a debug flag. + if (HardenEdgesWithLFENCE) { + hardenEdgesWithLFENCE(MF); + return true; + } + + // Create a dummy debug loc to use for all the generated code here. + DebugLoc Loc; + + MachineBasicBlock &Entry = *MF.begin(); + auto EntryInsertPt = Entry.SkipPHIsLabelsAndDebug(Entry.begin()); + + // Do a quick scan to see if we have any checkable loads. + bool HasCheckableLoad = false; + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + // Stop searching blocks at an LFENCE. + if (MI.getOpcode() == X86::LFENCE) + break; + + // Looking for loads only. + if (!MI.mayLoad()) + continue; + + // An MFENCE is modeled as a load but doesn't require hardening. + if (MI.getOpcode() == X86::MFENCE) + continue; + + HasCheckableLoad = true; + break; + } + if (HasCheckableLoad) + break; + } + + // See if we have any conditional branching blocks that we will need to trace + // predicate state through. + SmallVector Infos = collectBlockCondInfo(MF); + + // If we have no interesting conditions or loads, nothing to do here. + if (!HasCheckableLoad && Infos.empty()) + return true; + + unsigned PredStateReg; + unsigned PredStateSizeInBytes = TRI->getRegSizeInBits(*PredStateRC) / 8; + + // The poison value is required to be an all-ones value for many aspects of + // this mitigation. + const int PoisonVal = -1; + unsigned PoisonReg = MRI->createVirtualRegister(PredStateRC); + BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV64ri32), PoisonReg) + .addImm(PoisonVal); + ++NumInstsInserted; + + // If we have loads being hardened and we've asked for call and ret edges to + // get a full fence-based mitigation, inject that fence. + if (HasCheckableLoad && FenceCallAndRet) { + // We need to insert an LFENCE at the start of the function to suspend any + // incoming misspeculation from the caller. This helps two-fold: the caller + // may not have been protected as this code has been, and this code gets to + // not take any specific action to protect across calls. + // FIXME: We could skip this for functions which unconditionally return + // a constant. + BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::LFENCE)); + ++NumInstsInserted; + ++NumLFENCEsInserted; + } + + // If we have no conditionals to protect in blocks, then all we needed to do + // was protect the entry and so we're done. + if (Infos.empty()) + // We may have changed the function's code at this point to insert fences. + return true; + + // For every basic block in the function which can b + if (HardenInterprocedurally && !FenceCallAndRet) { + // Set up the predicate state by extracting it from the incoming stack + // pointer so we pick up any misspeculation in our caller. + PredStateReg = extractPredStateFromSP(Entry, EntryInsertPt, Loc); + } else { + // Otherwise, just build the predicate state itself by zeroing a register + // as we don't need any initial state. 
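The two values the predicate state can take are set up above and are easiest to sanity-check with a tiny standalone illustration (plain C++, not LLVM code; the address constant is made up). MOV64ri32 with an immediate of -1 sign-extends to an all-ones 64-bit value, and all-ones is what lets the OR-based hardening later in the pass erase any address:

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t Poison = ~0ULL;            // MOV64ri32 $-1, sign-extended to all ones
    const uint64_t ZeroState = 0;             // initial state on the non-speculative path
    uint64_t AnyAddr = 0x00007f00deadbeefULL; // hypothetical attacker-reachable address
    assert((AnyAddr | Poison) == Poison);     // ORing all-ones leaves no usable address bits
    assert((AnyAddr | ZeroState) == AnyAddr); // a zero state is a no-op on the hot path
    return 0;
  }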
+ PredStateReg = MRI->createVirtualRegister(PredStateRC); + unsigned PredStateSubReg = MRI->createVirtualRegister(&X86::GR32RegClass); + auto ZeroI = BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::MOV32r0), + PredStateSubReg); + ++NumInstsInserted; + MachineOperand *ZeroEFLAGSDefOp = + ZeroI->findRegisterDefOperand(X86::EFLAGS); + assert(ZeroEFLAGSDefOp && ZeroEFLAGSDefOp->isImplicit() && + "Must have an implicit def of EFLAGS!"); + ZeroEFLAGSDefOp->setIsDead(true); + BuildMI(Entry, EntryInsertPt, Loc, TII->get(X86::SUBREG_TO_REG), + PredStateReg) + .addImm(0) + .addReg(PredStateSubReg) + .addImm(X86::sub_32bit); + } + + // We're going to need to trace predicate state throughout the function's + // CFG. Prepare for this by setting up our initial state of PHIs with unique + // predecessor entries and all the initial predicate state. + + // FIXME: It's really frustrating that we have to do this, but SSA-form in + // MIR isn't what you might expect. We may have multiple entries in PHI nodes + // for a single predecessor. This makes CFG-updating extremely complex, so + // here we simplify all PHI nodes to a model even simpler than the IR's + // model: exactly one entry per predecessor, regardless of how many edges + // there are. + SmallPtrSet Preds; + SmallVector DupIndices; + for (auto &MBB : MF) + for (auto &MI : MBB) { + if (!MI.isPHI()) + break; + + // First we scan the operands of the PHI looking for duplicate entries + // a particular predecessor. We retain the operand index of each duplicate + // entry found. + for (int OpIdx = 1, NumOps = MI.getNumOperands(); OpIdx < NumOps; + OpIdx += 2) + if (!Preds.insert(MI.getOperand(OpIdx + 1).getMBB()).second) + DupIndices.push_back(OpIdx); + + // Now walk the duplicate indices, removing both the block and value. Note + // that these are stored as a vector making this element-wise removal + // :w + // potentially quadratic. + // + // FIXME: It is really frustrating that we have to use a quadratic + // removal algorithm here. There should be a better way, but the use-def + // updates required make that impossible using the public API. + // + // Note that we have to process these backwards so that we don't + // invalidate other indices with each removal. + while (!DupIndices.empty()) { + int OpIdx = DupIndices.pop_back_val(); + // Remove both the block and value operand, again in reverse order to + // preserve indices. + MI.RemoveOperand(OpIdx + 1); + MI.RemoveOperand(OpIdx); + } + + Preds.clear(); + } + + // Track the updated values in an SSA updater to rewrite into SSA form at the + // end. + MachineSSAUpdater PredStateSSA(MF); + PredStateSSA.Initialize(PredStateReg); + PredStateSSA.AddAvailableValue(&Entry, PredStateReg); + // Collect the inserted instructions so we can rewrite their uses of the + // predicate state into SSA form. + SmallVector CMovs; + + // Now walk all of the basic blocks looking for ones that end in conditional + // jumps where we need to update this register along each edge. + for (BlockCondInfo &Info : Infos) { + MachineBasicBlock &MBB = *Info.MBB; + SmallVectorImpl &CondBrs = Info.CondBrs; + MachineInstr *UncondBr = Info.UncondBr; + + LLVM_DEBUG(dbgs() << "Tracing predicate through block: " << MBB.getName() + << "\n"); + ++NumCondBranchesTraced; + + // Compute the non-conditional successor as either the target of any + // unconditional branch or the layout successor. + MachineBasicBlock *UncondSucc = + UncondBr ? (UncondBr->getOpcode() == X86::JMP_1 + ? 
UncondBr->getOperand(0).getMBB() + : nullptr) + : &*std::next(MachineFunction::iterator(&MBB)); + + // Count how many edges there are to any given successor. + SmallDenseMap SuccCounts; + if (UncondSucc) + ++SuccCounts[UncondSucc]; + for (auto *CondBr : CondBrs) + ++SuccCounts[CondBr->getOperand(0).getMBB()]; + + // A lambda to insert cmov instructions into a block checking all of the + // condition codes in a sequence. + auto BuildCheckingBlockForSuccAndConds = + [&](MachineBasicBlock &MBB, MachineBasicBlock &Succ, int SuccCount, + MachineInstr *Br, MachineInstr *&UncondBr, + ArrayRef Conds) { + // First, we split the edge to insert the checking block into a safe + // location. + auto &CheckingMBB = + (SuccCount == 1 && Succ.pred_size() == 1) + ? Succ + : splitEdge(MBB, Succ, SuccCount, Br, UncondBr, *TII); + + bool LiveEFLAGS = Succ.isLiveIn(X86::EFLAGS); + if (!LiveEFLAGS) + CheckingMBB.addLiveIn(X86::EFLAGS); + + // Now insert the cmovs to implement the checks. + auto InsertPt = CheckingMBB.begin(); + assert( + InsertPt == CheckingMBB.end() || + !InsertPt->isPHI() && + "Should never have a PHI in the initial checking block as it " + "always has a single predecessor!"); + + // We will wire each cmov to each other, but need to start with the + // incoming pred state. + unsigned CurStateReg = PredStateReg; + + for (X86::CondCode Cond : Conds) { + auto CMovOp = X86::getCMovFromCond(Cond, PredStateSizeInBytes); + + unsigned UpdatedStateReg = MRI->createVirtualRegister(PredStateRC); + auto CMovI = BuildMI(CheckingMBB, InsertPt, Loc, TII->get(CMovOp), + UpdatedStateReg) + .addReg(CurStateReg) + .addReg(PoisonReg); + // If this is the last cmov and the EFLAGS weren't originally + // live-in, mark them as killed. + if (!LiveEFLAGS && Cond == Conds.back()) + CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true); + + ++NumInstsInserted; + LLVM_DEBUG(dbgs() << " Inserting cmov: "; CMovI->dump(); + dbgs() << "\n"); + + // The first one of the cmovs will be using the top level + // `PredStateReg` and need to get rewritten into SSA form. + if (CurStateReg == PredStateReg) + CMovs.push_back(&*CMovI); + + // The next cmov should start from this one's def. + CurStateReg = UpdatedStateReg; + } + + // And put the last one into the available values for PredStateSSA. + PredStateSSA.AddAvailableValue(&CheckingMBB, CurStateReg); + }; + + std::vector UncondCodeSeq; + for (auto *CondBr : CondBrs) { + MachineBasicBlock &Succ = *CondBr->getOperand(0).getMBB(); + int &SuccCount = SuccCounts[&Succ]; + + X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode()); + X86::CondCode InvCond = X86::GetOppositeBranchCondition(Cond); + UncondCodeSeq.push_back(Cond); + + BuildCheckingBlockForSuccAndConds(MBB, Succ, SuccCount, CondBr, UncondBr, + {InvCond}); + + // Decrement the successor count now that we've split one of the edges. + // We need to keep the count of edges to the successor accurate in order + // to know above when to *replace* the successor in the CFG vs. just + // adding the new successor. + --SuccCount; + } + + // Since we may have split edges and changed the number of successors, + // normalize the probabilities. This avoids doing it each time we split an + // edge. + MBB.normalizeSuccProbs(); + + // Finally, we need to insert cmovs into the "fallthrough" edge. Here, we + // need to intersect the other condition codes. We can do this by just + // doing a cmov for each one. + if (!UncondSucc) + // If we have no fallthrough to protect (perhaps it is an indirect jump?) 
+ // just skip this and continue. + continue; + + assert(SuccCounts[UncondSucc] == 1 && + "We should never have more than one edge to the unconditional " + "successor at this point because every other edge must have been " + "split above!"); + + // Sort and unique the codes to minimize them. + llvm::sort(UncondCodeSeq.begin(), UncondCodeSeq.end()); + UncondCodeSeq.erase(std::unique(UncondCodeSeq.begin(), UncondCodeSeq.end()), + UncondCodeSeq.end()); + + // Build a checking version of the successor. + BuildCheckingBlockForSuccAndConds(MBB, *UncondSucc, /*SuccCount*/ 1, + UncondBr, UncondBr, UncondCodeSeq); + } + + // We may also enter basic blocks in this function via exception handling + // control flow. Here, if we are hardening interprocedurally, we need to + // re-capture the predicate state from the throwing code. In the Itanium ABI, + // the throw will always look like a call to __cxa_throw and will have the + // predicate state in the stack pointer, so extract fresh predicate state from + // the stack pointer and make it available in SSA. + // FIXME: Handle non-itanium ABI EH models. + if (HardenInterprocedurally) { + for (MachineBasicBlock &MBB : MF) { + assert(!MBB.isEHScopeEntry() && "Only Itanium ABI EH supported!"); + assert(!MBB.isEHFuncletEntry() && "Only Itanium ABI EH supported!"); + assert(!MBB.isCleanupFuncletEntry() && "Only Itanium ABI EH supported!"); + if (!MBB.isEHPad()) + continue; + PredStateSSA.AddAvailableValue( + &MBB, + extractPredStateFromSP(MBB, MBB.SkipPHIsAndLabels(MBB.begin()), Loc)); + } + } + + // Now check all of the loads using the predicate state. + checkAllLoads(MF, PredStateSSA); + + // Now rewrite all the uses of the pred state using the SSA updater so that + // we track updates through the CFG. + for (MachineInstr *CMovI : CMovs) + for (MachineOperand &Op : CMovI->operands()) { + if (!Op.isReg() || Op.getReg() != PredStateReg) + continue; + + PredStateSSA.RewriteUse(Op); + } + + // If we are hardening interprocedurally, find each returning block and + // protect the caller from being returned to through misspeculation. + if (HardenInterprocedurally) + for (MachineBasicBlock &MBB : MF) { + if (MBB.empty()) + continue; + + MachineInstr &MI = MBB.back(); + if (!MI.isReturn()) + continue; + + checkReturnInstr(MI, PredStateSSA); + } + + LLVM_DEBUG(dbgs() << "Final speculative load hardened function:\n"; MF.dump(); + dbgs() << "\n"; MF.verify(this)); + return true; +} + +/// Implements the naive hardening approach of putting an LFENCE after every +/// potentially mis-predicted control flow construct. +/// +/// We include this as an alternative mostly for the purpose of comparison. The +/// performance impact of this is expected to be extremely severe and not +/// practical for any real-world users. +void X86SpeculativeLoadHardeningPass::hardenEdgesWithLFENCE( + MachineFunction &MF) { + // First, we scan the function looking for blocks that are reached along edges + // that we might want to harden. + SmallSetVector Blocks; + for (MachineBasicBlock &MBB : MF) { + // If there are no or only one successor, nothing to do here. + if (MBB.succ_size() <= 1) + continue; + + // Skip blocks unless their terminators start with a branch. Other + // terminators don't seem interesting for guarding against misspeculation. + auto TermIt = MBB.getFirstTerminator(); + if (TermIt == MBB.end() || !TermIt->isBranch()) + continue; + + // Add all the non-EH-pad succossors to the blocks we want to harden. 
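The checking cmovs inserted by BuildCheckingBlockForSuccAndConds above reduce to a simple value-level model (plain C++ illustration, not the emitted MIR; the real CMOVcc reads EFLAGS, which also creates the data dependence the mitigation relies on):

  #include <cstdint>

  // The condition given to the cmov is chosen so it is true exactly when the CPU
  // should not have ended up on this edge: the inverted condition on the taken edge,
  // and each branch's own condition on the fallthrough edge.
  uint64_t traceEdge(uint64_t IncomingState, bool EdgeWasMispredicted) {
    const uint64_t Poison = ~0ULL;
    return EdgeWasMispredicted ? Poison : IncomingState; // CMOVcc state, poison
  }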
We + // skip EH pads because there isn't really a condition of interest on + // entering. + for (MachineBasicBlock *SuccMBB : MBB.successors()) + if (!SuccMBB->isEHPad()) + Blocks.insert(SuccMBB); + } + + for (MachineBasicBlock *MBB : Blocks) { + auto InsertPt = MBB->SkipPHIsAndLabels(MBB->begin()); + BuildMI(*MBB, InsertPt, DebugLoc(), TII->get(X86::LFENCE)); + ++NumInstsInserted; + ++NumLFENCEsInserted; + } +} + +SmallVector +X86SpeculativeLoadHardeningPass::collectBlockCondInfo(MachineFunction &MF) { + SmallVector Infos; + + // Walk the function and build up a summary for each block's conditions that + // we need to trace through. + for (MachineBasicBlock &MBB : MF) { + // If there are no or only one successor, nothing to do here. + if (MBB.succ_size() <= 1) + continue; + + // We want to reliably handle any conditional branch terminators in the + // MBB, so we manually analyze the branch. We can handle all of the + // permutations here, including ones that analyze branch cannot. + // + // The approach is to walk backwards across the terminators, resetting at + // any unconditional non-indirect branch, and track all conditional edges + // to basic blocks as well as the fallthrough or unconditional successor + // edge. For each conditional edge, we track the target and the opposite + // condition code in order to inject a "no-op" cmov into that successor + // that will harden the predicate. For the fallthrough/unconditional + // edge, we inject a separate cmov for each conditional branch with + // matching condition codes. This effectively implements an "and" of the + // condition flags, even if there isn't a single condition flag that would + // directly implement that. We don't bother trying to optimize either of + // these cases because if such an optimization is possible, LLVM should + // have optimized the conditional *branches* in that way already to reduce + // instruction count. This late, we simply assume the minimal number of + // branch instructions is being emitted and use that to guide our cmov + // insertion. + + BlockCondInfo Info = {&MBB, {}, nullptr}; + + // Now walk backwards through the terminators and build up successors they + // reach and the conditions. + for (MachineInstr &MI : llvm::reverse(MBB)) { + // Once we've handled all the terminators, we're done. + if (!MI.isTerminator()) + break; + + // If we see a non-branch terminator, we can't handle anything so bail. + if (!MI.isBranch()) { + Info.CondBrs.clear(); + break; + } + + // If we see an unconditional branch, reset our state, clear any + // fallthrough, and set this is the "else" successor. + if (MI.getOpcode() == X86::JMP_1) { + Info.CondBrs.clear(); + Info.UncondBr = &MI; + continue; + } + + // If we get an invalid condition, we have an indirect branch or some + // other unanalyzable "fallthrough" case. We model this as a nullptr for + // the destination so we can still guard any conditional successors. + // Consider code sequences like: + // ``` + // jCC L1 + // jmpq *%rax + // ``` + // We still want to harden the edge to `L1`. + if (X86::getCondFromBranchOpc(MI.getOpcode()) == X86::COND_INVALID) { + Info.CondBrs.clear(); + Info.UncondBr = &MI; + continue; + } + + // We have a vanilla conditional branch, add it to our list. 
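For reference, the conditional branches collected here are later mapped to condition codes and cmov opcodes with helpers the pass already uses (sketch only; CondBr stands for one collected terminator):

  X86::CondCode Cond = X86::getCondFromBranchOpc(CondBr->getOpcode()); // e.g. COND_NE for JNE_1
  X86::CondCode Inv  = X86::GetOppositeBranchCondition(Cond);          // e.g. COND_E
  unsigned CMovOp    = X86::getCMovFromCond(Inv, /*RegBytes=*/8);      // e.g. CMOVE64rr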
+ Info.CondBrs.push_back(&MI); + } + if (Info.CondBrs.empty()) { + ++NumBranchesUntraced; + LLVM_DEBUG(dbgs() << "WARNING: unable to secure successors of block:\n"; + MBB.dump()); + continue; + } + + Infos.push_back(Info); + } + + return Infos; +} + +/// Returns true if the instruction has no behavior (specified or otherwise) +/// that is based on the value of any of its register operands +/// +/// A classical example of something that is inherently not data invariant is an +/// indirect jump -- the destination is loaded into icache based on the bits set +/// in the jump destination register. +/// +/// FIXME: This should become part of our instruction tables. +static bool isDataInvariant(MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + // By default, assume that the instruction is not data invariant. + return false; + + // FIXME: For now, we just use a very boring, conservative set of unary + // instructions because we're mostly interested in handling simple + // transformations. + case TargetOpcode::COPY: + return true; + } +} + +/// Returns true if the instruction has no behavior (specified or otherwise) +/// that is based on the value loaded from memory or the value of any +/// non-address register operands. +/// +/// For example, if the latency of the instruction is dependent on the +/// particular bits set in any of the registers *or* any of the bits loaded from +/// memory. +/// +/// A classical example of something that is inherently not data invariant is an +/// indirect jump -- the destination is loaded into icache based on the bits set +/// in the jump destination register. +/// +/// FIXME: This should become part of our instruction tables. +static bool isDataInvariantLoad(MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + // By default, assume that the load will immediately leak. + return false; + + // On x86 it is believed that imul is constant time w.r.t. the loaded data. + // However, they set flags and are perhaps the most surprisingly constant + // time operations so we call them out here separately. + case X86::IMUL16rm: + case X86::IMUL16rmi8: + case X86::IMUL16rmi: + case X86::IMUL32rm: + case X86::IMUL32rmi8: + case X86::IMUL32rmi: + case X86::IMUL64rm: + case X86::IMUL64rmi32: + case X86::IMUL64rmi8: + + // Bitfield and bit scanning instructions that are somewhat surprisingly + // constant time as they scan across bits and do other fairly complex + // operations like popcnt, but are believed to be constant time on x86. + // However, these set flags. + case X86::BLCFILL32rm: + case X86::BLCFILL64rm: + case X86::BLCI32rm: + case X86::BLCI64rm: + case X86::BLCIC32rm: + case X86::BLCIC64rm: + case X86::BLCMSK32rm: + case X86::BLCMSK64rm: + case X86::BLCS32rm: + case X86::BLCS64rm: + case X86::BLSFILL32rm: + case X86::BLSFILL64rm: + case X86::BLSI32rm: + case X86::BLSI64rm: + case X86::BLSIC32rm: + case X86::BLSIC64rm: + case X86::BLSMSK32rm: + case X86::BLSMSK64rm: + case X86::BLSR32rm: + case X86::BLSR64rm: + case X86::BZHI32rm: + case X86::BZHI64rm: + case X86::LZCNT16rm: + case X86::LZCNT32rm: + case X86::LZCNT64rm: + case X86::POPCNT16rm: + case X86::POPCNT32rm: + case X86::POPCNT64rm: + case X86::TZCNT16rm: + case X86::TZCNT32rm: + case X86::TZCNT64rm: + case X86::TZMSK32rm: + case X86::TZMSK64rm: + + // Basic arithmetic is constant time on the input but does set flags. 
+ case X86::ADC8rm: + case X86::ADC16rm: + case X86::ADC32rm: + case X86::ADC64rm: + case X86::ADCX32rm: + case X86::ADCX64rm: + case X86::ADD8rm: + case X86::ADD16rm: + case X86::ADD32rm: + case X86::ADD64rm: + case X86::ADOX32rm: + case X86::ADOX64rm: + case X86::AND8rm: + case X86::AND16rm: + case X86::AND32rm: + case X86::AND64rm: + case X86::ANDN32rm: + case X86::ANDN64rm: + case X86::BSF16rm: + case X86::BSF32rm: + case X86::BSF64rm: + case X86::BSR16rm: + case X86::BSR32rm: + case X86::BSR64rm: + case X86::OR8rm: + case X86::OR16rm: + case X86::OR32rm: + case X86::OR64rm: + case X86::SBB8rm: + case X86::SBB16rm: + case X86::SBB32rm: + case X86::SBB64rm: + case X86::SUB8rm: + case X86::SUB16rm: + case X86::SUB32rm: + case X86::SUB64rm: + case X86::XOR8rm: + case X86::XOR16rm: + case X86::XOR32rm: + case X86::XOR64rm: + case X86::BEXTR32rm: + case X86::BEXTR64rm: + case X86::BEXTRI32mi: + case X86::BEXTRI64mi: + // Check whether the EFLAGS implicit-def is dead. We assume that this will + // always find the implicit-def because this code should only be reached + // for instructions that do in fact implicitly def this. + if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) { + // If we would clobber EFLAGS that are used, just bail for now. + LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: "; + MI.dump(); dbgs() << "\n"); + return false; + } + + // Otherwise, fallthrough to handle these the same as instructions that + // don't set EFLAGS. + LLVM_FALLTHROUGH; + + // Integer multiply w/o affecting flags is still believed to be constant + // time on x86. Called out separately as this is among the most surprising + // instructions to exhibit that behavior. + case X86::MULX32rm: + case X86::MULX64rm: + + // Arithmetic instructions that are both constant time and don't set flags. + case X86::PDEP32rm: + case X86::PDEP64rm: + case X86::PEXT32rm: + case X86::PEXT64rm: + case X86::RORX32mi: + case X86::RORX64mi: + case X86::SARX32rm: + case X86::SARX64rm: + case X86::SHLX32rm: + case X86::SHLX64rm: + case X86::SHRX32rm: + case X86::SHRX64rm: + + // Conversions are believed to be constant time and don't set flags. + // FIXME: Add AVX versions. + case X86::CVTSD2SI64rm_Int: + case X86::CVTSD2SIrm_Int: + case X86::CVTSS2SI64rm_Int: + case X86::CVTSS2SIrm_Int: + case X86::CVTTSD2SI64rm: + case X86::CVTTSD2SI64rm_Int: + case X86::CVTTSD2SIrm: + case X86::CVTTSD2SIrm_Int: + case X86::CVTTSS2SI64rm: + case X86::CVTTSS2SI64rm_Int: + case X86::CVTTSS2SIrm: + case X86::CVTTSS2SIrm_Int: + + // Loads to register don't set flags. + case X86::MOV8rm: + case X86::MOV8rm_NOREX: + case X86::MOV16rm: + case X86::MOV32rm: + case X86::MOV64rm: + case X86::MOVSX16rm8: + case X86::MOVSX32rm16: + case X86::MOVSX32rm8: + case X86::MOVSX32rm8_NOREX: + case X86::MOVSX64rm16: + case X86::MOVSX64rm32: + case X86::MOVSX64rm8: + case X86::MOVZX16rm8: + case X86::MOVZX32rm16: + case X86::MOVZX32rm8: + case X86::MOVZX32rm8_NOREX: + case X86::MOVZX64rm16: + case X86::MOVZX64rm8: + return true; + } +} + +static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, + const TargetRegisterInfo &TRI) { + // Check if EFLAGS are alive by seeing if there is a def of them or they + // live-in, and then seeing if that def is in turn used. + for (MachineInstr &MI : llvm::reverse(llvm::make_range(MBB.begin(), I))) { + if (MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) { + // If the def is dead, then EFLAGS is not live. 
+ if (DefOp->isDead()) + return false; + + // Otherwise we've def'ed it, and it is live. + return true; + } + // While at this instruction, also check if we use and kill EFLAGS + // which means it isn't live. + if (MI.killsRegister(X86::EFLAGS, &TRI)) + return false; + } + + // If we didn't find anything conclusive (neither definitely alive or + // definitely dead) return whether it lives into the block. + return MBB.isLiveIn(X86::EFLAGS); +} + +void X86SpeculativeLoadHardeningPass::checkAllLoads( + MachineFunction &MF, MachineSSAUpdater &PredStateSSA) { + // If the actual checking of loads is disabled, skip doing anything here. + if (!HardenLoads) + return; + + SmallPtrSet HardenPostLoad; + SmallPtrSet HardenLoadAddr; + + SmallSet HardenedAddrRegs; + + SmallDenseMap AddrRegToHardenedReg; + + // Track the set of load-dependent registers through the basic block. Because + // the values of these registers have an existing data dependency on a loaded + // value which we would have checked, we can omit any checks on them. + SparseBitVector<> LoadDepRegs; + + for (MachineBasicBlock &MBB : MF) { + // We harden the loads of a basic block in several passes: + // + // 1) Collect all the loads which can have their loaded value hardened + // and all the loads that instead need their address hardened. During + // this walk we propagate load dependence for address hardened loads and + // also look for LFENCE to stop hardening wherever possible. When + // deciding whether or not to harden the loaded value or not, we check + // to see if any registers used in the address will have been hardened + // at this point and if so, harden any remaining address registers as + // that often successfully re-uses hardened addresses and minimizes + // instructions. FIXME: We should consider an aggressive mode where we + // continue to keep as many loads value hardened even when some address + // register hardening would be free (due to reuse). + for (MachineInstr &MI : MBB) { + // We naively assume that all def'ed registers of an instruction have + // a data dependency on all of their operands. + // FIXME: Do a more careful analysis of x86 to build a conservative model + // here. + if (llvm::any_of(MI.uses(), [&](MachineOperand &Op) { + return Op.isReg() && LoadDepRegs.test(Op.getReg()); + })) + for (MachineOperand &Def : MI.defs()) + if (Def.isReg()) + LoadDepRegs.set(Def.getReg()); + + // Both Intel and AMD are guiding that they will change the semantics of + // LFENCE to be a speculation barrier, so if we see an LFENCE, there is + // no more need to guard things in this block. + if (MI.getOpcode() == X86::LFENCE) + break; + + // If this instruction cannot load, nothing to do. + if (!MI.mayLoad()) + continue; + + // Some instructions which "load" are trivially safe or unimportant. + if (MI.getOpcode() == X86::MFENCE) + continue; + + // Extract the memory operand information about this instruction. + // FIXME: This doesn't handle loading pseudo instructions which we often + // could handle with similarly generic logic. We probably need to add an + // MI-layer routine similar to the MC-layer one we use here which maps + // pseudos much like this maps real instructions. 
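  // getMemoryOperandNo() plus getOperandBias() (used just below) yield the start of the
  // fixed five-operand x86 memory reference, laid out as
  //   Base, ScaleAmt, Index, Disp, SegmentReg,
  // which X86::AddrBaseReg (0) and X86::AddrIndexReg (2) index into.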
+ const MCInstrDesc &Desc = MI.getDesc(); + int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags); + if (MemRefBeginIdx < 0) { + LLVM_DEBUG(dbgs() << "WARNING: unable to harden loading instruction: "; + MI.dump()); + continue; + } + + MemRefBeginIdx += X86II::getOperandBias(Desc); + + MachineOperand &BaseMO = MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg); + MachineOperand &IndexMO = + MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg); + + // If we have at least one (non-frame-index, non-RIP) register operand, + // and neither operand is load-dependent, we need to check the load. + unsigned BaseReg = 0, IndexReg = 0; + if (!BaseMO.isFI() && BaseMO.getReg() != X86::RIP && + BaseMO.getReg() != X86::NoRegister) + BaseReg = BaseMO.getReg(); + if (IndexMO.getReg() != X86::NoRegister) + IndexReg = IndexMO.getReg(); + + if (!BaseReg && !IndexReg) + // No register operands! + continue; + + // If any register operand is dependent, this load is dependent and we + // needn't check it. + // FIXME: Is this true in the case where we are hardening loads after + // they complete? Unclear, need to investigate. + if ((BaseReg && LoadDepRegs.test(BaseReg)) || + (IndexReg && LoadDepRegs.test(IndexReg))) + continue; + + // If post-load hardening is enabled, this load is known to be + // data-invariant, and we aren't already going to harden one of the + // address registers, queue it up to be hardened post-load. Notably, even + // once hardened this won't introduce a useful dependency that could prune + // out subsequent loads. + if (EnablePostLoadHardening && isDataInvariantLoad(MI) && + !HardenedAddrRegs.count(BaseReg) && + !HardenedAddrRegs.count(IndexReg)) { + HardenPostLoad.insert(&MI); + HardenedAddrRegs.insert(MI.getOperand(0).getReg()); + continue; + } + + // Record this instruction for address hardening and record its register + // operands as being address-hardened. + HardenLoadAddr.insert(&MI); + if (BaseReg) + HardenedAddrRegs.insert(BaseReg); + if (IndexReg) + HardenedAddrRegs.insert(IndexReg); + + for (MachineOperand &Def : MI.defs()) + if (Def.isReg()) + LoadDepRegs.set(Def.getReg()); + } + + // Now re-walk the instructions in the basic block, and apply whichever + // hardening strategy we have elected. Note that we do this in a second + // pass specifically so that we have the complete set of instructions for + // which we will do post-load hardening and can defer it in certain + // circumstances. + // + // FIXME: This could probably be made even more effective by doing it + // across the entire function. Rather than just walking the flat list + // backwards here, we could walk the function in PO and each block bottom + // up, allowing us to in some cases sink hardening across block blocks. As + // long as the in-block predicate state is used at the eventual hardening + // site, this remains safe. + for (MachineInstr &MI : MBB) { + // We cannot both require hardening the def of a load and its address. + assert(!(HardenLoadAddr.count(&MI) && HardenPostLoad.count(&MI)) && + "Requested to harden both the address and def of a load!"); + + // Check if this is a load whose address needs to be hardened. 
+ if (HardenLoadAddr.erase(&MI)) { + const MCInstrDesc &Desc = MI.getDesc(); + int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags); + assert(MemRefBeginIdx >= 0 && "Cannot have an invalid index here!"); + + MemRefBeginIdx += X86II::getOperandBias(Desc); + + MachineOperand &BaseMO = + MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg); + MachineOperand &IndexMO = + MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg); + hardenLoadAddr(MI, BaseMO, IndexMO, PredStateSSA, AddrRegToHardenedReg); + continue; + } + + // Test if this instruction is one of our post load instructions (and + // remove it from the set if so). + if (HardenPostLoad.erase(&MI)) { + assert(!MI.isCall() && "Must not try to post-load harden a call!"); + + // If this is a data-invariant load, we want to try and sink any + // hardening as far as possible. + if (isDataInvariantLoad(MI)) { + // Sink the instruction we'll need to harden as far as we can down the + // graph. + MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad); + + // If we managed to sink this instruction, update everything so we + // harden that instruction when we reach it in the instruction + // sequence. + if (SunkMI != &MI) { + // If in sinking there was no instruction needing to be hardened, + // we're done. + if (!SunkMI) + continue; + + // Otherwise, add this to the set of defs we harden. + HardenPostLoad.insert(SunkMI); + continue; + } + } + + // The register def'ed by this instruction is trivially hardened so map + // it to itself. + AddrRegToHardenedReg[MI.getOperand(0).getReg()] = + MI.getOperand(0).getReg(); + + hardenPostLoad(MI, PredStateSSA); + continue; + } + + // After we finish processing the instruction and doing any hardening + // necessary for it, we need to handle transferring the predicate state + // into a call and recovering it after the call returns (if it returns). + if (!MI.isCall()) + continue; + + // If we're not hardening interprocedurally, we can just skip calls. + if (!HardenInterprocedurally) + continue; + + auto InsertPt = MI.getIterator(); + DebugLoc Loc = MI.getDebugLoc(); + + // First, we transfer the predicate state into the called function by + // merging it into the stack pointer. This will kill the current def of + // the state. + unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB); + mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg); + + // If this call is also a return (because it is a tail call) we're done. + if (MI.isReturn()) + continue; + + // Otherwise we need to step past the call and recover the predicate + // state from SP after the return, and make this new state available. + ++InsertPt; + unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc); + PredStateSSA.AddAvailableValue(&MBB, NewStateReg); + } + + HardenPostLoad.clear(); + HardenLoadAddr.clear(); + HardenedAddrRegs.clear(); + AddrRegToHardenedReg.clear(); + + // Currently, we only track data-dependent loads within a basic block. + // FIXME: We should see if this is necessary or if we could be more + // aggressive here without opening up attack avenues. + LoadDepRegs.clear(); + } +} + +/// Save EFLAGS into the returned GPR. This can in turn be restored with +/// `restoreEFLAGS`. +/// +/// Note that LLVM can only lower very simple patterns of saved and restored +/// EFLAGS registers. The restore should always be within the same basic block +/// as the save so that no PHI nodes are inserted. 
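A sketch of the intended save/restore pairing, with both calls kept in one basic block as required above (InsertPt and Loc are assumed to be an insertion point and DebugLoc already in scope, as in hardenLoadAddr() below):

  unsigned SavedFlags = saveEFLAGS(MBB, InsertPt, Loc); // EFLAGS -> virtual GR32 copy
  // ... emit EFLAGS-clobbering hardening instructions here ...
  restoreEFLAGS(MBB, InsertPt, Loc, SavedFlags);        // copy back into EFLAGS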
+unsigned X86SpeculativeLoadHardeningPass::saveEFLAGS( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, + DebugLoc Loc) { + // FIXME: Hard coding this to a 32-bit register class seems weird, but matches + // what instruction selection does. + unsigned Reg = MRI->createVirtualRegister(&X86::GR32RegClass); + // We directly copy the FLAGS register and rely on later lowering to clean + // this up into the appropriate setCC instructions. + BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), Reg).addReg(X86::EFLAGS); + ++NumInstsInserted; + return Reg; +} + +/// Restore EFLAGS from the provided GPR. This should be produced by +/// `saveEFLAGS`. +/// +/// This must be done within the same basic block as the save in order to +/// reliably lower. +void X86SpeculativeLoadHardeningPass::restoreEFLAGS( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, + unsigned Reg) { + BuildMI(MBB, InsertPt, Loc, TII->get(X86::COPY), X86::EFLAGS).addReg(Reg); + ++NumInstsInserted; +} + +/// Takes the current predicate state (in a register) and merges it into the +/// stack pointer. The state is essentially a single bit, but we merge this in +/// a way that won't form non-canonical pointers and also will be preserved +/// across normal stack adjustments. +void X86SpeculativeLoadHardeningPass::mergePredStateIntoSP( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc, + unsigned PredStateReg) { + unsigned TmpReg = MRI->createVirtualRegister(PredStateRC); + // FIXME: This hard codes a shift distance based on the number of bits needed + // to stay canonical on 64-bit. We should compute this somehow and support + // 32-bit as part of that. + auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHL64ri), TmpReg) + .addReg(PredStateReg, RegState::Kill) + .addImm(47); + ShiftI->addRegisterDead(X86::EFLAGS, TRI); + ++NumInstsInserted; + auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), X86::RSP) + .addReg(X86::RSP) + .addReg(TmpReg, RegState::Kill); + OrI->addRegisterDead(X86::EFLAGS, TRI); + ++NumInstsInserted; +} + +/// Extracts the predicate state stored in the high bits of the stack pointer. +unsigned X86SpeculativeLoadHardeningPass::extractPredStateFromSP( + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, + DebugLoc Loc) { + unsigned PredStateReg = MRI->createVirtualRegister(PredStateRC); + unsigned TmpReg = MRI->createVirtualRegister(PredStateRC); + + // We know that the stack pointer will have any preserved predicate state in + // its high bit. We just want to smear this across the other bits. Turns out, + // this is exactly what an arithmetic right shift does. + BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), TmpReg) + .addReg(X86::RSP); + auto ShiftI = + BuildMI(MBB, InsertPt, Loc, TII->get(X86::SAR64ri), PredStateReg) + .addReg(TmpReg, RegState::Kill) + .addImm(TRI->getRegSizeInBits(*PredStateRC) - 1); + ShiftI->addRegisterDead(X86::EFLAGS, TRI); + ++NumInstsInserted; + + return PredStateReg; +} + +void X86SpeculativeLoadHardeningPass::hardenLoadAddr( + MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO, + MachineSSAUpdater &PredStateSSA, + SmallDenseMap &AddrRegToHardenedReg) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc Loc = MI.getDebugLoc(); + + // Check if EFLAGS are alive by seeing if there is a def of them or they + // live-in, and then seeing if that def is in turn used. 
+ bool EFLAGSLive = isEFLAGSLive(MBB, MI.getIterator(), *TRI); + + SmallVector HardenOpRegs; + + if (BaseMO.isFI()) { + // A frame index is never a dynamically controllable load, so only + // harden it if we're covering fixed address loads as well. + LLVM_DEBUG( + dbgs() << " Skipping hardening base of explicit stack frame load: "; + MI.dump(); dbgs() << "\n"); + } else if (BaseMO.getReg() == X86::RIP || + BaseMO.getReg() == X86::NoRegister) { + // For both RIP-relative addressed loads or absolute loads, we cannot + // meaningfully harden them because the address being loaded has no + // dynamic component. + // + // FIXME: When using a segment base (like TLS does) we end up with the + // dynamic address being the base plus -1 because we can't mutate the + // segment register here. This allows the signed 32-bit offset to point at + // valid segment-relative addresses and load them successfully. + LLVM_DEBUG( + dbgs() << " Cannot harden base of " + << (BaseMO.getReg() == X86::RIP ? "RIP-relative" : "no-base") + << " address in a load!"); + } else { + assert(BaseMO.isReg() && + "Only allowed to have a frame index or register base."); + HardenOpRegs.push_back(&BaseMO); + } + + if (IndexMO.getReg() != X86::NoRegister && + (HardenOpRegs.empty() || + HardenOpRegs.front()->getReg() != IndexMO.getReg())) + HardenOpRegs.push_back(&IndexMO); + + assert((HardenOpRegs.size() == 1 || HardenOpRegs.size() == 2) && + "Should have exactly one or two registers to harden!"); + assert((HardenOpRegs.size() == 1 || + HardenOpRegs[0]->getReg() != HardenOpRegs[1]->getReg()) && + "Should not have two of the same registers!"); + + // Remove any registers that have alreaded been checked. + llvm::erase_if(HardenOpRegs, [&](MachineOperand *Op) { + // See if this operand's register has already been checked. + auto It = AddrRegToHardenedReg.find(Op->getReg()); + if (It == AddrRegToHardenedReg.end()) + // Not checked, so retain this one. + return false; + + // Otherwise, we can directly update this operand and remove it. + Op->setReg(It->second); + return true; + }); + // If there are none left, we're done. + if (HardenOpRegs.empty()) + return; + + // Compute the current predicate state. + unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB); + + auto InsertPt = MI.getIterator(); + + // If EFLAGS are live and we don't have access to instructions that avoid + // clobbering EFLAGS we need to save and restore them. This in turn makes + // the EFLAGS no longer live. + unsigned FlagsReg = 0; + if (EFLAGSLive && !Subtarget->hasBMI2()) { + EFLAGSLive = false; + FlagsReg = saveEFLAGS(MBB, InsertPt, Loc); + } + + for (MachineOperand *Op : HardenOpRegs) { + auto *OpRC = MRI->getRegClass(Op->getReg()); + + unsigned OpReg = Op->getReg(); + unsigned TmpReg = MRI->createVirtualRegister(OpRC); + + if (!EFLAGSLive) { + // Merge our potential poison state into the value with an or. + auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::OR64rr), TmpReg) + .addReg(StateReg) + .addReg(OpReg); + OrI->addRegisterDead(X86::EFLAGS, TRI); + ++NumInstsInserted; + LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n"); + } else { + // We need to avoid touching EFLAGS so shift out all but the least + // significant bit using the instruction that doesn't update flags. 
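The two address-hardening forms used here (OR64rr when EFLAGS are dead, SHRX64rr when they must be preserved) behave identically at the value level; a plain C++ illustration, not the emitted MIR:

  #include <cstdint>

  uint64_t hardenWithOr(uint64_t Addr, uint64_t State) {
    return Addr | State;         // OR64rr path: clobbers EFLAGS, poisons every address bit
  }

  uint64_t hardenWithShrx(uint64_t Addr, uint64_t State) {
    return Addr >> (State & 63); // SHRX64rr path: flag-free, shift count is 0 or 63
  }

With a zero state both are no-ops; with the all-ones poison the OR produces an all-ones, non-canonical address and the SHRX discards all but the original top bit.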
+ auto ShiftI = BuildMI(MBB, InsertPt, Loc, TII->get(X86::SHRX64rr), TmpReg) + .addReg(OpReg) + .addReg(StateReg); + (void)ShiftI; + ++NumInstsInserted; + LLVM_DEBUG(dbgs() << " Inserting shrx: "; ShiftI->dump(); + dbgs() << "\n"); + } + + // Record this register as checked and update the operand. + assert(!AddrRegToHardenedReg.count(Op->getReg()) && + "Should not have checked this register yet!"); + AddrRegToHardenedReg[Op->getReg()] = TmpReg; + Op->setReg(TmpReg); + ++NumAddrRegsHardened; + } + + // And restore the flags if needed. + if (FlagsReg) + restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg); +} + +MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst( + MachineInstr &InitialMI, SmallPtrSetImpl &HardenedLoads) { + assert(isDataInvariantLoad(InitialMI) && + "Cannot get here with a non-invariant load!"); + + // See if we can sink hardening the loaded value. + auto SinkCheckToSingleUse = + [&](MachineInstr &MI) -> Optional { + unsigned DefReg = MI.getOperand(0).getReg(); + + // We need to find a single use which we can sink the check. We can + // primarily do this because many uses may already end up checked on their + // own. + MachineInstr *SingleUseMI = nullptr; + for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) { + // If we're already going to harden this use, it is data invariant and + // within our block and we just need to check that the use isn't in an + // address. + if (HardenedLoads.count(&UseMI)) { + const MCInstrDesc &Desc = UseMI.getDesc(); + int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags); + assert(MemRefBeginIdx >= 0 && + "Should always have mem references here!"); + MemRefBeginIdx += X86II::getOperandBias(Desc); + + MachineOperand &BaseMO = + UseMI.getOperand(MemRefBeginIdx + X86::AddrBaseReg); + MachineOperand &IndexMO = + UseMI.getOperand(MemRefBeginIdx + X86::AddrIndexReg); + if ((BaseMO.isReg() && BaseMO.getReg() == DefReg) || + (IndexMO.isReg() && IndexMO.getReg() == DefReg)) + // The load uses the register as part of its address making it not + // invariant. + return {}; + + continue; + } + + if (SingleUseMI) + // We already have a single use, this would make two. Bail. + return {}; + + // If this single use isn't data invariant, isn't in this block, or has + // interfering EFLAGS, we can't sink the hardening to it. + if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent()) + return {}; + + // If this instruction defines multiple registers bail as we won't harden + // all of them. + if (UseMI.getDesc().getNumDefs() > 1) + return {}; + + // If this register isn't a virtual register we can't walk uses of sanely, + // just bail. Also check that its register class is one of the ones we + // can harden. + unsigned UseDefReg = UseMI.getOperand(0).getReg(); + if (!TRI->isVirtualRegister(UseDefReg) || + !MRI->getRegClass(UseDefReg)->hasSubClassEq(&X86::GR64RegClass)) + return {}; + + SingleUseMI = &UseMI; + } + + // If SingleUseMI is still null, there is no use that needs its own + // checking. Otherwise, it is the single use that needs checking. + return {SingleUseMI}; + }; + + MachineInstr *MI = &InitialMI; + while (Optional SingleUse = SinkCheckToSingleUse(*MI)) { + // Update which MI we're checking now. + MI = *SingleUse; + if (!MI) + break; + } + + return MI; +} + +// We can harden non-leaking loads into register without touching the address +// by just hiding all of the loaded bits. We use an `or` instruction to do +// this because having the poison value be all ones allows us to use the same +// value below. 
And the goal is just for the loaded bits to not be exposed to +// execution and coercing them to one is sufficient. +void X86SpeculativeLoadHardeningPass::hardenPostLoad( + MachineInstr &MI, MachineSSAUpdater &PredStateSSA) { + assert(isDataInvariantLoad(MI) && + "Cannot get here with a non-invariant load!"); + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc Loc = MI.getDebugLoc(); + + // For all of these, the def'ed register operand is operand zero. + auto &DefOp = MI.getOperand(0); + unsigned OldDefReg = DefOp.getReg(); + + auto *DefRC = MRI->getRegClass(OldDefReg); + int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8; + + unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr}; + unsigned OrOpCode = OrOpCodes[Log2_32(DefRegBytes)]; + + unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit}; + + auto GetStateRegInRC = [&](const TargetRegisterClass &RC) { + unsigned StateReg = PredStateSSA.GetValueAtEndOfBlock(&MBB); + + int Bytes = TRI->getRegSizeInBits(RC) / 8; + // FIXME: Need to teach this about 32-bit mode. + if (Bytes != 8) { + unsigned SubRegImm = SubRegImms[Log2_32(Bytes)]; + unsigned NarrowStateReg = MRI->createVirtualRegister(&RC); + BuildMI(MBB, MI.getIterator(), Loc, TII->get(TargetOpcode::COPY), + NarrowStateReg) + .addReg(StateReg, 0, SubRegImm); + StateReg = NarrowStateReg; + } + return StateReg; + }; + + auto InsertPt = std::next(MI.getIterator()); + unsigned FlagsReg = 0; + bool EFLAGSLive = isEFLAGSLive(MBB, InsertPt, *TRI); + if (EFLAGSLive && !Subtarget->hasBMI2()) { + FlagsReg = saveEFLAGS(MBB, InsertPt, Loc); + EFLAGSLive = false; + } + + if (!EFLAGSLive) { + unsigned StateReg = GetStateRegInRC(*DefRC); + unsigned NewDefReg = MRI->createVirtualRegister(DefRC); + DefOp.setReg(NewDefReg); + auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), OldDefReg) + .addReg(StateReg) + .addReg(NewDefReg); + OrI->addRegisterDead(X86::EFLAGS, TRI); + ++NumInstsInserted; + LLVM_DEBUG(dbgs() << " Inserting or: "; OrI->dump(); dbgs() << "\n"); + } else { + assert(Subtarget->hasBMI2() && + "Cannot harden loads and preserve EFLAGS without BMI2!"); + + unsigned ShiftOpCode = DefRegBytes < 4 ? X86::SHRX32rr : X86::SHRX64rr; + auto &ShiftRC = + DefRegBytes < 4 ? X86::GR32_NOSPRegClass : X86::GR64_NOSPRegClass; + int ShiftRegBytes = TRI->getRegSizeInBits(ShiftRC) / 8; + unsigned DefSubRegImm = SubRegImms[Log2_32(DefRegBytes)]; + + unsigned StateReg = GetStateRegInRC(ShiftRC); + + // First have the def instruction def a temporary register. + unsigned TmpReg = MRI->createVirtualRegister(DefRC); + DefOp.setReg(TmpReg); + // Now copy it into a register of the shift RC. + unsigned ShiftInputReg = TmpReg; + if (DefRegBytes != ShiftRegBytes) { + unsigned UndefReg = MRI->createVirtualRegister(&ShiftRC); + BuildMI(MBB, InsertPt, Loc, TII->get(X86::IMPLICIT_DEF), UndefReg); + ShiftInputReg = MRI->createVirtualRegister(&ShiftRC); + BuildMI(MBB, InsertPt, Loc, TII->get(X86::INSERT_SUBREG), ShiftInputReg) + .addReg(UndefReg) + .addReg(TmpReg) + .addImm(DefSubRegImm); + } + + // We shift this once if the shift is wider than the def and thus we can + // shift *all* of the def'ed bytes out. Otherwise we need to do two shifts. 
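+    // The shift count comes from the predicate state: zero on the correct
+    // path (a no-op shift) and all ones under misspeculation, where SHRX
+    // masks it down to the register width minus one. That single shift
+    // clears any value narrower than the shift register, but a full-width
+    // value keeps its top bit, which is why a second shift is needed below
+    // when DefRegBytes == ShiftRegBytes.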
+ + unsigned ShiftedReg = MRI->createVirtualRegister(&ShiftRC); + auto Shift1I = + BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), ShiftedReg) + .addReg(ShiftInputReg) + .addReg(StateReg); + (void)Shift1I; + ++NumInstsInserted; + LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift1I->dump(); dbgs() << "\n"); + + // The only way we have a bit left is if all 8 bytes were defined. Do an + // extra shift to get the last bit in this case. + if (DefRegBytes == ShiftRegBytes) { + // We can just directly def the old def register as its the same size. + ShiftInputReg = ShiftedReg; + auto Shift2I = + BuildMI(MBB, InsertPt, Loc, TII->get(ShiftOpCode), OldDefReg) + .addReg(ShiftInputReg) + .addReg(StateReg); + (void)Shift2I; + ++NumInstsInserted; + LLVM_DEBUG(dbgs() << " Inserting shrx: "; Shift2I->dump(); + dbgs() << "\n"); + } else { + // When we have different size shift register we need to fix up the + // class. We can do that as we copy into the old def register. + BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), OldDefReg) + .addReg(ShiftedReg, 0, DefSubRegImm); + } + } + + if (FlagsReg) + restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg); + + ++NumPostLoadRegsHardened; +} + +void X86SpeculativeLoadHardeningPass::checkReturnInstr( + MachineInstr &MI, MachineSSAUpdater &PredStateSSA) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc Loc = MI.getDebugLoc(); + auto InsertPt = MI.getIterator(); + + if (FenceCallAndRet) { + // Simply forcibly block speculation of loads out of the function by using + // an LFENCE. This is potentially a heavy-weight mitigation strategy, but + // should be secure, is simple from an ABI perspective, and the cost can be + // minimized through inlining. + // + // FIXME: We should investigate ways to establish a strong data-dependency + // on the return. However, poisoning the stack pointer is unlikely to work + // because the return is *predicted* rather than relying on the load of the + // return address to actually resolve. + BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE)); + ++NumInstsInserted; + ++NumLFENCEsInserted; + return; + } + + // Take our predicate state, shift it to the high 17 bits (so that we keep + // pointers canonical) and merge it into RSP. This will allow the caller to + // extract it when we return (speculatively). 
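+  // As the test output below shows, this becomes a `shlq $47` of the state
+  // followed by an `orq` into RSP: a zero state leaves RSP unchanged, while
+  // an all-ones state sets bits 63:47, which is still a canonical pointer.
+  // The caller then rebuilds its state with `movq %rsp, %reg` and
+  // `sarq $63, %reg`, the pattern visible after every call in the tests.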
+ mergePredStateIntoSP(MBB, InsertPt, Loc, + PredStateSSA.GetValueAtEndOfBlock(&MBB)); +} + +INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE, + "X86 speculative load hardener", false, false) +INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE, + "X86 speculative load hardener", false, false) + +FunctionPass *llvm::createX86SpeculativeLoadHardeningPass() { + return new X86SpeculativeLoadHardeningPass(); +} Index: llvm/trunk/lib/Target/X86/X86TargetMachine.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86TargetMachine.cpp +++ llvm/trunk/lib/Target/X86/X86TargetMachine.cpp @@ -54,6 +54,10 @@ cl::desc("Enable the machine combiner pass"), cl::init(true), cl::Hidden); +static cl::opt EnableSpeculativeLoadHardening( + "x86-speculative-load-hardening", + cl::desc("Enable speculative load hardening"), cl::init(false), cl::Hidden); + namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); @@ -463,6 +467,9 @@ addPass(createX86AvoidStoreForwardingBlocks()); } + if (EnableSpeculativeLoadHardening) + addPass(createX86SpeculativeLoadHardeningPass()); + addPass(createX86FlagsCopyLoweringPass()); addPass(createX86WinAllocaExpander()); } Index: llvm/trunk/test/CodeGen/X86/speculative-load-hardening.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/speculative-load-hardening.ll +++ llvm/trunk/test/CodeGen/X86/speculative-load-hardening.ll @@ -0,0 +1,571 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -x86-speculative-load-hardening-lfence | FileCheck %s --check-prefix=X64-LFENCE +; +; FIXME: Add support for 32-bit and other EH ABIs. 
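+;
+; In the X64 checks below, %rax (or another scratch register) holds the
+; predicate state: it is recomputed from the stack pointer on entry and after
+; every call (movq %rsp, %rax; sarq $63, %rax), updated with cmov on each
+; conditional edge, OR-ed into loaded pointers and indices before they are
+; dereferenced, and merged back into %rsp (shlq $47; orq) before calls and
+; returns. The X64-LFENCE checks show the simpler lfence-per-edge strategy
+; selected by -x86-speculative-load-hardening-lfence.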
+ +declare void @leak(i32 %v1, i32 %v2) + +declare void @sink(i32) + +define void @test_basic_conditions(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr2, i32** %ptr3) nounwind { +; X64-LABEL: test_basic_conditions: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %r15 +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: movq $-1, %rbx +; X64-NEXT: sarq $63, %rax +; X64-NEXT: testl %edi, %edi +; X64-NEXT: jne .LBB0_1 +; X64-NEXT: # %bb.2: # %then1 +; X64-NEXT: cmovneq %rbx, %rax +; X64-NEXT: testl %esi, %esi +; X64-NEXT: je .LBB0_4 +; X64-NEXT: .LBB0_1: +; X64-NEXT: cmoveq %rbx, %rax +; X64-NEXT: .LBB0_8: # %exit +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r14 +; X64-NEXT: popq %r15 +; X64-NEXT: retq +; X64-NEXT: .LBB0_4: # %then2 +; X64-NEXT: movq %r8, %r15 +; X64-NEXT: cmovneq %rbx, %rax +; X64-NEXT: testl %edx, %edx +; X64-NEXT: je .LBB0_6 +; X64-NEXT: # %bb.5: # %else3 +; X64-NEXT: cmoveq %rbx, %rax +; X64-NEXT: movslq (%r9), %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: leaq (%r15,%rcx,4), %r14 +; X64-NEXT: movl %ecx, (%r15,%rcx,4) +; X64-NEXT: jmp .LBB0_7 +; X64-NEXT: .LBB0_6: # %then3 +; X64-NEXT: cmovneq %rbx, %rax +; X64-NEXT: movl (%rcx), %ecx +; X64-NEXT: addl (%r15), %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: movslq %ecx, %rdi +; X64-NEXT: movl (%r15,%rdi,4), %esi +; X64-NEXT: orl %eax, %esi +; X64-NEXT: movq (%r9), %r14 +; X64-NEXT: orq %rax, %r14 +; X64-NEXT: addl (%r14), %esi +; X64-NEXT: shlq $47, %rax +; X64-NEXT: # kill: def $edi killed $edi killed $rdi +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: callq leak +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: .LBB0_7: # %merge +; X64-NEXT: movslq (%r14), %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movl $0, (%r15,%rcx,4) +; X64-NEXT: jmp .LBB0_8 +; +; X64-LFENCE-LABEL: test_basic_conditions: +; X64-LFENCE: # %bb.0: # %entry +; X64-LFENCE-NEXT: testl %edi, %edi +; X64-LFENCE-NEXT: jne .LBB0_6 +; X64-LFENCE-NEXT: # %bb.1: # %then1 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: testl %esi, %esi +; X64-LFENCE-NEXT: je .LBB0_2 +; X64-LFENCE-NEXT: .LBB0_6: # %exit +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: retq +; X64-LFENCE-NEXT: .LBB0_2: # %then2 +; X64-LFENCE-NEXT: pushq %r14 +; X64-LFENCE-NEXT: pushq %rbx +; X64-LFENCE-NEXT: pushq %rax +; X64-LFENCE-NEXT: movq %r8, %rbx +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: testl %edx, %edx +; X64-LFENCE-NEXT: je .LBB0_3 +; X64-LFENCE-NEXT: # %bb.4: # %else3 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: movslq (%r9), %rax +; X64-LFENCE-NEXT: leaq (%rbx,%rax,4), %r14 +; X64-LFENCE-NEXT: movl %eax, (%rbx,%rax,4) +; X64-LFENCE-NEXT: jmp .LBB0_5 +; X64-LFENCE-NEXT: .LBB0_3: # %then3 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: movl (%rcx), %eax +; X64-LFENCE-NEXT: addl (%rbx), %eax +; X64-LFENCE-NEXT: movslq %eax, %rdi +; X64-LFENCE-NEXT: movl (%rbx,%rdi,4), %esi +; X64-LFENCE-NEXT: movq (%r9), %r14 +; X64-LFENCE-NEXT: addl (%r14), %esi +; X64-LFENCE-NEXT: # kill: def $edi killed $edi killed $rdi +; X64-LFENCE-NEXT: callq leak +; X64-LFENCE-NEXT: .LBB0_5: # %merge +; X64-LFENCE-NEXT: movslq (%r14), %rax +; X64-LFENCE-NEXT: movl $0, (%rbx,%rax,4) +; X64-LFENCE-NEXT: addq $8, %rsp +; X64-LFENCE-NEXT: popq %rbx +; X64-LFENCE-NEXT: popq %r14 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: retq +entry: + %a.cmp = icmp eq i32 %a, 0 + br i1 %a.cmp, label %then1, label %exit + +then1: + %b.cmp = icmp eq i32 %b, 0 + br i1 %b.cmp, label %then2, label %exit + +then2: + %c.cmp = icmp eq i32 
%c, 0 + br i1 %c.cmp, label %then3, label %else3 + +then3: + %secret1 = load i32, i32* %ptr1 + %secret2 = load i32, i32* %ptr2 + %secret.sum1 = add i32 %secret1, %secret2 + %ptr2.idx = getelementptr i32, i32* %ptr2, i32 %secret.sum1 + %secret3 = load i32, i32* %ptr2.idx + %secret4 = load i32*, i32** %ptr3 + %secret5 = load i32, i32* %secret4 + %secret.sum2 = add i32 %secret3, %secret5 + call void @leak(i32 %secret.sum1, i32 %secret.sum2) + br label %merge + +else3: + %secret6 = load i32*, i32** %ptr3 + %cast = ptrtoint i32* %secret6 to i32 + %ptr2.idx2 = getelementptr i32, i32* %ptr2, i32 %cast + store i32 %cast, i32* %ptr2.idx2 + br label %merge + +merge: + %phi = phi i32* [ %secret4, %then3 ], [ %ptr2.idx2, %else3 ] + %secret7 = load i32, i32* %phi + %ptr2.idx3 = getelementptr i32, i32* %ptr2, i32 %secret7 + store i32 0, i32* %ptr2.idx3 + br label %exit + +exit: + ret void +} + +define void @test_basic_loop(i32 %a, i32 %b, i32* %ptr1, i32* %ptr2) nounwind { +; X64-LABEL: test_basic_loop: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rbp +; X64-NEXT: pushq %r15 +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %r12 +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: movq $-1, %r15 +; X64-NEXT: sarq $63, %rax +; X64-NEXT: testl %edi, %edi +; X64-NEXT: je .LBB1_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: cmoveq %r15, %rax +; X64-NEXT: jmp .LBB1_5 +; X64-NEXT: .LBB1_2: # %l.header.preheader +; X64-NEXT: movq %rcx, %r14 +; X64-NEXT: movq %rdx, %r12 +; X64-NEXT: movl %esi, %ebp +; X64-NEXT: cmovneq %r15, %rax +; X64-NEXT: xorl %ebx, %ebx +; X64-NEXT: jmp .LBB1_3 +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: .LBB1_6: # in Loop: Header=BB1_3 Depth=1 +; X64-NEXT: cmovgeq %r15, %rax +; X64-NEXT: .LBB1_3: # %l.header +; X64-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NEXT: movslq (%r12), %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: orq %r14, %rdx +; X64-NEXT: movl (%rdx,%rcx,4), %edi +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: callq sink +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: incl %ebx +; X64-NEXT: cmpl %ebp, %ebx +; X64-NEXT: jl .LBB1_6 +; X64-NEXT: # %bb.4: +; X64-NEXT: cmovlq %r15, %rax +; X64-NEXT: .LBB1_5: # %exit +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r12 +; X64-NEXT: popq %r14 +; X64-NEXT: popq %r15 +; X64-NEXT: popq %rbp +; X64-NEXT: retq +; +; X64-LFENCE-LABEL: test_basic_loop: +; X64-LFENCE: # %bb.0: # %entry +; X64-LFENCE-NEXT: pushq %rbp +; X64-LFENCE-NEXT: pushq %r15 +; X64-LFENCE-NEXT: pushq %r14 +; X64-LFENCE-NEXT: pushq %rbx +; X64-LFENCE-NEXT: pushq %rax +; X64-LFENCE-NEXT: testl %edi, %edi +; X64-LFENCE-NEXT: jne .LBB1_3 +; X64-LFENCE-NEXT: # %bb.1: # %l.header.preheader +; X64-LFENCE-NEXT: movq %rcx, %r14 +; X64-LFENCE-NEXT: movq %rdx, %r15 +; X64-LFENCE-NEXT: movl %esi, %ebp +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: xorl %ebx, %ebx +; X64-LFENCE-NEXT: .p2align 4, 0x90 +; X64-LFENCE-NEXT: .LBB1_2: # %l.header +; X64-LFENCE-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: movslq (%r15), %rax +; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi +; X64-LFENCE-NEXT: callq sink +; X64-LFENCE-NEXT: incl %ebx +; X64-LFENCE-NEXT: cmpl %ebp, %ebx +; X64-LFENCE-NEXT: jl .LBB1_2 +; X64-LFENCE-NEXT: .LBB1_3: # %exit +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: addq $8, %rsp +; X64-LFENCE-NEXT: popq %rbx +; X64-LFENCE-NEXT: popq %r14 +; X64-LFENCE-NEXT: popq %r15 +; X64-LFENCE-NEXT: popq %rbp +; 
X64-LFENCE-NEXT: retq +entry: + %a.cmp = icmp eq i32 %a, 0 + br i1 %a.cmp, label %l.header, label %exit + +l.header: + %i = phi i32 [ 0, %entry ], [ %i.next, %l.header ] + %secret = load i32, i32* %ptr1 + %ptr2.idx = getelementptr i32, i32* %ptr2, i32 %secret + %leak = load i32, i32* %ptr2.idx + call void @sink(i32 %leak) + %i.next = add i32 %i, 1 + %i.cmp = icmp slt i32 %i.next, %b + br i1 %i.cmp, label %l.header, label %exit + +exit: + ret void +} + +define void @test_basic_nested_loop(i32 %a, i32 %b, i32 %c, i32* %ptr1, i32* %ptr2) nounwind { +; X64-LABEL: test_basic_nested_loop: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rbp +; X64-NEXT: pushq %r15 +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %r13 +; X64-NEXT: pushq %r12 +; X64-NEXT: pushq %rbx +; X64-NEXT: pushq %rax +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: movq $-1, %r12 +; X64-NEXT: sarq $63, %rax +; X64-NEXT: testl %edi, %edi +; X64-NEXT: je .LBB2_2 +; X64-NEXT: # %bb.1: +; X64-NEXT: cmoveq %r12, %rax +; X64-NEXT: jmp .LBB2_10 +; X64-NEXT: .LBB2_2: # %l1.header.preheader +; X64-NEXT: movq %r8, %r14 +; X64-NEXT: movq %rcx, %rbx +; X64-NEXT: movl %edx, %ebp +; X64-NEXT: movl %esi, %r15d +; X64-NEXT: cmovneq %r12, %rax +; X64-NEXT: xorl %r13d, %r13d +; X64-NEXT: movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; X64-NEXT: testl %r15d, %r15d +; X64-NEXT: jg .LBB2_5 +; X64-NEXT: jmp .LBB2_4 +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: .LBB2_12: +; X64-NEXT: cmovgeq %r12, %rax +; X64-NEXT: testl %r15d, %r15d +; X64-NEXT: jle .LBB2_4 +; X64-NEXT: .LBB2_5: # %l2.header.preheader +; X64-NEXT: cmovleq %r12, %rax +; X64-NEXT: xorl %r15d, %r15d +; X64-NEXT: jmp .LBB2_6 +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: .LBB2_11: # in Loop: Header=BB2_6 Depth=1 +; X64-NEXT: cmovgeq %r12, %rax +; X64-NEXT: .LBB2_6: # %l2.header +; X64-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NEXT: movslq (%rbx), %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: orq %r14, %rdx +; X64-NEXT: movl (%rdx,%rcx,4), %edi +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: callq sink +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: incl %r15d +; X64-NEXT: cmpl %ebp, %r15d +; X64-NEXT: jl .LBB2_11 +; X64-NEXT: # %bb.7: +; X64-NEXT: cmovlq %r12, %rax +; X64-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Reload +; X64-NEXT: jmp .LBB2_8 +; X64-NEXT: .p2align 4, 0x90 +; X64-NEXT: .LBB2_4: +; X64-NEXT: cmovgq %r12, %rax +; X64-NEXT: .LBB2_8: # %l1.latch +; X64-NEXT: movslq (%rbx), %rcx +; X64-NEXT: orq %rax, %rcx +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: orq %r14, %rdx +; X64-NEXT: movl (%rdx,%rcx,4), %edi +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: callq sink +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: incl %r13d +; X64-NEXT: cmpl %r15d, %r13d +; X64-NEXT: jl .LBB2_12 +; X64-NEXT: # %bb.9: +; X64-NEXT: cmovlq %r12, %rax +; X64-NEXT: .LBB2_10: # %exit +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: addq $8, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r12 +; X64-NEXT: popq %r13 +; X64-NEXT: popq %r14 +; X64-NEXT: popq %r15 +; X64-NEXT: popq %rbp +; X64-NEXT: retq +; +; X64-LFENCE-LABEL: test_basic_nested_loop: +; X64-LFENCE: # %bb.0: # %entry +; X64-LFENCE-NEXT: pushq %rbp +; X64-LFENCE-NEXT: pushq %r15 +; X64-LFENCE-NEXT: pushq %r14 +; X64-LFENCE-NEXT: pushq %r13 +; X64-LFENCE-NEXT: pushq %r12 +; X64-LFENCE-NEXT: pushq %rbx +; X64-LFENCE-NEXT: pushq %rax +; X64-LFENCE-NEXT: testl %edi, %edi +; X64-LFENCE-NEXT: jne .LBB2_6 +; 
X64-LFENCE-NEXT: # %bb.1: # %l1.header.preheader +; X64-LFENCE-NEXT: movq %r8, %r14 +; X64-LFENCE-NEXT: movq %rcx, %rbx +; X64-LFENCE-NEXT: movl %edx, %r13d +; X64-LFENCE-NEXT: movl %esi, %r15d +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: xorl %r12d, %r12d +; X64-LFENCE-NEXT: .p2align 4, 0x90 +; X64-LFENCE-NEXT: .LBB2_2: # %l1.header +; X64-LFENCE-NEXT: # =>This Loop Header: Depth=1 +; X64-LFENCE-NEXT: # Child Loop BB2_4 Depth 2 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: testl %r15d, %r15d +; X64-LFENCE-NEXT: jle .LBB2_5 +; X64-LFENCE-NEXT: # %bb.3: # %l2.header.preheader +; X64-LFENCE-NEXT: # in Loop: Header=BB2_2 Depth=1 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: xorl %ebp, %ebp +; X64-LFENCE-NEXT: .p2align 4, 0x90 +; X64-LFENCE-NEXT: .LBB2_4: # %l2.header +; X64-LFENCE-NEXT: # Parent Loop BB2_2 Depth=1 +; X64-LFENCE-NEXT: # => This Inner Loop Header: Depth=2 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: movslq (%rbx), %rax +; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi +; X64-LFENCE-NEXT: callq sink +; X64-LFENCE-NEXT: incl %ebp +; X64-LFENCE-NEXT: cmpl %r13d, %ebp +; X64-LFENCE-NEXT: jl .LBB2_4 +; X64-LFENCE-NEXT: .LBB2_5: # %l1.latch +; X64-LFENCE-NEXT: # in Loop: Header=BB2_2 Depth=1 +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: movslq (%rbx), %rax +; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi +; X64-LFENCE-NEXT: callq sink +; X64-LFENCE-NEXT: incl %r12d +; X64-LFENCE-NEXT: cmpl %r15d, %r12d +; X64-LFENCE-NEXT: jl .LBB2_2 +; X64-LFENCE-NEXT: .LBB2_6: # %exit +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: addq $8, %rsp +; X64-LFENCE-NEXT: popq %rbx +; X64-LFENCE-NEXT: popq %r12 +; X64-LFENCE-NEXT: popq %r13 +; X64-LFENCE-NEXT: popq %r14 +; X64-LFENCE-NEXT: popq %r15 +; X64-LFENCE-NEXT: popq %rbp +; X64-LFENCE-NEXT: retq +entry: + %a.cmp = icmp eq i32 %a, 0 + br i1 %a.cmp, label %l1.header, label %exit + +l1.header: + %i = phi i32 [ 0, %entry ], [ %i.next, %l1.latch ] + %b.cmp = icmp sgt i32 %b, 0 + br i1 %b.cmp, label %l2.header, label %l1.latch + +l2.header: + %j = phi i32 [ 0, %l1.header ], [ %j.next, %l2.header ] + %secret = load i32, i32* %ptr1 + %ptr2.idx = getelementptr i32, i32* %ptr2, i32 %secret + %leak = load i32, i32* %ptr2.idx + call void @sink(i32 %leak) + %j.next = add i32 %j, 1 + %j.cmp = icmp slt i32 %j.next, %c + br i1 %j.cmp, label %l2.header, label %l1.latch + +l1.latch: + %secret2 = load i32, i32* %ptr1 + %ptr2.idx2 = getelementptr i32, i32* %ptr2, i32 %secret2 + %leak2 = load i32, i32* %ptr2.idx2 + call void @sink(i32 %leak2) + %i.next = add i32 %i, 1 + %i.cmp = icmp slt i32 %i.next, %b + br i1 %i.cmp, label %l1.header, label %exit + +exit: + ret void +} + +declare i32 @__gxx_personality_v0(...) 
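+;
+; The EH test below checks that hardening composes with invoke/landingpad:
+; the predicate state is merged into %rsp before the calls to
+; __cxa_allocate_exception and __cxa_throw, and the landing pad re-derives it
+; from %rsp (movq %rsp, %rcx; sarq $63, %rcx) before hardening the loads of
+; the caught value.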
+ +declare i8* @__cxa_allocate_exception(i64) local_unnamed_addr + +declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr + +define void @test_basic_eh(i32 %a, i32* %ptr1, i32* %ptr2) nounwind personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; X64-LABEL: test_basic_eh: +; X64: # %bb.0: # %entry +; X64-NEXT: pushq %rbp +; X64-NEXT: pushq %r14 +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: movq $-1, %rcx +; X64-NEXT: sarq $63, %rax +; X64-NEXT: cmpl $41, %edi +; X64-NEXT: jg .LBB3_1 +; X64-NEXT: # %bb.2: # %thrower +; X64-NEXT: movq %rdx, %r14 +; X64-NEXT: movq %rsi, %rbx +; X64-NEXT: cmovgq %rcx, %rax +; X64-NEXT: movslq %edi, %rcx +; X64-NEXT: movl (%rsi,%rcx,4), %ebp +; X64-NEXT: orl %eax, %ebp +; X64-NEXT: movl $4, %edi +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: callq __cxa_allocate_exception +; X64-NEXT: movq %rsp, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: movl %ebp, (%rax) +; X64-NEXT: .Ltmp0: +; X64-NEXT: xorl %esi, %esi +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: shlq $47, %rcx +; X64-NEXT: movq %rax, %rdi +; X64-NEXT: orq %rcx, %rsp +; X64-NEXT: callq __cxa_throw +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: sarq $63, %rax +; X64-NEXT: .Ltmp1: +; X64-NEXT: jmp .LBB3_3 +; X64-NEXT: .LBB3_1: +; X64-NEXT: cmovleq %rcx, %rax +; X64-NEXT: .LBB3_3: # %exit +; X64-NEXT: shlq $47, %rax +; X64-NEXT: orq %rax, %rsp +; X64-NEXT: popq %rbx +; X64-NEXT: popq %r14 +; X64-NEXT: popq %rbp +; X64-NEXT: retq +; X64-NEXT: .LBB3_4: # %lpad +; X64-NEXT: .Ltmp2: +; X64-NEXT: movq %rsp, %rcx +; X64-NEXT: sarq $63, %rcx +; X64-NEXT: movl (%rax), %eax +; X64-NEXT: addl (%rbx), %eax +; X64-NEXT: orl %ecx, %eax +; X64-NEXT: cltq +; X64-NEXT: movl (%r14,%rax,4), %edi +; X64-NEXT: orl %ecx, %edi +; X64-NEXT: shlq $47, %rcx +; X64-NEXT: orq %rcx, %rsp +; X64-NEXT: callq sink +; X64-NEXT: movq %rsp, %rax +; X64-NEXT: sarq $63, %rax +; +; X64-LFENCE-LABEL: test_basic_eh: +; X64-LFENCE: # %bb.0: # %entry +; X64-LFENCE-NEXT: pushq %rbp +; X64-LFENCE-NEXT: pushq %r14 +; X64-LFENCE-NEXT: pushq %rbx +; X64-LFENCE-NEXT: cmpl $41, %edi +; X64-LFENCE-NEXT: jg .LBB3_2 +; X64-LFENCE-NEXT: # %bb.1: # %thrower +; X64-LFENCE-NEXT: movq %rdx, %r14 +; X64-LFENCE-NEXT: movq %rsi, %rbx +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: movslq %edi, %rax +; X64-LFENCE-NEXT: movl (%rsi,%rax,4), %ebp +; X64-LFENCE-NEXT: movl $4, %edi +; X64-LFENCE-NEXT: callq __cxa_allocate_exception +; X64-LFENCE-NEXT: movl %ebp, (%rax) +; X64-LFENCE-NEXT: .Ltmp0: +; X64-LFENCE-NEXT: xorl %esi, %esi +; X64-LFENCE-NEXT: xorl %edx, %edx +; X64-LFENCE-NEXT: movq %rax, %rdi +; X64-LFENCE-NEXT: callq __cxa_throw +; X64-LFENCE-NEXT: .Ltmp1: +; X64-LFENCE-NEXT: .LBB3_2: # %exit +; X64-LFENCE-NEXT: lfence +; X64-LFENCE-NEXT: popq %rbx +; X64-LFENCE-NEXT: popq %r14 +; X64-LFENCE-NEXT: popq %rbp +; X64-LFENCE-NEXT: retq +; X64-LFENCE-NEXT: .LBB3_3: # %lpad +; X64-LFENCE-NEXT: .Ltmp2: +; X64-LFENCE-NEXT: movl (%rax), %eax +; X64-LFENCE-NEXT: addl (%rbx), %eax +; X64-LFENCE-NEXT: cltq +; X64-LFENCE-NEXT: movl (%r14,%rax,4), %edi +; X64-LFENCE-NEXT: callq sink +entry: + %a.cmp = icmp slt i32 %a, 42 + br i1 %a.cmp, label %thrower, label %exit + +thrower: + %badidx = getelementptr i32, i32* %ptr1, i32 %a + %secret1 = load i32, i32* %badidx + %e.ptr = call i8* @__cxa_allocate_exception(i64 4) + %e.ptr.cast = bitcast i8* %e.ptr to i32* + store i32 %secret1, i32* %e.ptr.cast + invoke void @__cxa_throw(i8* %e.ptr, i8* null, i8* null) + to label %exit unwind label %lpad + +exit: + ret void + +lpad: + %e = 
landingpad { i8*, i32 } + catch i8* null + %e.catch.ptr = extractvalue { i8*, i32 } %e, 0 + %e.catch.ptr.cast = bitcast i8* %e.catch.ptr to i32* + %secret1.catch = load i32, i32* %e.catch.ptr.cast + %secret2 = load i32, i32* %ptr1 + %secret.sum = add i32 %secret1.catch, %secret2 + %ptr2.idx = getelementptr i32, i32* %ptr2, i32 %secret.sum + %leak = load i32, i32* %ptr2.idx + call void @sink(i32 %leak) + unreachable +}
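
To summarize the mechanism the patch implements, the following stand-alone C++ sketch (illustrative only, not part of the commit; all names are invented for the example) models the data flow the pass arranges at the machine level. The predicate state is all zeros on the architecturally correct path and all ones when executing past a mispredicted branch; OR-ing it into address registers before a load, or into the loaded value afterwards, collapses any misspeculated, attacker-influenced value to all ones so it cannot form a secret-dependent address.

#include <cstdint>
#include <cstdio>

// All zeros on the architecturally correct path, all ones when executing
// down a mispredicted edge. The pass keeps this in a GPR and carries it
// across calls and returns in the high bits of RSP.
using PredState = uint64_t;

// Address hardening: poison an attacker-influenced index or base before it
// is used in a load.
static uint64_t hardenIndex(uint64_t Index, PredState State) {
  return Index | State; // all ones under misspeculation
}

// Post-load hardening: flush the loaded bits to one so a dependent access
// cannot transmit them through a cache side channel.
static uint32_t hardenLoadedValue(uint32_t Value, PredState State) {
  return Value | static_cast<uint32_t>(State);
}

int main() {
  const uint64_t Index = 0x1234;
  const uint32_t Loaded = 0x5678;

  printf("correct path:   index=%#llx value=%#x\n",
         (unsigned long long)hardenIndex(Index, 0),
         hardenLoadedValue(Loaded, 0));
  printf("misspeculating: index=%#llx value=%#x\n",
         (unsigned long long)hardenIndex(Index, ~0ULL),
         hardenLoadedValue(Loaded, ~0ULL));
  return 0;
}

The pass performs the same merges with OR64rr, or with SHRX when EFLAGS must be preserved, and threads the state between blocks with MachineSSAUpdater and through the high bits of RSP at call and return boundaries, as exercised by the tests above.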