Index: llvm/lib/Target/X86/X86InstrFoldTables.cpp
===================================================================
--- llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -340,6 +340,8 @@
   { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD },
   { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
   { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
+  { X86::TCRETURNri, X86::TCRETURNmi, TB_FOLDED_LOAD },
+  { X86::TCRETURNri64, X86::TCRETURNmi64, TB_FOLDED_LOAD },
   { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
   { X86::TEST16rr, X86::TEST16mr, TB_FOLDED_LOAD },
   { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
Index: llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
===================================================================
--- llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -23,6 +23,7 @@
 #include "X86.h"
 #include "X86InstrBuilder.h"
 #include "X86InstrInfo.h"
+#include "X86InstrFoldTables.h"
 #include "X86Subtarget.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -70,6 +71,8 @@
           "Number of address mode used registers hardaned");
 STATISTIC(NumPostLoadRegsHardened,
           "Number of post-load register values hardened");
+STATISTIC(NumCallsOrJumpsHardened,
+          "Number of calls or jumps requiring extra hardening");
 STATISTIC(NumInstsInserted, "Number of instructions inserted");
 STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
 
@@ -105,6 +108,13 @@
              "significant security is provided."),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool> HardenIndirectCallsAndJumps(
+    PASS_KEY "-indirect",
+    cl::desc("Harden indirect calls and jumps against using speculatively "
+             "stored attacker controlled addresses. This is designed to "
+             "mitigate Spectre v1.2 style attacks."),
+    cl::init(true), cl::Hidden);
+
 namespace llvm {
 
 void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
@@ -168,6 +178,8 @@
   SmallVector<MachineInstr *, 16>
   tracePredStateThroughCFG(MachineFunction &MF, ArrayRef<BlockCondInfo> Infos);
 
+  void unfoldCallAndJumpLoads(MachineFunction &MF);
+
   void hardenAllLoads(MachineFunction &MF);
 
   unsigned saveEFLAGS(MachineBasicBlock &MBB,
@@ -193,6 +205,9 @@
   bool canHardenRegister(unsigned Reg);
   void hardenPostLoad(MachineInstr &MI);
   void hardenReturnInstr(MachineInstr &MI);
+  void hardenIndirectCallOrJumpInstr(
+      MachineInstr &MI,
+      SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
 };
 
 } // end anonymous namespace
@@ -504,6 +519,11 @@
     }
   }
 
+  // If we are going to harden calls and jumps we need to unfold their memory
+  // operands.
+  if (HardenIndirectCallsAndJumps)
+    unfoldCallAndJumpLoads(MF);
+
   // Now harden all of the loads in the function using the predicate state.
   hardenAllLoads(MF);
 
@@ -814,6 +834,112 @@
   return CMovs;
 }
 
+/// Compute the register class for the unfolded load.
+///
+/// FIXME: This should probably live in X86InstrInfo, potentially by adding
+/// a way to unfold into a newly created vreg rather than requiring a register
+/// input.
+static const TargetRegisterClass *
+getRegClassForUnfoldedLoad(MachineFunction &MF, const X86InstrInfo &TII,
+                           unsigned Opcode) {
+  const X86MemoryFoldTableEntry *I = lookupUnfoldTable(Opcode);
+  if (I == nullptr)
+    return nullptr;
+  unsigned UnfoldedOpc = I->DstOp;
+  unsigned Index = I->Flags & TB_INDEX_MASK;
+  const MCInstrDesc &MCID = TII.get(UnfoldedOpc);
+  return TII.getRegClass(MCID, Index, &TII.getRegisterInfo(), MF);
+}
+
+void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
+    MachineFunction &MF) {
+  for (MachineBasicBlock &MBB : MF)
+    for (auto MII = MBB.instr_begin(), MIE = MBB.instr_end(); MII != MIE;) {
+      // Grab a reference and increment the iterator so we can remove this
+      // instruction if needed without disturbing the iteration.
+      MachineInstr &MI = *MII++;
+
+      // Must either be a call or a branch.
+      if (!MI.isCall() && !MI.isBranch())
+        continue;
+      // We only care about loading variants of these instructions.
+      if (!MI.mayLoad())
+        continue;
+
+      switch (MI.getOpcode()) {
+      default: {
+        LLVM_DEBUG(
+            dbgs() << "ERROR: Found an unexpected loading branch or call "
+                      "instruction:\n";
+            MI.dump(); dbgs() << "\n");
+        report_fatal_error("Unexpected loading branch or call!");
+      }
+
+      case X86::FARCALL16m:
+      case X86::FARCALL32m:
+      case X86::FARCALL64:
+      case X86::FARJMP16m:
+      case X86::FARJMP32m:
+      case X86::FARJMP64:
+        // We cannot mitigate far jumps or calls, but we also don't expect them
+        // to be vulnerable to Spectre v1.2 style attacks.
+        continue;
+
+      case X86::CALL16m:
+      case X86::CALL16m_NT:
+      case X86::CALL32m:
+      case X86::CALL32m_NT:
+      case X86::CALL64m:
+      case X86::CALL64m_NT:
+      case X86::JMP16m:
+      case X86::JMP16m_NT:
+      case X86::JMP32m:
+      case X86::JMP32m_NT:
+      case X86::JMP64m:
+      case X86::JMP64m_NT:
+      case X86::TAILJMPm64:
+      case X86::TAILJMPm64_REX:
+      case X86::TAILJMPm:
+      case X86::TCRETURNmi64:
+      case X86::TCRETURNmi: {
+        // Use the generic unfold logic now that we know we're dealing with
+        // expected instructions.
+        // FIXME: We don't have test coverage for all of these!
+        auto *UnfoldedRC = getRegClassForUnfoldedLoad(MF, *TII, MI.getOpcode());
+        if (!UnfoldedRC) {
+          LLVM_DEBUG(dbgs()
+                         << "ERROR: Unable to unfold load from instruction:\n";
+                     MI.dump(); dbgs() << "\n");
+          report_fatal_error("Unable to unfold load!");
+        }
+        unsigned Reg = MRI->createVirtualRegister(UnfoldedRC);
+        SmallVector<MachineInstr *, 2> NewMIs;
+        // If we were able to compute an unfolded reg class, any failure here
+        // is just a programming error so just assert.
+        bool Unfolded =
+            TII->unfoldMemoryOperand(MF, MI, Reg, /*UnfoldLoad*/ true,
+                                     /*UnfoldStore*/ false, NewMIs);
+        (void)Unfolded;
+        assert(Unfolded &&
+               "Computed unfolded register class but failed to unfold");
+        // Now stitch the new instructions into place and erase the old one.
+        for (auto *NewMI : NewMIs)
+          MBB.insert(MI.getIterator(), NewMI);
+        MI.eraseFromParent();
+        LLVM_DEBUG({
+          dbgs() << "Unfolded load successfully into:\n";
+          for (auto *NewMI : NewMIs) {
+            NewMI->dump();
+            dbgs() << "\n";
+          }
+        });
+        continue;
+      }
+      }
+      llvm_unreachable("Escaped switch with default!");
+    }
+}
+
 /// Returns true if the instruction has no behavior (specified or otherwise)
 /// that is based on the value of any of its register operands
 ///
@@ -1439,6 +1565,14 @@
       continue;
     }
 
+    // Check for an indirect call or branch that may need its input hardened
+    // even if we couldn't find the specific load used, or were able to avoid
+    // hardening it for some reason.
+    // Note that here we cannot break out afterward, as we may still need to
+    // handle any call aspect of this instruction.
+    if ((MI.isCall() || MI.isBranch()) && HardenIndirectCallsAndJumps)
+      hardenIndirectCallOrJumpInstr(MI, AddrRegToHardenedReg);
+
     // After we finish processing the instruction and doing any hardening
     // necessary for it, we need to handle transferring the predicate state
     // into a call and recovering it after the call returns (if it returns).
@@ -2039,6 +2173,100 @@
   mergePredStateIntoSP(MBB, InsertPt, Loc, PS->SSA.GetValueAtEndOfBlock(&MBB));
 }
 
+/// An attacker may speculatively store over a value that is then speculatively
+/// loaded and used as the target of an indirect call or jump instruction. This
+/// is called Spectre v1.2 or Bounds Check Bypass Store (BCBS) and is described
+/// in this paper:
+/// https://people.csail.mit.edu/vlk/spectre11.pdf
+///
+/// When this happens, the speculative execution of the call or jump will end
+/// up being steered to this attacker controlled address. While most such
+/// loads will be adequately hardened already, we want to ensure that they are
+/// definitively treated as needing post-load hardening. While address
+/// hardening is sufficient to prevent secret data from leaking to the
+/// attacker, it may not be sufficient to prevent an attacker from steering
+/// speculative execution. We forcibly unfolded all relevant loads above and
+/// so will always have an opportunity to post-load harden here; we just need
+/// to scan for cases not already flagged and add them.
+void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
+    MachineInstr &MI,
+    SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
+  switch (MI.getOpcode()) {
+  case X86::FARCALL16m:
+  case X86::FARCALL32m:
+  case X86::FARCALL64:
+  case X86::FARJMP16m:
+  case X86::FARJMP32m:
+  case X86::FARJMP64:
+    // We don't need to harden either far calls or far jumps as they are
+    // safe from Spectre.
+    return;
+
+  default:
+    break;
+  }
+
+  // We should never see a loading instruction at this point, as those should
+  // have been unfolded.
+  assert(!MI.mayLoad() && "Found a lingering loading instruction!");
+
+  // If the first operand isn't a register, this is a branch or call
+  // instruction with an immediate operand which doesn't need to be hardened.
+  if (!MI.getOperand(0).isReg())
+    return;
+
+  // For all of these, the target register is operand zero.
+  auto &TargetOp = MI.getOperand(0);
+  unsigned OldTargetReg = TargetOp.getReg();
+  assert(canHardenRegister(OldTargetReg) &&
+         "Cannot harden this instruction's target register!");
+
+  // If we have already hardened the target register, there is nothing to do.
+  if (AddrRegToHardenedReg.count(OldTargetReg))
+    return;
+
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc Loc = MI.getDebugLoc();
+  auto InsertPt = MI.getIterator();
+
+  // FIXME: All the hardening code is essentially duplicated with the post-load
+  // hardening code. Should refactor that to be sharable here.
+  auto *TargetRC = MRI->getRegClass(OldTargetReg);
+  int TargetRegBytes = TRI->getRegSizeInBits(*TargetRC) / 8;
+
+  unsigned FlagsReg = 0;
+  if (isEFLAGSLive(MBB, InsertPt, *TRI))
+    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
+
+  unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
+  // FIXME: Need to teach this about 32-bit mode.
+  if (TargetRegBytes != 8) {
+    unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
+    unsigned SubRegImm = SubRegImms[Log2_32(TargetRegBytes)];
+    unsigned NarrowStateReg = MRI->createVirtualRegister(TargetRC);
+    BuildMI(MBB, InsertPt, Loc, TII->get(TargetOpcode::COPY), NarrowStateReg)
+        .addReg(StateReg, 0, SubRegImm);
+    StateReg = NarrowStateReg;
+  }
+
+  unsigned NewTargetReg = MRI->createVirtualRegister(TargetRC);
+  unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
+  unsigned OrOpCode = OrOpCodes[Log2_32(TargetRegBytes)];
+  auto OrI = BuildMI(MBB, InsertPt, Loc, TII->get(OrOpCode), NewTargetReg)
+                 .addReg(StateReg)
+                 .addReg(OldTargetReg);
+  OrI->addRegisterDead(X86::EFLAGS, TRI);
+  TargetOp.setReg(NewTargetReg);
+  ++NumInstsInserted;
+  LLVM_DEBUG(dbgs() << "  Inserting or: "; OrI->dump(); dbgs() << "\n");
+
+  if (FlagsReg)
+    restoreEFLAGS(MBB, InsertPt, Loc, FlagsReg);
+
+  ++NumCallsOrJumpsHardened;
+}
+
 INITIALIZE_PASS_BEGIN(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
                       "X86 speculative load hardener", false, false)
 INITIALIZE_PASS_END(X86SpeculativeLoadHardeningPass, DEBUG_TYPE,
Index: llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
===================================================================
--- llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
+++ llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
@@ -15,15 +15,20 @@
 define i32 @test_indirect_call(i32 ()** %ptr) nounwind {
 ; X64-LABEL: test_indirect_call:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    pushq %rbx
-; X64-NEXT:    movq %rsp, %rbx
-; X64-NEXT:    movq $-1, %rax
-; X64-NEXT:    sarq $63, %rbx
-; X64-NEXT:    orq %rbx, %rdi
-; X64-NEXT:    callq *(%rdi)
-; X64-NEXT:    shlq $47, %rbx
-; X64-NEXT:    orq %rbx, %rsp
-; X64-NEXT:    popq %rbx
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    shlq $47, %rax
+; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    callq *%rcx
+; X64-NEXT:    movq %rsp, %rcx
+; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    shlq $47, %rcx
+; X64-NEXT:    orq %rcx, %rsp
+; X64-NEXT:    popq %rcx
 ; X64-NEXT:    retq
 entry:
   %fp = load i32 ()*, i32 ()** %ptr
@@ -37,9 +42,11 @@
 ; X64-NEXT:    movq %rsp, %rax
 ; X64-NEXT:    movq $-1, %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    orq %rax, %rcx
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
-; X64-NEXT:    jmpq *(%rdi) # TAILCALL
+; X64-NEXT:    jmpq *%rcx # TAILCALL
 entry:
   %fp = load i32 ()*, i32 ()** %ptr
   %v = tail call i32 %fp()
@@ -53,9 +60,11 @@
 ; X64-NEXT:    movq %rsp, %rax
 ; X64-NEXT:    movq $-1, %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movq {{.*}}(%rip), %rcx
+; X64-NEXT:    orq %rax, %rcx
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
-; X64-NEXT:    callq *{{.*}}(%rip)
+; X64-NEXT:    callq *%rcx
 ; X64-NEXT:    movq %rsp, %rcx
 ; X64-NEXT:    sarq $63, %rcx
 ; X64-NEXT:    shlq $47, %rcx
@@ -74,9 +83,11 @@
 ; X64-NEXT:    movq %rsp, %rax
 ; X64-NEXT:    movq $-1, %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movq {{.*}}(%rip), %rcx
+; X64-NEXT:    orq %rax, %rcx
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
-; X64-NEXT:    jmpq *{{.*}}(%rip) # TAILCALL
+; X64-NEXT:    jmpq *%rcx # TAILCALL
 entry:
   %fp = load i32 ()*, i32 ()** @global_fnptr
   %v = tail call i32 %fp()
@@ -89,8 +100,9 @@
 ; X64-NEXT:    movq %rsp, %rcx
 ; X64-NEXT:    movq $-1, %rax
 ; X64-NEXT:    sarq $63, %rcx
-; X64-NEXT:    orq %rcx, %rdi
-; X64-NEXT:    jmpq *(%rdi)
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    orq %rcx, %rax
+; X64-NEXT:    jmpq *%rax
 ; X64-NEXT:  .LBB4_1: # %bb0
 ; X64-NEXT:    movl $2, %eax
 ; X64-NEXT:    jmp .LBB4_2
@@ -130,8 +142,9 @@
 ; X64-NEXT:    movq $-1, %rax
 ; X64-NEXT:    sarq $63, %rcx
 ; X64-NEXT:    movslq %edi, %rax
+; X64-NEXT:    movq global_blockaddrs(,%rax,8), %rax
 ; X64-NEXT:    orq %rcx, %rax
-; X64-NEXT:    jmpq *global_blockaddrs(,%rax,8)
+; X64-NEXT:    jmpq *%rax
 ; X64-NEXT:  .Ltmp0: # Block address taken
 ; X64-NEXT:  .LBB5_1: # %bb0
 ; X64-NEXT:    movl $2, %eax
@@ -182,8 +195,9 @@
 ; X64-NEXT:  # %bb.1: # %entry
 ; X64-NEXT:    cmovaq %rax, %rcx
 ; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movq .LJTI6_0(,%rax,8), %rax
 ; X64-NEXT:    orq %rcx, %rax
-; X64-NEXT:    jmpq *.LJTI6_0(,%rax,8)
+; X64-NEXT:    jmpq *%rax
 ; X64-NEXT:  .LBB6_3: # %bb1
 ; X64-NEXT:    movl $7, %eax
 ; X64-NEXT:    jmp .LBB6_4
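
Note (not part of the patch): for readers unfamiliar with the pass, the shape of the transformation exercised by the tests above can be summarized with a minimal, self-contained C++ sketch. The names below (FnPtr, PredState, call_through_slot, call_through_slot_hardened) are invented purely for illustration; the pass itself rewrites MachineInstrs and virtual registers, and the predicate state is the all-ones/zero mask the pass already maintains.

  // Conceptual C++ sketch of the unfold-then-harden rewrite, assuming a
  // hypothetical function-pointer slot and an explicit predicate-state value.
  #include <cstdint>

  using FnPtr = int (*)();

  // Before hardening: the load and the call are one folded instruction,
  // e.g. `callq *(%rdi)`, so the loaded target is never sitting in a
  // register where it could be masked.
  int call_through_slot(FnPtr *Slot) { return (*Slot)(); }

  // After hardening: the load is unfolded into its own register, and the
  // predicate state (all-ones on a mispredicted path, zero otherwise) is
  // OR-ed into that register before the `callq *%rcx`-style register call.
  // Under misspeculation the target becomes a non-canonical address, so the
  // speculative call cannot be steered to an attacker-controlled location.
  int call_through_slot_hardened(FnPtr *Slot, uintptr_t PredState) {
    uintptr_t Target = reinterpret_cast<uintptr_t>(*Slot); // unfolded load
    Target |= PredState;                                   // hardening OR
    return reinterpret_cast<FnPtr>(Target)();              // register call
  }

This mirrors the before/after assembly in the test updates: the folded `callq *(%rdi)` / `jmpq *(%rdi)` forms become an explicit `movq` of the target followed by an `orq` with the predicate state and an indirect call or jump through the register.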