Index: llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
===================================================================
--- llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -114,6 +114,13 @@
              "mitigate Spectre v1.2 style attacks."),
     cl::init(true), cl::Hidden);
 
+static cl::opt<bool> HardenReturnsNearby(
+    PASS_KEY "-returns-nearby",
+    cl::desc("Force hardening RSP near the RET instruction. Relying on an "
+             "earlier hardening of RSP is appealing but hasn't had its "
+             "effectiveness vetted."),
+    cl::init(true), cl::Hidden);
+
 namespace llvm {
 
 void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
@@ -193,10 +200,13 @@
   unsigned extractPredStateFromSP(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator InsertPt,
                                   DebugLoc Loc);
+  unsigned getCurrentPredState(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator InsertPt,
+                               DebugLoc Loc, bool &IsPredStateSSACurrent);
 
   void
   hardenLoadAddr(MachineInstr &MI, MachineOperand &BaseMO,
-                 MachineOperand &IndexMO,
+                 MachineOperand &IndexMO, bool &IsPredStateSSACurrent,
                  SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
   MachineInstr *
   sinkPostLoadHardenedInst(MachineInstr &MI,
@@ -204,11 +214,11 @@
   bool canHardenRegister(unsigned Reg);
   unsigned hardenValueInRegister(unsigned Reg, MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator InsertPt,
-                                 DebugLoc Loc);
-  unsigned hardenPostLoad(MachineInstr &MI);
-  void hardenReturnInstr(MachineInstr &MI);
+                                 DebugLoc Loc, bool &IsPredStateSSACurrent);
+  unsigned hardenPostLoad(MachineInstr &MI, bool &IsPredStateSSACurrent);
+  void hardenReturnInstr(MachineInstr &MI, bool IsPredStateInSP);
   void hardenIndirectCallOrJumpInstr(
-      MachineInstr &MI,
+      MachineInstr &MI, bool &IsPredStateSSACurrent,
       SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg);
 };
 
@@ -1492,6 +1502,26 @@
           LoadDepRegs.set(Def.getReg());
     }
 
+    // While actually hardening in this block we also trace the predicate state
+    // in and out of functions being called. However, within a basic block there
+    // will never be *local* updates to the predicate state. So once we merge
+    // the predicate state into the stack pointer, the stack pointer will
+    // always have the latest predicate state for this function. We can also
+    // delay extracting the predicate state from the stack pointer until we need
+    // it for some hardening operation. We use two flags to coordinate this. The
+    // first tracks whether we have the current predicate state in the stack
+    // pointer, and therefore no longer need to merge it. The second tracks
+    // whether the predicate state in the SSA updater is current, or if it needs
+    // to be freshly extracted from the stack pointer.
+    bool IsPredStateInSP = false;
+    bool IsPredStateSSACurrent = true;
+
+    // Lastly, we remember the last call instruction (if any) where the
+    // predicate state ceased to be current in the SSA updater. We use this to
+    // figure out where to extract the current predicate state if needed for
+    // successors of the block.
+    MachineInstr *LastCall = nullptr;
+
     // Now re-walk the instructions in the basic block, and apply whichever
     // hardening strategy we have elected. Note that we do this in a second
     // pass specifically so that we have the complete set of instructions for
@@ -1522,7 +1552,7 @@
           MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
       MachineOperand &IndexMO =
           MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
-      hardenLoadAddr(MI, BaseMO, IndexMO, AddrRegToHardenedReg);
+      hardenLoadAddr(MI, BaseMO, IndexMO, IsPredStateSSACurrent, AddrRegToHardenedReg);
       continue;
     }
 
@@ -1553,7 +1583,7 @@
         }
       }
 
-      unsigned HardenedReg = hardenPostLoad(MI);
+      unsigned HardenedReg = hardenPostLoad(MI, IsPredStateSSACurrent);
 
       // Mark the resulting hardened register as such so we don't re-harden.
       AddrRegToHardenedReg[HardenedReg] = HardenedReg;
@@ -1567,7 +1597,7 @@
       // out afterward as we may still need to handle any call aspect of this
      // instruction.
       if ((MI.isCall() || MI.isBranch()) && HardenIndirectCallsAndJumps)
-        hardenIndirectCallOrJumpInstr(MI, AddrRegToHardenedReg);
+        hardenIndirectCallOrJumpInstr(MI, IsPredStateSSACurrent, AddrRegToHardenedReg);
      }
 
     // After we finish hardening loads we handle interprocedural hardening if
@@ -1578,10 +1608,10 @@
        continue;
 
     // If this is a direct return (IE, not a tail call) just directly harden
-    // it.
+    // it and then we're done with the block.
     if (MI.isReturn() && !MI.isCall()) {
-      hardenReturnInstr(MI);
-      continue;
+      hardenReturnInstr(MI, IsPredStateInSP);
+      break;
     }
 
     // Otherwise we have a call. We need to handle transferring the predicate
@@ -1592,21 +1622,36 @@
     DebugLoc Loc = MI.getDebugLoc();
 
     // First, we transfer the predicate state into the called function by
-    // merging it into the stack pointer. This will kill the current def of
-    // the state.
-    unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
-    mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
+    // merging it into the stack pointer if it is not already there.
+    if (!IsPredStateInSP) {
+      assert(IsPredStateSSACurrent &&
+             "PredState must be in SSA until it enters the SP!");
+      mergePredStateIntoSP(MBB, InsertPt, Loc,
+                           PS->SSA.GetValueAtEndOfBlock(&MBB));
+      IsPredStateInSP = true;
+    }
 
     // If this call is also a return, it is a tail call and we don't need
-    // anything else to handle it so just continue.
+    // anything else to handle it, and we're done with the block.
     if (MI.isReturn())
-      continue;
+      break;
 
-    // We need to step past the call and recover the predicate
-    // state from SP after the return, and make this new state available.
-    ++InsertPt;
-    unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
-    PS->SSA.AddAvailableValue(&MBB, NewStateReg);
+    // Mark that the predicate state in the SSA updater is no longer current,
+    // so that we will extract it from the stack pointer when needed, and
+    // remember this instruction as the last safe place to extract fresh
+    // predicate state.
+    IsPredStateSSACurrent = false;
+    LastCall = &MI;
+  }
+
+  // If we end the block without the current predicate state in the SSA
+  // updater and there are any successors to the block, we need to forcibly
+  // extract it and add it to the SSA updater. Otherwise we won't correctly
+  // PHI together the virtual registers for it along interesting CFGs.
+  if (!IsPredStateSSACurrent && !MBB.succ_empty()) {
+    unsigned StateReg = extractPredStateFromSP(
+        MBB, std::next(LastCall->getIterator()), LastCall->getDebugLoc());
+    PS->SSA.AddAvailableValue(&MBB, StateReg);
   }
 
   HardenPostLoad.clear();
@@ -1697,8 +1742,32 @@
   return PredStateReg;
 }
 
+/// Gets the current predicate state.
+///
+/// This will either return the predicate state in the SSA updater if current,
+/// or it will extract fresh predicate state from the stack pointer, make the
+/// SSA updater current with that and return it for immediate use.
+unsigned X86SpeculativeLoadHardeningPass::getCurrentPredState(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt, DebugLoc Loc,
+    bool &IsPredStateSSACurrent) {
+  // If the SSA updater already has the current value, just return it.
+  if (IsPredStateSSACurrent)
+    return PS->SSA.GetValueAtEndOfBlock(&MBB);
+
+  // Otherwise we need to compute a fresh value from the stack pointer.
+  unsigned PredStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
+
+  // Tell the SSA updater about this now-current value and mark the updater
+  // as current again.
+  PS->SSA.AddAvailableValue(&MBB, PredStateReg);
+  IsPredStateSSACurrent = true;
+
+  return PredStateReg;
+}
+
 void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
     MachineInstr &MI, MachineOperand &BaseMO, MachineOperand &IndexMO,
+    bool &IsPredStateSSACurrent,
     SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc Loc = MI.getDebugLoc();
@@ -1762,9 +1831,6 @@
   if (HardenOpRegs.empty())
     return;
 
-  // Compute the current predicate state.
-  unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
-
   auto InsertPt = MI.getIterator();
 
   // If EFLAGS are live and we don't have access to instructions that avoid
@@ -1776,6 +1842,11 @@
     FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
   }
 
+  // Compute the current predicate state. This may emit instructions that
+  // clobber EFLAGS!
+  unsigned StateReg =
+      getCurrentPredState(MBB, InsertPt, Loc, IsPredStateSSACurrent);
+
   for (MachineOperand *Op : HardenOpRegs) {
     unsigned OpReg = Op->getReg();
     auto *OpRC = MRI->getRegClass(OpReg);
@@ -2022,15 +2093,22 @@
 /// register class as `Reg`.
 unsigned X86SpeculativeLoadHardeningPass::hardenValueInRegister(
     unsigned Reg, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertPt,
-    DebugLoc Loc) {
+    DebugLoc Loc, bool &IsPredStateSSACurrent) {
   assert(canHardenRegister(Reg) && "Cannot harden this register!");
   assert(TRI->isVirtualRegister(Reg) && "Cannot harden a physical register!");
 
+  unsigned FlagsReg = 0;
+  if (isEFLAGSLive(MBB, InsertPt, *TRI))
+    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
+
+  // Compute the current predicate state. This may emit instructions that
+  // clobber EFLAGS!
+  unsigned StateReg =
+      getCurrentPredState(MBB, InsertPt, Loc, IsPredStateSSACurrent);
+
   auto *RC = MRI->getRegClass(Reg);
   int Bytes = TRI->getRegSizeInBits(*RC) / 8;
 
-  unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
-
   // FIXME: Need to teach this about 32-bit mode.
   if (Bytes != 8) {
     unsigned SubRegImms[] = {X86::sub_8bit, X86::sub_16bit, X86::sub_32bit};
@@ -2041,10 +2119,6 @@
     StateReg = NarrowStateReg;
   }
 
-  unsigned FlagsReg = 0;
-  if (isEFLAGSLive(MBB, InsertPt, *TRI))
-    FlagsReg = saveEFLAGS(MBB, InsertPt, Loc);
-
   unsigned NewReg = MRI->createVirtualRegister(RC);
   unsigned OrOpCodes[] = {X86::OR8rr, X86::OR16rr, X86::OR32rr, X86::OR64rr};
   unsigned OrOpCode = OrOpCodes[Log2_32(Bytes)];
@@ -2070,7 +2144,9 @@
 /// execution and coercing them to one is sufficient.
 ///
 /// Returns the newly hardened register.
-unsigned X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI) {
+unsigned
+X86SpeculativeLoadHardeningPass::hardenPostLoad(MachineInstr &MI,
+                                                bool &IsPredStateSSACurrent) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc Loc = MI.getDebugLoc();
 
@@ -2087,8 +2163,9 @@
   // Now harden this register's value, getting a hardened reg that is safe to
   // use. Note that we insert the instructions to compute this *after* the
   // defining instruction, not before it.
-  unsigned HardenedReg = hardenValueInRegister(
-      UnhardenedReg, MBB, std::next(MI.getIterator()), Loc);
+  unsigned HardenedReg =
+      hardenValueInRegister(UnhardenedReg, MBB, std::next(MI.getIterator()),
+                            Loc, IsPredStateSSACurrent);
 
   // Finally, replace the old register (which now only has the uses of the
   // original def) with the hardened register.
@@ -2121,7 +2198,8 @@
 /// speculatively even during a BCBS-attacked return until the steering takes
 /// effect. Whenever this happens, the caller can recover the (poisoned)
 /// predicate state from the stack pointer and continue to harden loads.
-void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI) {
+void X86SpeculativeLoadHardeningPass::hardenReturnInstr(MachineInstr &MI,
+                                                        bool IsPredStateInSP) {
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc Loc = MI.getDebugLoc();
   auto InsertPt = MI.getIterator();
@@ -2142,6 +2220,13 @@
     return;
   }
 
+  // If we already have the predicate state in RSP, nothing to do here.
+  // FIXME: This is currently disabled by a flag until confirmed that using an
+  // earlier tagged stack pointer remains sufficient to mitigate BCBS attacks on
+  // the return address.
+  if (!HardenReturnsNearby && IsPredStateInSP)
+    return;
+
   // Take our predicate state, shift it to the high 17 bits (so that we keep
   // pointers canonical) and merge it into RSP. This will allow the caller to
   // extract it when we return (speculatively).
@@ -2164,7 +2249,7 @@
 /// have an opportunity to post-load harden here, we just need to scan for cases
 /// not already flagged and add them.
 void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
-    MachineInstr &MI,
+    MachineInstr &MI, bool &IsPredStateSSACurrent,
     SmallDenseMap<unsigned, unsigned, 32> &AddrRegToHardenedReg) {
   switch (MI.getOpcode()) {
   case X86::FARCALL16m:
@@ -2208,8 +2293,9 @@
   // continues to pile up. Should definitively measure its value and consider
   // eliminating it.
   if (!HardenedTargetReg)
-    HardenedTargetReg = hardenValueInRegister(
-        OldTargetReg, *MI.getParent(), MI.getIterator(), MI.getDebugLoc());
+    HardenedTargetReg =
+        hardenValueInRegister(OldTargetReg, *MI.getParent(), MI.getIterator(),
+                              MI.getDebugLoc(), IsPredStateSSACurrent);
 
   // Set the target operand to the hardened register.
   TargetOp.setReg(HardenedTargetReg);
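[Illustrative sketch -- not part of the patch. The two booleans threaded through the hunks above implement a small lazy-extraction protocol: merge the predicate state into RSP at most once before the first call, and refresh the stale SSA copy from RSP only when a hardening operation actually needs it. The self-contained C++ model below captures just that protocol; Insn, walkBlock, and HasSuccessors are invented stand-ins, and the commented-out calls only name the pass functions they mimic.]

#include <cassert>
#include <vector>

// Hypothetical stand-in for a machine instruction.
struct Insn {
  bool IsCall = false;
  bool NeedsHardening = false;
};

// Walk one block, mirroring the flag updates in the patched loop above.
void walkBlock(const std::vector<Insn> &Block, bool HasSuccessors) {
  bool IsPredStateInSP = false;      // State already merged into RSP?
  bool IsPredStateSSACurrent = true; // Is the SSA updater's value fresh?
  const Insn *LastCall = nullptr;

  for (const Insn &I : Block) {
    if (I.NeedsHardening && !IsPredStateSSACurrent) {
      // extractPredStateFromSP(...) would run here, lazily, on demand.
      IsPredStateSSACurrent = true;
    }
    if (I.IsCall) {
      if (!IsPredStateInSP) {
        assert(IsPredStateSSACurrent && "state must be in SSA until merged");
        // mergePredStateIntoSP(...) tags RSP for the callee, exactly once.
        IsPredStateInSP = true;
      }
      IsPredStateSSACurrent = false; // The callee hands state back via RSP.
      LastCall = &I;
    }
  }

  // Successor blocks PHI the state together, so force one final extraction
  // right after the last call if the SSA copy is still stale.
  if (!IsPredStateSSACurrent && HasSuccessors) {
    assert(LastCall && "state can only go stale across a call");
    // extractPredStateFromSP(...) immediately after *LastCall.
    (void)LastCall;
  }
}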
Index: llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
===================================================================
--- llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
+++ llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
@@ -16,38 +16,36 @@
 define i32 @test_indirect_call(i32 ()** %ptr) nounwind {
 ; X64-LABEL: test_indirect_call:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    movq $-1, %rcx
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
-; X64-NEXT:    callq *%rcx
-; X64-NEXT:    movq %rsp, %rcx
-; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    movq %rsp, %rbx
+; X64-NEXT:    movq $-1, %rax
+; X64-NEXT:    sarq $63, %rbx
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    orq %rbx, %rax
+; X64-NEXT:    movq %rbx, %rcx
 ; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    orq %rcx, %rsp
-; X64-NEXT:    popq %rcx
+; X64-NEXT:    callq *%rax
+; X64-NEXT:    shlq $47, %rbx
+; X64-NEXT:    orq %rbx, %rsp
+; X64-NEXT:    popq %rbx
 ; X64-NEXT:    retq
 ;
 ; X64-RETPOLINE-LABEL: test_indirect_call:
 ; X64-RETPOLINE:       # %bb.0: # %entry
-; X64-RETPOLINE-NEXT:    pushq %rax
-; X64-RETPOLINE-NEXT:    movq %rsp, %rax
-; X64-RETPOLINE-NEXT:    movq $-1, %rcx
-; X64-RETPOLINE-NEXT:    sarq $63, %rax
+; X64-RETPOLINE-NEXT:    pushq %rbx
+; X64-RETPOLINE-NEXT:    movq %rsp, %rbx
+; X64-RETPOLINE-NEXT:    movq $-1, %rax
+; X64-RETPOLINE-NEXT:    sarq $63, %rbx
 ; X64-RETPOLINE-NEXT:    movq (%rdi), %r11
-; X64-RETPOLINE-NEXT:    orq %rax, %r11
+; X64-RETPOLINE-NEXT:    orq %rbx, %r11
+; X64-RETPOLINE-NEXT:    movq %rbx, %rax
 ; X64-RETPOLINE-NEXT:    shlq $47, %rax
 ; X64-RETPOLINE-NEXT:    orq %rax, %rsp
 ; X64-RETPOLINE-NEXT:    callq __llvm_retpoline_r11
-; X64-RETPOLINE-NEXT:    movq %rsp, %rcx
-; X64-RETPOLINE-NEXT:    sarq $63, %rcx
-; X64-RETPOLINE-NEXT:    shlq $47, %rcx
-; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
-; X64-RETPOLINE-NEXT:    popq %rcx
+; X64-RETPOLINE-NEXT:    shlq $47, %rbx
+; X64-RETPOLINE-NEXT:    orq %rbx, %rsp
+; X64-RETPOLINE-NEXT:    popq %rbx
 ; X64-RETPOLINE-NEXT:    retq
 entry:
   %fp = load i32 ()*, i32 ()** %ptr
@@ -86,37 +84,35 @@
 define i32 @test_indirect_call_global() nounwind {
 ; X64-LABEL: test_indirect_call_global:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    pushq %rax
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    movq $-1, %rcx
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    movq {{.*}}(%rip), %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
-; X64-NEXT:    callq *%rcx
-; X64-NEXT:    movq %rsp, %rcx
-; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    movq %rsp, %rbx
+; X64-NEXT:    movq $-1, %rax
+; X64-NEXT:    sarq $63, %rbx
+; X64-NEXT:    movq {{.*}}(%rip), %rax
+; X64-NEXT:    orq %rbx, %rax
+; X64-NEXT:    movq %rbx, %rcx
 ; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    orq %rcx, %rsp
-; X64-NEXT:    popq %rcx
+; X64-NEXT:    callq *%rax
+; X64-NEXT:    shlq $47, %rbx
+; X64-NEXT:    orq %rbx, %rsp
+; X64-NEXT:    popq %rbx
 ; X64-NEXT:    retq
 ;
 ; X64-RETPOLINE-LABEL: test_indirect_call_global:
 ; X64-RETPOLINE:       # %bb.0: # %entry
-; X64-RETPOLINE-NEXT:    pushq %rax
-; X64-RETPOLINE-NEXT:    movq %rsp, %rax
-; X64-RETPOLINE-NEXT:    movq $-1, %rcx
-; X64-RETPOLINE-NEXT:    sarq $63, %rax
+; X64-RETPOLINE-NEXT:    pushq %rbx
+; X64-RETPOLINE-NEXT:    movq %rsp, %rbx
+; X64-RETPOLINE-NEXT:    movq $-1, %rax
+; X64-RETPOLINE-NEXT:    sarq $63, %rbx
 ; X64-RETPOLINE-NEXT:    movq {{.*}}(%rip), %r11
+; X64-RETPOLINE-NEXT:    movq %rbx, %rax
 ; X64-RETPOLINE-NEXT:    shlq $47, %rax
 ; X64-RETPOLINE-NEXT:    orq %rax, %rsp
 ; X64-RETPOLINE-NEXT:    callq __llvm_retpoline_r11
-; X64-RETPOLINE-NEXT:    movq %rsp, %rcx
-; X64-RETPOLINE-NEXT:    sarq $63, %rcx
-; X64-RETPOLINE-NEXT:    shlq $47, %rcx
-; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
-; X64-RETPOLINE-NEXT:    popq %rcx
+; X64-RETPOLINE-NEXT:    shlq $47, %rbx
+; X64-RETPOLINE-NEXT:    orq %rbx, %rsp
+; X64-RETPOLINE-NEXT:    popq %rbx
 ; X64-RETPOLINE-NEXT:    retq
 entry:
   %fp = load i32 ()*, i32 ()** @global_fnptr
Index: llvm/test/CodeGen/X86/speculative-load-hardening.ll
===================================================================
--- llvm/test/CodeGen/X86/speculative-load-hardening.ll
+++ llvm/test/CodeGen/X86/speculative-load-hardening.ll
@@ -487,15 +487,11 @@
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq __cxa_allocate_exception
-; X64-NEXT:    movq %rsp, %rcx
-; X64-NEXT:    sarq $63, %rcx
 ; X64-NEXT:    movl %ebp, (%rax)
 ; X64-NEXT:  .Ltmp0:
 ; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    xorl %edx, %edx
-; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    movq %rax, %rdi
-; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    callq __cxa_throw
 ; X64-NEXT:    movq %rsp, %rax
 ; X64-NEXT:    sarq $63, %rax
@@ -523,8 +519,6 @@
 ; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    callq sink
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
 ;
 ; X64-LFENCE-LABEL: test_basic_eh:
 ; X64-LFENCE:       # %bb.0: # %entry
@@ -600,79 +594,55 @@
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    pushq %r15
 ; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    pushq %rax
 ; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    movq %rcx, %r15
+; X64-NEXT:    movq %rcx, %r12
 ; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq %rsi, %rbx
-; X64-NEXT:    movq %rdi, %r12
+; X64-NEXT:    movq %rsi, %r15
+; X64-NEXT:    movq %rdi, %r13
 ; X64-NEXT:    movq $-1, %rcx
 ; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    orq %rax, %r12
+; X64-NEXT:    orq %rax, %r13
 ; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_float
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    orq %rax, %rbx
+; X64-NEXT:    movq %rsp, %rbx
+; X64-NEXT:    sarq $63, %rbx
+; X64-NEXT:    orq %r15, %rbx
 ; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_double
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    cvtsd2ss %xmm0, %xmm0
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_float
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    cvtss2sd %xmm0, %xmm0
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_double
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    orq %rax, %r14
+; X64-NEXT:    movq %rsp, %r15
+; X64-NEXT:    sarq $63, %r15
+; X64-NEXT:    orq %r14, %r15
 ; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    cvtsi2ssl (%r14), %xmm0
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    cvtsi2ssl (%r15), %xmm0
 ; X64-NEXT:    callq sink_float
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    orq %rax, %r15
+; X64-NEXT:    movq %rsp, %rbx
+; X64-NEXT:    sarq $63, %rbx
+; X64-NEXT:    orq %rbx, %r12
 ; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    cvtsi2sdq (%r15), %xmm0
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    cvtsi2sdq (%r12), %xmm0
 ; X64-NEXT:    callq sink_double
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    cvtsi2ssq (%r15), %xmm0
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    cvtsi2ssq (%r12), %xmm0
 ; X64-NEXT:    callq sink_float
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    xorps %xmm0, %xmm0
-; X64-NEXT:    cvtsi2sdl (%r14), %xmm0
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    cvtsi2sdl (%r15), %xmm0
 ; X64-NEXT:    callq sink_double
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
-; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    shlq $47, %rbx
+; X64-NEXT:    orq %rbx, %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
 ; X64-NEXT:    popq %r14
 ; X64-NEXT:    popq %r15
 ; X64-NEXT:    retq
@@ -759,10 +729,10 @@
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    movq %r9, %r14
-; X64-NEXT:    movq %r8, %r15
-; X64-NEXT:    movq %rcx, %r12
-; X64-NEXT:    movq %rdx, %r13
+; X64-NEXT:    movq %r9, %r13
+; X64-NEXT:    movq %r8, %r14
+; X64-NEXT:    movq %rcx, %r15
+; X64-NEXT:    movq %rdx, %r12
 ; X64-NEXT:    movq %rsi, %rbx
 ; X64-NEXT:    movq $-1, %rcx
 ; X64-NEXT:    sarq $63, %rax
@@ -773,43 +743,31 @@
 ; X64-NEXT:    callq sink_v4f32
 ; X64-NEXT:    movq %rsp, %rax
 ; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    orq %rax, %rbx
-; X64-NEXT:    movaps (%rbx), %xmm0
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    orq %rbx, %rax
+; X64-NEXT:    movaps (%rax), %xmm0
 ; X64-NEXT:    callq sink_v2f64
 ; X64-NEXT:    movq %rsp, %rax
 ; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    orq %rax, %r13
-; X64-NEXT:    movaps (%r13), %xmm0
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    orq %r12, %rax
+; X64-NEXT:    movaps (%rax), %xmm0
 ; X64-NEXT:    callq sink_v16i8
 ; X64-NEXT:    movq %rsp, %rax
 ; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    orq %rax, %r12
-; X64-NEXT:    movaps (%r12), %xmm0
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    orq %r15, %rax
+; X64-NEXT:    movaps (%rax), %xmm0
 ; X64-NEXT:    callq sink_v8i16
 ; X64-NEXT:    movq %rsp, %rax
 ; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    orq %rax, %r15
-; X64-NEXT:    movaps (%r15), %xmm0
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    orq %r14, %rax
+; X64-NEXT:    movaps (%rax), %xmm0
 ; X64-NEXT:    callq sink_v4i32
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    orq %rax, %r14
-; X64-NEXT:    movaps (%r14), %xmm0
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    movq %rsp, %rbx
+; X64-NEXT:    sarq $63, %rbx
+; X64-NEXT:    orq %rbx, %r13
+; X64-NEXT:    movaps (%r13), %xmm0
 ; X64-NEXT:    callq sink_v2i64
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    shlq $47, %rbx
+; X64-NEXT:    orq %rbx, %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13
@@ -866,12 +824,12 @@
 define void @test_deferred_hardening(i32* %ptr1, i32* %ptr2, i32 %x) nounwind {
 ; X64-LABEL: test_deferred_hardening:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %r15
 ; X64-NEXT:    pushq %r14
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    pushq %rax
 ; X64-NEXT:    movq %rsp, %rax
 ; X64-NEXT:    movq %rsi, %r14
-; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    movq %rdi, %r15
 ; X64-NEXT:    movq $-1, %rcx
 ; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    movl (%rdi), %edi
@@ -882,49 +840,39 @@
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink
 ; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    movl (%rbx), %ecx
+; X64-NEXT:    movl (%r15), %ecx
 ; X64-NEXT:    movl (%r14), %edx
 ; X64-NEXT:    leal 1(%rcx,%rdx), %edi
+; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    orl %eax, %edi
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink
 ; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    movl (%rbx), %edi
+; X64-NEXT:    movl (%r15), %edi
 ; X64-NEXT:    shll $7, %edi
+; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    orl %eax, %edi
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink
 ; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    movzwl (%rbx), %ecx
+; X64-NEXT:    movzwl (%r15), %ecx
 ; X64-NEXT:    sarw $7, %cx
 ; X64-NEXT:    movzwl %cx, %edi
 ; X64-NEXT:    notl %edi
+; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    orl %eax, %edi
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    movzwl (%rbx), %ecx
-; X64-NEXT:    rolw $9, %cx
-; X64-NEXT:    movswl %cx, %edi
+; X64-NEXT:    movq %rsp, %rbx
+; X64-NEXT:    movzwl (%r15), %eax
+; X64-NEXT:    rolw $9, %ax
+; X64-NEXT:    movswl %ax, %edi
 ; X64-NEXT:    negl %edi
-; X64-NEXT:    orl %eax, %edi
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    sarq $63, %rbx
+; X64-NEXT:    orl %ebx, %edi
 ; X64-NEXT:    callq sink
-; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:    shlq $47, %rax
-; X64-NEXT:    orq %rax, %rsp
-; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    shlq $47, %rbx
+; X64-NEXT:    orq %rbx, %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
 ; X64-NEXT:    retq
 ;
 ; X64-LFENCE-LABEL: test_deferred_hardening:
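[Illustrative sketch -- not part of the patch. The shlq $47 / orq / sarq $63 sequences in the CHECK lines above implement the pointer-tagging arithmetic modelled below in plain C++. mergeIntoSP and extractFromSP are invented names standing in for the pass's mergePredStateIntoSP and extractPredStateFromSP; the stack-pointer constant is an arbitrary example.]

#include <cassert>
#include <cstdint>

// The predicate state is all-zeros (correct path) or all-ones (misspeculated
// path). Shifting it into the top 17 bits of RSP keeps the pointer canonical
// on the correct path while forcing the high bits on the poisoned path.
uint64_t mergeIntoSP(uint64_t SP, uint64_t PredState) {
  return SP | (PredState << 47); // shlq $47, %reg ; orq %reg, %rsp
}

// An arithmetic right shift by 63 smears RSP's sign bit across the register,
// recovering all-zeros or all-ones without a branch.
int64_t extractFromSP(uint64_t SP) {
  return static_cast<int64_t>(SP) >> 63; // movq %rsp, %reg ; sarq $63, %reg
}

int main() {
  uint64_t SP = 0x00007fffffffe000; // a canonical user-space stack pointer
  assert(extractFromSP(mergeIntoSP(SP, 0)) == 0);      // untainted
  assert(extractFromSP(mergeIntoSP(SP, ~0ULL)) == -1); // poisoned
  return 0;
}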