Index: llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
===================================================================
--- llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -1498,13 +1498,6 @@
     // pass specifically so that we have the complete set of instructions for
     // which we will do post-load hardening and can defer it in certain
     // circumstances.
-    //
-    // FIXME: This could probably be made even more effective by doing it
-    // across the entire function. Rather than just walking the flat list
-    // backwards here, we could walk the function in PO and each block bottom
-    // up, allowing us to in some cases sink hardening across block blocks. As
-    // long as the in-block predicate state is used at the eventual hardening
-    // site, this remains safe.
     for (MachineInstr &MI : MBB) {
       if (HardenLoads) {
         // We cannot both require hardening the def of a load and its address.
@@ -1586,8 +1579,8 @@
       }
 
       // Otherwise we have a call. We need to handle transferring the predicate
-      // state into a call and recovering it after the call returns unless this
-      // is a tail call.
+      // state into a call and recovering it after the call returns (unless this
+      // is a tail call).
       assert(MI.isCall() && "Should only reach here for calls!");
       tracePredStateThroughCall(MI);
     }
@@ -2109,21 +2102,10 @@
   DebugLoc Loc = MI.getDebugLoc();
   auto InsertPt = MI.getIterator();
 
-  if (FenceCallAndRet) {
-    // Simply forcibly block speculation of loads out of the function by using
-    // an LFENCE. This is potentially a heavy-weight mitigation strategy, but
-    // should be secure, is simple from an ABI perspective, and the cost can be
-    // minimized through inlining.
-    //
-    // FIXME: We should investigate ways to establish a strong data-dependency
-    // on the return. However, poisoning the stack pointer is unlikely to work
-    // because the return is *predicted* rather than relying on the load of the
-    // return address to actually resolve.
-    BuildMI(MBB, InsertPt, Loc, TII->get(X86::LFENCE));
-    ++NumInstsInserted;
-    ++NumLFENCEsInserted;
+  if (FenceCallAndRet)
+    // No need to fence here as we'll fence at the return site itself. That
+    // covers more cases than we can handle here.
     return;
-  }
 
   // Take our predicate state, shift it to the high 17 bits (so that we keep
   // pointers canonical) and merge it into RSP. This will allow the caller to
@@ -2141,31 +2123,164 @@
 ///
 /// For tail calls, this is all we need to do.
 ///
-/// For calls where we might return to control flow, we further need to extract
-/// the predicate state built up within that function from the high bits of the
-/// stack pointer, and make that the newly available predicate state.
+/// For calls where we might return and resume the control flow, we need to
+/// extract the predicate state from the high bits of the stack pointer after
+/// control returns from the called function.
+///
+/// We also need to verify that we intended to return to this location in the
+/// code. An attacker might arrange for the processor to mispredict the return
+/// to this location, a valid return address in the program but not the
+/// correct one for that return. See the paper on this attack, dubbed
+/// "ret2spec" by its authors:
+/// https://christian-rossow.de/publications/ret2spec-ccs2018.pdf
+///
+/// The way we verify that we returned to the correct location is by preserving
+/// the expected return address across the call. One technique takes advantage
+/// of the red-zone: after the callee's RET pops the return address off the
+/// stack, it remains available at `-8(%rsp)`. Alternatively, we can
+/// directly save the address into a register that will be preserved across the
+/// call. We compare this intended return address against the address
+/// immediately following the call (the observed return address). If these
+/// mismatch, we have detected misspeculation and can poison our predicate
+/// state.
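+///
+/// As a rough sketch (the exact registers are chosen by the register
+/// allocator; this mirrors the sequences checked in the accompanying tests),
+/// the hardened call in the small, non-PIC code model looks something like:
+///
+///   callq f
+/// .Ltmp0:                      # label attached to the call instruction
+///   movq %rsp, %rax            # reload the predicate state from RSP
+///   movq -8(%rsp), %rcx        # actual return address, still in the red-zone
+///   sarq $63, %rax             # smear the poison bit across the register
+///   cmpq $.Ltmp0, %rcx         # compare against this return site's label
+///   cmovneq %r14, %rax         # %r14 holds the all-ones poison value
+///
+/// For PIC or larger code models the label's address cannot be encoded as an
+/// immediate, so it is instead materialized with a RIP-relative LEA and
+/// compared with a register-register CMP.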
 void X86SpeculativeLoadHardeningPass::tracePredStateThroughCall(
     MachineInstr &MI) {
   MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
   auto InsertPt = MI.getIterator();
   DebugLoc Loc = MI.getDebugLoc();
 
+  if (FenceCallAndRet) {
+    if (MI.isReturn())
+      // Tail call; we don't return to this function.
+      // FIXME: We should also handle noreturn calls.
+      return;
+
+    // We don't need to fence before the call because the called function
+    // should fence in its entry. However, we do need to fence after the call
+    // returns: fencing before the callee's return would not correctly handle
+    // cases where the return itself is mispredicted.
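+    //
+    // A minimal sketch of what this emits around a call to some function `f`:
+    //
+    //   callq f
+    //   lfence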
+    BuildMI(MBB, std::next(InsertPt), Loc, TII->get(X86::LFENCE));
+    ++NumInstsInserted;
+    ++NumLFENCEsInserted;
+    return;
+  }
+
   // First, we transfer the predicate state into the called function by merging
   // it into the stack pointer. This will kill the current def of the state.
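+  //
+  // Roughly, this shows up in the emitted code as the following sequence
+  // immediately before the call (see the test expectations):
+  //
+  //   shlq $47, %rax           # shift the state into the high bits
+  //   orq %rax, %rsp           # merge it into the stack pointer
+  //   callq f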
   unsigned StateReg = PS->SSA.GetValueAtEndOfBlock(&MBB);
   mergePredStateIntoSP(MBB, InsertPt, Loc, StateReg);
 
   // If this call is also a return, it is a tail call and we don't need anything
-  // else to handle it so just continue.
-  // FIXME: We should also handle noreturn calls.
-  if (MI.isReturn())
+  // else to handle it, so just return. Also, if there are no further
+  // instructions and no successors, this call does not return, so we can bail
+  // as well.
+  if (MI.isReturn() || (std::next(InsertPt) == MBB.end() && MBB.succ_empty()))
     return;
 
-  // We need to step past the call and recover the predicate state from SP after
-  // the return, and make this new state available.
+  // Create a symbol to track the return address and attach it to the call
+  // machine instruction. We will lower extra symbols attached to call
+  // instructions as a label immediately following the call.
+  MCSymbol *RetSymbol = MF.getContext().createTempSymbol();
+  MI.setPostInstrSymbol(MF, RetSymbol);
+
+  const TargetRegisterClass *AddrRC = &X86::GR64RegClass;
+  unsigned ExpectedRetAddrReg = 0;
+
+  // If we have no red zones, we need to save the expected return address prior
+  // to the call.
+  if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone)) {
+    // Compute the expected return address prior to the call and store it in a
+    // register that lives across the call.
+    //
+    // In some ways, this is doubly satisfying as a mitigation because it will
+    // also successfully detect stack smashing bugs in some cases (typically,
+    // when a callee-saved register is used and the callee doesn't push it onto
+    // the stack). But that isn't our primary goal, so we only use it as
+    // a fallback.
+    //
+    // FIXME: It isn't clear that this is reliable in the face of
+    // rematerialization in the register allocator. We somehow need to force
+    // that to not occur for this particular instruction, and instead to spill
+    // or otherwise preserve the value computed *prior* to the call.
+    //
+    // FIXME: It is even less clear why MachineCSE can't just fold this when we
+    // end up having to use identical instructions both before and after the
+    // call to feed the comparison.
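+    //
+    // For illustration (mirroring the `noredzone` tests, with registers chosen
+    // by the register allocator), the emitted sequence looks roughly like:
+    //
+    //   movq $.Ltmp2, %rbp       # save the expected return address
+    //   callq f
+    // .Ltmp2:
+    //   ...                      # reload the predicate state from %rsp
+    //   cmpq $.Ltmp2, %rbp       # compare against this return site's label
+    //   cmovneq %r14, %rax       # poison the state on a mismatch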
+    ExpectedRetAddrReg = MRI->createVirtualRegister(AddrRC);
+    if (MF.getTarget().getCodeModel() == CodeModel::Small &&
+        !Subtarget->isPositionIndependent()) {
+      BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64ri32), ExpectedRetAddrReg)
+          .addSym(RetSymbol);
+    } else {
+      BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ExpectedRetAddrReg)
+          .addReg(/*Base*/ X86::RIP)
+          .addImm(/*Scale*/ 1)
+          .addReg(/*Index*/ 0)
+          .addSym(RetSymbol)
+          .addReg(/*Segment*/ 0);
+    }
+  }
+
+  // Step past the call to handle when it returns.
   ++InsertPt;
+
+  // If we have red zones enabled, then the return address is still available on
+  // the stack immediately after the call. As the very first instruction after
+  // the call, we load it into a register.
+  if (!MF.getFunction().hasFnAttribute(Attribute::NoRedZone)) {
+    ExpectedRetAddrReg = MRI->createVirtualRegister(AddrRC);
+    BuildMI(MBB, InsertPt, Loc, TII->get(X86::MOV64rm), ExpectedRetAddrReg)
+        .addReg(/*Base*/ X86::RSP)
+        .addImm(/*Scale*/ 1)
+        .addReg(/*Index*/ 0)
+        .addImm(/*Displacement*/ -8) // The RET popped the return address, so
+                                     // it now sits 8 bytes below %rsp.
+        .addReg(/*Segment*/ 0);
+  }
+
+  // Now we extract the callee's predicate state from the stack pointer.
   unsigned NewStateReg = extractPredStateFromSP(MBB, InsertPt, Loc);
-  PS->SSA.AddAvailableValue(&MBB, NewStateReg);
+
+  // Compare the return address we preserved (or reloaded) against the address
+  // of this return site (the label attached to the call). If we can encode
+  // the label as an immediate, this is easy. Otherwise we compute it.
+  if (MF.getTarget().getCodeModel() == CodeModel::Small &&
+      !Subtarget->isPositionIndependent()) {
+    // FIXME: Could we fold this with the load? It would require careful EFLAGS
+    // management.
+    BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64ri32))
+        .addReg(ExpectedRetAddrReg, RegState::Kill)
+        .addSym(RetSymbol);
+  } else {
+    unsigned ActualRetAddrReg = MRI->createVirtualRegister(AddrRC);
+    BuildMI(MBB, InsertPt, Loc, TII->get(X86::LEA64r), ActualRetAddrReg)
+        .addReg(/*Base*/ X86::RIP)
+        .addImm(/*Scale*/ 1)
+        .addReg(/*Index*/ 0)
+        .addSym(RetSymbol)
+        .addReg(/*Segment*/ 0);
+    BuildMI(MBB, InsertPt, Loc, TII->get(X86::CMP64rr))
+        .addReg(ExpectedRetAddrReg, RegState::Kill)
+        .addReg(ActualRetAddrReg, RegState::Kill);
+  }
+
+  // Now conditionally update the predicate state we just extracted if we ended
+  // up at a different return address than expected.
+  int PredStateSizeInBytes = TRI->getRegSizeInBits(*PS->RC) / 8;
+  auto CMovOp = X86::getCMovFromCond(X86::COND_NE, PredStateSizeInBytes);
+
+  unsigned UpdatedStateReg = MRI->createVirtualRegister(PS->RC);
+  auto CMovI = BuildMI(MBB, InsertPt, Loc, TII->get(CMovOp), UpdatedStateReg)
+                   .addReg(NewStateReg, RegState::Kill)
+                   .addReg(PS->PoisonReg);
+  CMovI->findRegisterUseOperand(X86::EFLAGS)->setIsKill(true);
+  ++NumInstsInserted;
+  LLVM_DEBUG(dbgs() << "  Inserting cmov: "; CMovI->dump(); dbgs() << "\n");
+
+  PS->SSA.AddAvailableValue(&MBB, UpdatedStateReg);
 }
 
 /// An attacker may speculatively store over a value that is then speculatively
Index: llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/speculative-load-hardening-call-and-ret.ll
@@ -0,0 +1,274 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening | FileCheck %s --check-prefix=X64-NOPIC
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -code-model medium | FileCheck %s --check-prefix=X64-NOPIC-MCM
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -x86-speculative-load-hardening -relocation-model pic | FileCheck %s --check-prefix=X64-PIC
+;
+; FIXME: Add support for 32-bit.
+
+declare void @f()
+
+define i32 @test_calls_and_rets(i32 *%ptr) nounwind {
+; X64-NOPIC-LABEL: test_calls_and_rets:
+; X64-NOPIC:       # %bb.0: # %entry
+; X64-NOPIC-NEXT:    pushq %rbp
+; X64-NOPIC-NEXT:    pushq %r14
+; X64-NOPIC-NEXT:    pushq %rbx
+; X64-NOPIC-NEXT:    movq %rsp, %rax
+; X64-NOPIC-NEXT:    movq %rdi, %rbx
+; X64-NOPIC-NEXT:    movq $-1, %r14
+; X64-NOPIC-NEXT:    sarq $63, %rax
+; X64-NOPIC-NEXT:    shlq $47, %rax
+; X64-NOPIC-NEXT:    orq %rax, %rsp
+; X64-NOPIC-NEXT:    callq f
+; X64-NOPIC-NEXT:  .Ltmp0:
+; X64-NOPIC-NEXT:    movq %rsp, %rax
+; X64-NOPIC-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NOPIC-NEXT:    sarq $63, %rax
+; X64-NOPIC-NEXT:    cmpq $.Ltmp0, %rcx
+; X64-NOPIC-NEXT:    cmovneq %r14, %rax
+; X64-NOPIC-NEXT:    movl (%rbx), %ebp
+; X64-NOPIC-NEXT:    shlq $47, %rax
+; X64-NOPIC-NEXT:    orq %rax, %rsp
+; X64-NOPIC-NEXT:    callq f
+; X64-NOPIC-NEXT:  .Ltmp1:
+; X64-NOPIC-NEXT:    movq %rsp, %rcx
+; X64-NOPIC-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-NOPIC-NEXT:    sarq $63, %rcx
+; X64-NOPIC-NEXT:    cmpq $.Ltmp1, %rax
+; X64-NOPIC-NEXT:    cmovneq %r14, %rcx
+; X64-NOPIC-NEXT:    addl (%rbx), %ebp
+; X64-NOPIC-NEXT:    orl %ecx, %ebp
+; X64-NOPIC-NEXT:    shlq $47, %rcx
+; X64-NOPIC-NEXT:    movl %ebp, %eax
+; X64-NOPIC-NEXT:    orq %rcx, %rsp
+; X64-NOPIC-NEXT:    popq %rbx
+; X64-NOPIC-NEXT:    popq %r14
+; X64-NOPIC-NEXT:    popq %rbp
+; X64-NOPIC-NEXT:    retq
+;
+; X64-NOPIC-MCM-LABEL: test_calls_and_rets:
+; X64-NOPIC-MCM:       # %bb.0: # %entry
+; X64-NOPIC-MCM-NEXT:    pushq %rbp
+; X64-NOPIC-MCM-NEXT:    pushq %r14
+; X64-NOPIC-MCM-NEXT:    pushq %rbx
+; X64-NOPIC-MCM-NEXT:    movq %rsp, %rax
+; X64-NOPIC-MCM-NEXT:    movq %rdi, %rbx
+; X64-NOPIC-MCM-NEXT:    movq $-1, %r14
+; X64-NOPIC-MCM-NEXT:    sarq $63, %rax
+; X64-NOPIC-MCM-NEXT:    shlq $47, %rax
+; X64-NOPIC-MCM-NEXT:    orq %rax, %rsp
+; X64-NOPIC-MCM-NEXT:    callq f
+; X64-NOPIC-MCM-NEXT:  .Ltmp0:
+; X64-NOPIC-MCM-NEXT:    movq %rsp, %rax
+; X64-NOPIC-MCM-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-NOPIC-MCM-NEXT:    sarq $63, %rax
+; X64-NOPIC-MCM-NEXT:    leaq {{.*}}(%rip), %rdx
+; X64-NOPIC-MCM-NEXT:    cmpq %rdx, %rcx
+; X64-NOPIC-MCM-NEXT:    cmovneq %r14, %rax
+; X64-NOPIC-MCM-NEXT:    movl (%rbx), %ebp
+; X64-NOPIC-MCM-NEXT:    shlq $47, %rax
+; X64-NOPIC-MCM-NEXT:    orq %rax, %rsp
+; X64-NOPIC-MCM-NEXT:    callq f
+; X64-NOPIC-MCM-NEXT:  .Ltmp1:
+; X64-NOPIC-MCM-NEXT:    movq %rsp, %rcx
+; X64-NOPIC-MCM-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-NOPIC-MCM-NEXT:    sarq $63, %rcx
+; X64-NOPIC-MCM-NEXT:    leaq {{.*}}(%rip), %rdx
+; X64-NOPIC-MCM-NEXT:    cmpq %rdx, %rax
+; X64-NOPIC-MCM-NEXT:    cmovneq %r14, %rcx
+; X64-NOPIC-MCM-NEXT:    addl (%rbx), %ebp
+; X64-NOPIC-MCM-NEXT:    orl %ecx, %ebp
+; X64-NOPIC-MCM-NEXT:    shlq $47, %rcx
+; X64-NOPIC-MCM-NEXT:    movl %ebp, %eax
+; X64-NOPIC-MCM-NEXT:    orq %rcx, %rsp
+; X64-NOPIC-MCM-NEXT:    popq %rbx
+; X64-NOPIC-MCM-NEXT:    popq %r14
+; X64-NOPIC-MCM-NEXT:    popq %rbp
+; X64-NOPIC-MCM-NEXT:    retq
+;
+; X64-PIC-LABEL: test_calls_and_rets:
+; X64-PIC:       # %bb.0: # %entry
+; X64-PIC-NEXT:    pushq %rbp
+; X64-PIC-NEXT:    pushq %r14
+; X64-PIC-NEXT:    pushq %rbx
+; X64-PIC-NEXT:    movq %rsp, %rax
+; X64-PIC-NEXT:    movq %rdi, %rbx
+; X64-PIC-NEXT:    movq $-1, %r14
+; X64-PIC-NEXT:    sarq $63, %rax
+; X64-PIC-NEXT:    shlq $47, %rax
+; X64-PIC-NEXT:    orq %rax, %rsp
+; X64-PIC-NEXT:    callq f@PLT
+; X64-PIC-NEXT:  .Ltmp0:
+; X64-PIC-NEXT:    movq %rsp, %rax
+; X64-PIC-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
+; X64-PIC-NEXT:    sarq $63, %rax
+; X64-PIC-NEXT:    leaq {{.*}}(%rip), %rdx
+; X64-PIC-NEXT:    cmpq %rdx, %rcx
+; X64-PIC-NEXT:    cmovneq %r14, %rax
+; X64-PIC-NEXT:    movl (%rbx), %ebp
+; X64-PIC-NEXT:    shlq $47, %rax
+; X64-PIC-NEXT:    orq %rax, %rsp
+; X64-PIC-NEXT:    callq f@PLT
+; X64-PIC-NEXT:  .Ltmp1:
+; X64-PIC-NEXT:    movq %rsp, %rcx
+; X64-PIC-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; X64-PIC-NEXT:    sarq $63, %rcx
+; X64-PIC-NEXT:    leaq {{.*}}(%rip), %rdx
+; X64-PIC-NEXT:    cmpq %rdx, %rax
+; X64-PIC-NEXT:    cmovneq %r14, %rcx
+; X64-PIC-NEXT:    addl (%rbx), %ebp
+; X64-PIC-NEXT:    orl %ecx, %ebp
+; X64-PIC-NEXT:    shlq $47, %rcx
+; X64-PIC-NEXT:    movl %ebp, %eax
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    popq %rbx
+; X64-PIC-NEXT:    popq %r14
+; X64-PIC-NEXT:    popq %rbp
+; X64-PIC-NEXT:    retq
+entry:
+  call void @f()
+  %x = load i32, i32* %ptr
+  call void @f()
+  %y = load i32, i32* %ptr
+  %z = add i32 %x, %y
+  ret i32 %z
+}
+
+define i32 @test_calls_and_rets_noredzone(i32 *%ptr) nounwind noredzone {
+; X64-NOPIC-LABEL: test_calls_and_rets_noredzone:
+; X64-NOPIC:       # %bb.0: # %entry
+; X64-NOPIC-NEXT:    pushq %rbp
+; X64-NOPIC-NEXT:    pushq %r15
+; X64-NOPIC-NEXT:    pushq %r14
+; X64-NOPIC-NEXT:    pushq %rbx
+; X64-NOPIC-NEXT:    pushq %rax
+; X64-NOPIC-NEXT:    movq %rsp, %rax
+; X64-NOPIC-NEXT:    movq %rdi, %rbx
+; X64-NOPIC-NEXT:    movq $-1, %r14
+; X64-NOPIC-NEXT:    sarq $63, %rax
+; X64-NOPIC-NEXT:    shlq $47, %rax
+; X64-NOPIC-NEXT:    orq %rax, %rsp
+; X64-NOPIC-NEXT:    movq $.Ltmp2, %rbp
+; X64-NOPIC-NEXT:    callq f
+; X64-NOPIC-NEXT:  .Ltmp2:
+; X64-NOPIC-NEXT:    movq %rsp, %rax
+; X64-NOPIC-NEXT:    sarq $63, %rax
+; X64-NOPIC-NEXT:    cmpq $.Ltmp2, %rbp
+; X64-NOPIC-NEXT:    cmovneq %r14, %rax
+; X64-NOPIC-NEXT:    movl (%rbx), %ebp
+; X64-NOPIC-NEXT:    shlq $47, %rax
+; X64-NOPIC-NEXT:    orq %rax, %rsp
+; X64-NOPIC-NEXT:    movq $.Ltmp3, %r15
+; X64-NOPIC-NEXT:    callq f
+; X64-NOPIC-NEXT:  .Ltmp3:
+; X64-NOPIC-NEXT:    movq %rsp, %rcx
+; X64-NOPIC-NEXT:    sarq $63, %rcx
+; X64-NOPIC-NEXT:    cmpq $.Ltmp3, %r15
+; X64-NOPIC-NEXT:    cmovneq %r14, %rcx
+; X64-NOPIC-NEXT:    addl (%rbx), %ebp
+; X64-NOPIC-NEXT:    orl %ecx, %ebp
+; X64-NOPIC-NEXT:    shlq $47, %rcx
+; X64-NOPIC-NEXT:    movl %ebp, %eax
+; X64-NOPIC-NEXT:    orq %rcx, %rsp
+; X64-NOPIC-NEXT:    addq $8, %rsp
+; X64-NOPIC-NEXT:    popq %rbx
+; X64-NOPIC-NEXT:    popq %r14
+; X64-NOPIC-NEXT:    popq %r15
+; X64-NOPIC-NEXT:    popq %rbp
+; X64-NOPIC-NEXT:    retq
+;
+; X64-NOPIC-MCM-LABEL: test_calls_and_rets_noredzone:
+; X64-NOPIC-MCM:       # %bb.0: # %entry
+; X64-NOPIC-MCM-NEXT:    pushq %rbp
+; X64-NOPIC-MCM-NEXT:    pushq %r15
+; X64-NOPIC-MCM-NEXT:    pushq %r14
+; X64-NOPIC-MCM-NEXT:    pushq %rbx
+; X64-NOPIC-MCM-NEXT:    pushq %rax
+; X64-NOPIC-MCM-NEXT:    movq %rsp, %rax
+; X64-NOPIC-MCM-NEXT:    movq %rdi, %rbx
+; X64-NOPIC-MCM-NEXT:    movq $-1, %r14
+; X64-NOPIC-MCM-NEXT:    sarq $63, %rax
+; X64-NOPIC-MCM-NEXT:    shlq $47, %rax
+; X64-NOPIC-MCM-NEXT:    orq %rax, %rsp
+; X64-NOPIC-MCM-NEXT:    leaq {{.*}}(%rip), %rbp
+; X64-NOPIC-MCM-NEXT:    callq f
+; X64-NOPIC-MCM-NEXT:  .Ltmp2:
+; X64-NOPIC-MCM-NEXT:    movq %rsp, %rax
+; X64-NOPIC-MCM-NEXT:    sarq $63, %rax
+; X64-NOPIC-MCM-NEXT:    leaq {{.*}}(%rip), %rcx
+; X64-NOPIC-MCM-NEXT:    cmpq %rcx, %rbp
+; X64-NOPIC-MCM-NEXT:    cmovneq %r14, %rax
+; X64-NOPIC-MCM-NEXT:    movl (%rbx), %ebp
+; X64-NOPIC-MCM-NEXT:    shlq $47, %rax
+; X64-NOPIC-MCM-NEXT:    orq %rax, %rsp
+; X64-NOPIC-MCM-NEXT:    leaq {{.*}}(%rip), %r15
+; X64-NOPIC-MCM-NEXT:    callq f
+; X64-NOPIC-MCM-NEXT:  .Ltmp3:
+; X64-NOPIC-MCM-NEXT:    movq %rsp, %rcx
+; X64-NOPIC-MCM-NEXT:    sarq $63, %rcx
+; X64-NOPIC-MCM-NEXT:    leaq {{.*}}(%rip), %rax
+; X64-NOPIC-MCM-NEXT:    cmpq %rax, %r15
+; X64-NOPIC-MCM-NEXT:    cmovneq %r14, %rcx
+; X64-NOPIC-MCM-NEXT:    addl (%rbx), %ebp
+; X64-NOPIC-MCM-NEXT:    orl %ecx, %ebp
+; X64-NOPIC-MCM-NEXT:    shlq $47, %rcx
+; X64-NOPIC-MCM-NEXT:    movl %ebp, %eax
+; X64-NOPIC-MCM-NEXT:    orq %rcx, %rsp
+; X64-NOPIC-MCM-NEXT:    addq $8, %rsp
+; X64-NOPIC-MCM-NEXT:    popq %rbx
+; X64-NOPIC-MCM-NEXT:    popq %r14
+; X64-NOPIC-MCM-NEXT:    popq %r15
+; X64-NOPIC-MCM-NEXT:    popq %rbp
+; X64-NOPIC-MCM-NEXT:    retq
+;
+; X64-PIC-LABEL: test_calls_and_rets_noredzone:
+; X64-PIC:       # %bb.0: # %entry
+; X64-PIC-NEXT:    pushq %rbp
+; X64-PIC-NEXT:    pushq %r15
+; X64-PIC-NEXT:    pushq %r14
+; X64-PIC-NEXT:    pushq %rbx
+; X64-PIC-NEXT:    pushq %rax
+; X64-PIC-NEXT:    movq %rsp, %rax
+; X64-PIC-NEXT:    movq %rdi, %rbx
+; X64-PIC-NEXT:    movq $-1, %r14
+; X64-PIC-NEXT:    sarq $63, %rax
+; X64-PIC-NEXT:    shlq $47, %rax
+; X64-PIC-NEXT:    orq %rax, %rsp
+; X64-PIC-NEXT:    leaq {{.*}}(%rip), %rbp
+; X64-PIC-NEXT:    callq f@PLT
+; X64-PIC-NEXT:  .Ltmp2:
+; X64-PIC-NEXT:    movq %rsp, %rax
+; X64-PIC-NEXT:    sarq $63, %rax
+; X64-PIC-NEXT:    leaq {{.*}}(%rip), %rcx
+; X64-PIC-NEXT:    cmpq %rcx, %rbp
+; X64-PIC-NEXT:    cmovneq %r14, %rax
+; X64-PIC-NEXT:    movl (%rbx), %ebp
+; X64-PIC-NEXT:    shlq $47, %rax
+; X64-PIC-NEXT:    orq %rax, %rsp
+; X64-PIC-NEXT:    leaq {{.*}}(%rip), %r15
+; X64-PIC-NEXT:    callq f@PLT
+; X64-PIC-NEXT:  .Ltmp3:
+; X64-PIC-NEXT:    movq %rsp, %rcx
+; X64-PIC-NEXT:    sarq $63, %rcx
+; X64-PIC-NEXT:    leaq {{.*}}(%rip), %rax
+; X64-PIC-NEXT:    cmpq %rax, %r15
+; X64-PIC-NEXT:    cmovneq %r14, %rcx
+; X64-PIC-NEXT:    addl (%rbx), %ebp
+; X64-PIC-NEXT:    orl %ecx, %ebp
+; X64-PIC-NEXT:    shlq $47, %rcx
+; X64-PIC-NEXT:    movl %ebp, %eax
+; X64-PIC-NEXT:    orq %rcx, %rsp
+; X64-PIC-NEXT:    addq $8, %rsp
+; X64-PIC-NEXT:    popq %rbx
+; X64-PIC-NEXT:    popq %r14
+; X64-PIC-NEXT:    popq %r15
+; X64-PIC-NEXT:    popq %rbp
+; X64-PIC-NEXT:    retq
+entry:
+  call void @f()
+  %x = load i32, i32* %ptr
+  call void @f()
+  %y = load i32, i32* %ptr
+  %z = add i32 %x, %y
+  ret i32 %z
+}
Index: llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
===================================================================
--- llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
+++ llvm/test/CodeGen/X86/speculative-load-hardening-indirect.ll
@@ -16,38 +16,46 @@
 define i32 @test_indirect_call(i32 ()** %ptr) nounwind {
 ; X64-LABEL: test_indirect_call:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    pushq %rax
+; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    movq $-1, %rbx
 ; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    movq (%rdi), %rcx
 ; X64-NEXT:    orq %rax, %rcx
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq *%rcx
+; X64-NEXT:  .Ltmp0:
 ; X64-NEXT:    movq %rsp, %rcx
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
 ; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    cmpq $.Ltmp0, %rdx
+; X64-NEXT:    cmovneq %rbx, %rcx
 ; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    orq %rcx, %rsp
-; X64-NEXT:    popq %rcx
+; X64-NEXT:    popq %rbx
 ; X64-NEXT:    retq
 ;
 ; X64-RETPOLINE-LABEL: test_indirect_call:
 ; X64-RETPOLINE:       # %bb.0: # %entry
-; X64-RETPOLINE-NEXT:    pushq %rax
+; X64-RETPOLINE-NEXT:    pushq %rbx
 ; X64-RETPOLINE-NEXT:    movq %rsp, %rax
-; X64-RETPOLINE-NEXT:    movq $-1, %rcx
+; X64-RETPOLINE-NEXT:    movq $-1, %rbx
 ; X64-RETPOLINE-NEXT:    sarq $63, %rax
 ; X64-RETPOLINE-NEXT:    movq (%rdi), %r11
 ; X64-RETPOLINE-NEXT:    orq %rax, %r11
 ; X64-RETPOLINE-NEXT:    shlq $47, %rax
 ; X64-RETPOLINE-NEXT:    orq %rax, %rsp
 ; X64-RETPOLINE-NEXT:    callq __llvm_retpoline_r11
+; X64-RETPOLINE-NEXT:  .Ltmp0:
 ; X64-RETPOLINE-NEXT:    movq %rsp, %rcx
+; X64-RETPOLINE-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
 ; X64-RETPOLINE-NEXT:    sarq $63, %rcx
+; X64-RETPOLINE-NEXT:    cmpq $.Ltmp0, %rdx
+; X64-RETPOLINE-NEXT:    cmovneq %rbx, %rcx
 ; X64-RETPOLINE-NEXT:    shlq $47, %rcx
 ; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
-; X64-RETPOLINE-NEXT:    popq %rcx
+; X64-RETPOLINE-NEXT:    popq %rbx
 ; X64-RETPOLINE-NEXT:    retq
 entry:
   %fp = load i32 ()*, i32 ()** %ptr
@@ -86,37 +94,45 @@
 define i32 @test_indirect_call_global() nounwind {
 ; X64-LABEL: test_indirect_call_global:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    pushq %rax
+; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    movq $-1, %rbx
 ; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    movq {{.*}}(%rip), %rcx
 ; X64-NEXT:    orq %rax, %rcx
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq *%rcx
+; X64-NEXT:  .Ltmp1:
 ; X64-NEXT:    movq %rsp, %rcx
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
 ; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    cmpq $.Ltmp1, %rdx
+; X64-NEXT:    cmovneq %rbx, %rcx
 ; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    orq %rcx, %rsp
-; X64-NEXT:    popq %rcx
+; X64-NEXT:    popq %rbx
 ; X64-NEXT:    retq
 ;
 ; X64-RETPOLINE-LABEL: test_indirect_call_global:
 ; X64-RETPOLINE:       # %bb.0: # %entry
-; X64-RETPOLINE-NEXT:    pushq %rax
+; X64-RETPOLINE-NEXT:    pushq %rbx
 ; X64-RETPOLINE-NEXT:    movq %rsp, %rax
-; X64-RETPOLINE-NEXT:    movq $-1, %rcx
+; X64-RETPOLINE-NEXT:    movq $-1, %rbx
 ; X64-RETPOLINE-NEXT:    sarq $63, %rax
 ; X64-RETPOLINE-NEXT:    movq {{.*}}(%rip), %r11
 ; X64-RETPOLINE-NEXT:    shlq $47, %rax
 ; X64-RETPOLINE-NEXT:    orq %rax, %rsp
 ; X64-RETPOLINE-NEXT:    callq __llvm_retpoline_r11
+; X64-RETPOLINE-NEXT:  .Ltmp1:
 ; X64-RETPOLINE-NEXT:    movq %rsp, %rcx
+; X64-RETPOLINE-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
 ; X64-RETPOLINE-NEXT:    sarq $63, %rcx
+; X64-RETPOLINE-NEXT:    cmpq $.Ltmp1, %rdx
+; X64-RETPOLINE-NEXT:    cmovneq %rbx, %rcx
 ; X64-RETPOLINE-NEXT:    shlq $47, %rcx
 ; X64-RETPOLINE-NEXT:    orq %rcx, %rsp
-; X64-RETPOLINE-NEXT:    popq %rcx
+; X64-RETPOLINE-NEXT:    popq %rbx
 ; X64-RETPOLINE-NEXT:    retq
 entry:
   %fp = load i32 ()*, i32 ()** @global_fnptr
@@ -205,19 +221,19 @@
 ; X64-NEXT:    movq global_blockaddrs(,%rax,8), %rax
 ; X64-NEXT:    orq %rcx, %rax
 ; X64-NEXT:    jmpq *%rax
-; X64-NEXT:  .Ltmp0: # Block address taken
+; X64-NEXT:  .Ltmp2: # Block address taken
 ; X64-NEXT:  .LBB5_1: # %bb0
 ; X64-NEXT:    movl $2, %eax
 ; X64-NEXT:    jmp .LBB5_2
-; X64-NEXT:  .Ltmp1: # Block address taken
+; X64-NEXT:  .Ltmp3: # Block address taken
 ; X64-NEXT:  .LBB5_4: # %bb2
 ; X64-NEXT:    movl $13, %eax
 ; X64-NEXT:    jmp .LBB5_2
-; X64-NEXT:  .Ltmp2: # Block address taken
+; X64-NEXT:  .Ltmp4: # Block address taken
 ; X64-NEXT:  .LBB5_5: # %bb3
 ; X64-NEXT:    movl $42, %eax
 ; X64-NEXT:    jmp .LBB5_2
-; X64-NEXT:  .Ltmp3: # Block address taken
+; X64-NEXT:  .Ltmp5: # Block address taken
 ; X64-NEXT:  .LBB5_3: # %bb1
 ; X64-NEXT:    movl $7, %eax
 ; X64-NEXT:  .LBB5_2: # %bb0
@@ -243,22 +259,22 @@
 ; X64-RETPOLINE-NEXT:    cmoveq %rax, %rcx
 ; X64-RETPOLINE-NEXT:    cmpq $4, %rdx
 ; X64-RETPOLINE-NEXT:    jne .LBB6_3
-; X64-RETPOLINE-NEXT:  .Ltmp0: # Block address taken
+; X64-RETPOLINE-NEXT:  .Ltmp2: # Block address taken
 ; X64-RETPOLINE-NEXT:  # %bb.7: # %bb3
 ; X64-RETPOLINE-NEXT:    cmovneq %rax, %rcx
 ; X64-RETPOLINE-NEXT:    movl $42, %eax
 ; X64-RETPOLINE-NEXT:    jmp .LBB6_4
-; X64-RETPOLINE-NEXT:  .Ltmp1: # Block address taken
+; X64-RETPOLINE-NEXT:  .Ltmp3: # Block address taken
 ; X64-RETPOLINE-NEXT:  .LBB6_5: # %bb1
 ; X64-RETPOLINE-NEXT:    cmovneq %rax, %rcx
 ; X64-RETPOLINE-NEXT:    movl $7, %eax
 ; X64-RETPOLINE-NEXT:    jmp .LBB6_4
-; X64-RETPOLINE-NEXT:  .Ltmp2: # Block address taken
+; X64-RETPOLINE-NEXT:  .Ltmp4: # Block address taken
 ; X64-RETPOLINE-NEXT:  .LBB6_6: # %bb2
 ; X64-RETPOLINE-NEXT:    cmovneq %rax, %rcx
 ; X64-RETPOLINE-NEXT:    movl $13, %eax
 ; X64-RETPOLINE-NEXT:    jmp .LBB6_4
-; X64-RETPOLINE-NEXT:  .Ltmp3: # Block address taken
+; X64-RETPOLINE-NEXT:  .Ltmp5: # Block address taken
 ; X64-RETPOLINE-NEXT:  .LBB6_3: # %bb0
 ; X64-RETPOLINE-NEXT:    cmoveq %rax, %rcx
 ; X64-RETPOLINE-NEXT:    movl $2, %eax
Index: llvm/test/CodeGen/X86/speculative-load-hardening.ll
===================================================================
--- llvm/test/CodeGen/X86/speculative-load-hardening.ll
+++ llvm/test/CodeGen/X86/speculative-load-hardening.ll
@@ -64,7 +64,7 @@
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB1_4: # %then2
 ; X64-NEXT:    .cfi_def_cfa_offset 32
-; X64-NEXT:    movq %r8, %r15
+; X64-NEXT:    movq %r8, %r14
 ; X64-NEXT:    cmovneq %rbx, %rax
 ; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    je .LBB1_6
@@ -72,30 +72,34 @@
 ; X64-NEXT:    cmoveq %rbx, %rax
 ; X64-NEXT:    movslq (%r9), %rcx
 ; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    leaq (%r15,%rcx,4), %r14
-; X64-NEXT:    movl %ecx, (%r15,%rcx,4)
+; X64-NEXT:    leaq (%r14,%rcx,4), %r15
+; X64-NEXT:    movl %ecx, (%r14,%rcx,4)
 ; X64-NEXT:    jmp .LBB1_7
 ; X64-NEXT:  .LBB1_6: # %then3
 ; X64-NEXT:    cmovneq %rbx, %rax
 ; X64-NEXT:    movl (%rcx), %ecx
-; X64-NEXT:    addl (%r15), %ecx
+; X64-NEXT:    addl (%r14), %ecx
 ; X64-NEXT:    movslq %ecx, %rdi
 ; X64-NEXT:    orq %rax, %rdi
-; X64-NEXT:    movl (%r15,%rdi,4), %esi
+; X64-NEXT:    movl (%r14,%rdi,4), %esi
 ; X64-NEXT:    orl %eax, %esi
-; X64-NEXT:    movq (%r9), %r14
-; X64-NEXT:    orq %rax, %r14
-; X64-NEXT:    addl (%r14), %esi
+; X64-NEXT:    movq (%r9), %r15
+; X64-NEXT:    orq %rax, %r15
+; X64-NEXT:    addl (%r15), %esi
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    # kill: def $edi killed $edi killed $rdi
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq leak
+; X64-NEXT:  .Ltmp0:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp0, %rcx
+; X64-NEXT:    cmovneq %rbx, %rax
 ; X64-NEXT:  .LBB1_7: # %merge
-; X64-NEXT:    movslq (%r14), %rcx
+; X64-NEXT:    movslq (%r15), %rcx
 ; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    movl $0, (%r15,%rcx,4)
+; X64-NEXT:    movl $0, (%r14,%rcx,4)
 ; X64-NEXT:    jmp .LBB1_8
 ;
 ; X64-LFENCE-LABEL: test_basic_conditions:
@@ -225,8 +229,12 @@
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink
+; X64-NEXT:  .Ltmp1:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp1, %rcx
+; X64-NEXT:    cmovneq %r15, %rax
 ; X64-NEXT:    incl %ebx
 ; X64-NEXT:    cmpl %ebp, %ebx
 ; X64-NEXT:    jl .LBB2_6
@@ -304,19 +312,19 @@
 ; X64-NEXT:    pushq %rbx
 ; X64-NEXT:    pushq %rax
 ; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    movq $-1, %r12
+; X64-NEXT:    movq $-1, %rbp
 ; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    testl %edi, %edi
 ; X64-NEXT:    je .LBB3_2
 ; X64-NEXT:  # %bb.1:
-; X64-NEXT:    cmoveq %r12, %rax
+; X64-NEXT:    cmoveq %rbp, %rax
 ; X64-NEXT:    jmp .LBB3_10
 ; X64-NEXT:  .LBB3_2: # %l1.header.preheader
 ; X64-NEXT:    movq %r8, %r14
 ; X64-NEXT:    movq %rcx, %rbx
-; X64-NEXT:    movl %edx, %ebp
+; X64-NEXT:    movl %edx, %r12d
 ; X64-NEXT:    movl %esi, %r15d
-; X64-NEXT:    cmovneq %r12, %rax
+; X64-NEXT:    cmovneq %rbp, %rax
 ; X64-NEXT:    xorl %r13d, %r13d
 ; X64-NEXT:    movl %esi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
 ; X64-NEXT:    testl %r15d, %r15d
@@ -324,16 +332,16 @@
 ; X64-NEXT:    jmp .LBB3_4
 ; X64-NEXT:    .p2align 4, 0x90
 ; X64-NEXT:  .LBB3_12:
-; X64-NEXT:    cmovgeq %r12, %rax
+; X64-NEXT:    cmovgeq %rbp, %rax
 ; X64-NEXT:    testl %r15d, %r15d
 ; X64-NEXT:    jle .LBB3_4
 ; X64-NEXT:  .LBB3_5: # %l2.header.preheader
-; X64-NEXT:    cmovleq %r12, %rax
+; X64-NEXT:    cmovleq %rbp, %rax
 ; X64-NEXT:    xorl %r15d, %r15d
 ; X64-NEXT:    jmp .LBB3_6
 ; X64-NEXT:    .p2align 4, 0x90
 ; X64-NEXT:  .LBB3_11: # in Loop: Header=BB3_6 Depth=1
-; X64-NEXT:    cmovgeq %r12, %rax
+; X64-NEXT:    cmovgeq %rbp, %rax
 ; X64-NEXT:  .LBB3_6: # %l2.header
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
 ; X64-NEXT:    movslq (%rbx), %rcx
@@ -344,18 +352,22 @@
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink
+; X64-NEXT:  .Ltmp2:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp2, %rcx
+; X64-NEXT:    cmovneq %rbp, %rax
 ; X64-NEXT:    incl %r15d
-; X64-NEXT:    cmpl %ebp, %r15d
+; X64-NEXT:    cmpl %r12d, %r15d
 ; X64-NEXT:    jl .LBB3_11
 ; X64-NEXT:  # %bb.7:
-; X64-NEXT:    cmovlq %r12, %rax
+; X64-NEXT:    cmovlq %rbp, %rax
 ; X64-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 4-byte Reload
 ; X64-NEXT:    jmp .LBB3_8
 ; X64-NEXT:    .p2align 4, 0x90
 ; X64-NEXT:  .LBB3_4:
-; X64-NEXT:    cmovgq %r12, %rax
+; X64-NEXT:    cmovgq %rbp, %rax
 ; X64-NEXT:  .LBB3_8: # %l1.latch
 ; X64-NEXT:    movslq (%rbx), %rcx
 ; X64-NEXT:    orq %rax, %rcx
@@ -365,13 +377,17 @@
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink
+; X64-NEXT:  .Ltmp3:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp3, %rcx
+; X64-NEXT:    cmovneq %rbp, %rax
 ; X64-NEXT:    incl %r13d
 ; X64-NEXT:    cmpl %r15d, %r13d
 ; X64-NEXT:    jl .LBB3_12
 ; X64-NEXT:  # %bb.9:
-; X64-NEXT:    cmovlq %r12, %rax
+; X64-NEXT:    cmovlq %rbp, %rax
 ; X64-NEXT:  .LBB3_10: # %exit
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
@@ -486,22 +502,27 @@
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    pushq %rbp
 ; X64-NEXT:    .cfi_def_cfa_offset 16
-; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r15
 ; X64-NEXT:    .cfi_def_cfa_offset 24
-; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %r14
 ; X64-NEXT:    .cfi_def_cfa_offset 32
-; X64-NEXT:    .cfi_offset %rbx, -32
-; X64-NEXT:    .cfi_offset %r14, -24
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    .cfi_def_cfa_offset 40
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    .cfi_def_cfa_offset 48
+; X64-NEXT:    .cfi_offset %rbx, -40
+; X64-NEXT:    .cfi_offset %r14, -32
+; X64-NEXT:    .cfi_offset %r15, -24
 ; X64-NEXT:    .cfi_offset %rbp, -16
 ; X64-NEXT:    movq %rsp, %rax
-; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    movq $-1, %r15
 ; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    cmpl $41, %edi
 ; X64-NEXT:    jg .LBB4_1
 ; X64-NEXT:  # %bb.2: # %thrower
 ; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rsi, %rbx
-; X64-NEXT:    cmovgq %rcx, %rax
+; X64-NEXT:    cmovgq %r15, %rax
 ; X64-NEXT:    movslq %edi, %rcx
 ; X64-NEXT:    movl (%rsi,%rcx,4), %ebp
 ; X64-NEXT:    orl %eax, %ebp
@@ -509,35 +530,47 @@
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq __cxa_allocate_exception
+; X64-NEXT:  .Ltmp7:
 ; X64-NEXT:    movq %rsp, %rcx
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rdx
 ; X64-NEXT:    sarq $63, %rcx
+; X64-NEXT:    cmpq $.Ltmp7, %rdx
+; X64-NEXT:    cmovneq %r15, %rcx
 ; X64-NEXT:    movl %ebp, (%rax)
-; X64-NEXT:  .Ltmp0:
+; X64-NEXT:  .Ltmp4:
 ; X64-NEXT:    xorl %esi, %esi
 ; X64-NEXT:    xorl %edx, %edx
 ; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    movq %rax, %rdi
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    callq __cxa_throw
+; X64-NEXT:  .Ltmp8:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
-; X64-NEXT:  .Ltmp1:
+; X64-NEXT:    cmpq $.Ltmp8, %rcx
+; X64-NEXT:    cmovneq %r15, %rax
+; X64-NEXT:  .Ltmp5:
 ; X64-NEXT:    jmp .LBB4_3
 ; X64-NEXT:  .LBB4_1:
-; X64-NEXT:    cmovleq %rcx, %rax
+; X64-NEXT:    cmovleq %r15, %rax
 ; X64-NEXT:  .LBB4_3: # %exit
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    .cfi_def_cfa_offset 40
 ; X64-NEXT:    popq %rbx
-; X64-NEXT:    .cfi_def_cfa_offset 24
+; X64-NEXT:    .cfi_def_cfa_offset 32
 ; X64-NEXT:    popq %r14
+; X64-NEXT:    .cfi_def_cfa_offset 24
+; X64-NEXT:    popq %r15
 ; X64-NEXT:    .cfi_def_cfa_offset 16
 ; X64-NEXT:    popq %rbp
 ; X64-NEXT:    .cfi_def_cfa_offset 8
 ; X64-NEXT:    retq
 ; X64-NEXT:  .LBB4_4: # %lpad
-; X64-NEXT:    .cfi_def_cfa_offset 32
-; X64-NEXT:  .Ltmp2:
+; X64-NEXT:    .cfi_def_cfa_offset 48
+; X64-NEXT:  .Ltmp6:
 ; X64-NEXT:    movq %rsp, %rcx
 ; X64-NEXT:    sarq $63, %rcx
 ; X64-NEXT:    movl (%rax), %eax
@@ -549,8 +582,12 @@
 ; X64-NEXT:    shlq $47, %rcx
 ; X64-NEXT:    orq %rcx, %rsp
 ; X64-NEXT:    callq sink
+; X64-NEXT:  .Ltmp9:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp9, %rcx
+; X64-NEXT:    cmovneq %r15, %rax
 ;
 ; X64-LFENCE-LABEL: test_basic_eh:
 ; X64-LFENCE:       # %bb.0: # %entry
@@ -636,79 +673,111 @@
 ; X64:       # %bb.0: # %entry
 ; X64-NEXT:    pushq %r15
 ; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    pushq %rax
 ; X64-NEXT:    movq %rsp, %rax
 ; X64-NEXT:    movq %rcx, %r15
 ; X64-NEXT:    movq %rdx, %r14
 ; X64-NEXT:    movq %rsi, %rbx
 ; X64-NEXT:    movq %rdi, %r12
-; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    movq $-1, %r13
 ; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    orq %rax, %r12
 ; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_float
+; X64-NEXT:  .Ltmp10:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp10, %rcx
+; X64-NEXT:    cmovneq %r13, %rax
 ; X64-NEXT:    orq %rax, %rbx
 ; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_double
+; X64-NEXT:  .Ltmp11:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp11, %rcx
+; X64-NEXT:    cmovneq %r13, %rax
 ; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
 ; X64-NEXT:    cvtsd2ss %xmm0, %xmm0
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_float
+; X64-NEXT:  .Ltmp12:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp12, %rcx
+; X64-NEXT:    cmovneq %r13, %rax
 ; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    cvtss2sd %xmm0, %xmm0
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_double
+; X64-NEXT:  .Ltmp13:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp13, %rcx
+; X64-NEXT:    cmovneq %r13, %rax
 ; X64-NEXT:    orq %rax, %r14
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    cvtsi2ssl (%r14), %xmm0
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_float
+; X64-NEXT:  .Ltmp14:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp14, %rcx
+; X64-NEXT:    cmovneq %r13, %rax
 ; X64-NEXT:    orq %rax, %r15
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    cvtsi2sdq (%r15), %xmm0
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_double
+; X64-NEXT:  .Ltmp15:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp15, %rcx
+; X64-NEXT:    cmovneq %r13, %rax
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    cvtsi2ssq (%r15), %xmm0
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_float
+; X64-NEXT:  .Ltmp16:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp16, %rcx
+; X64-NEXT:    cmovneq %r13, %rax
 ; X64-NEXT:    xorps %xmm0, %xmm0
 ; X64-NEXT:    cvtsi2sdl (%r14), %xmm0
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_double
+; X64-NEXT:  .Ltmp17:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp17, %rcx
+; X64-NEXT:    cmovneq %r13, %rax
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
-; X64-NEXT:    addq $8, %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
 ; X64-NEXT:    popq %r14
 ; X64-NEXT:    popq %r15
 ; X64-NEXT:    retq
@@ -789,68 +858,96 @@
 define void @test_vec_loads(<4 x float>* %v4f32ptr, <2 x double>* %v2f64ptr, <16 x i8>* %v16i8ptr, <8 x i16>* %v8i16ptr, <4 x i32>* %v4i32ptr, <2 x i64>* %v2i64ptr) nounwind {
 ; X64-LABEL: test_vec_loads:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %rbp
 ; X64-NEXT:    pushq %r15
 ; X64-NEXT:    pushq %r14
 ; X64-NEXT:    pushq %r13
 ; X64-NEXT:    pushq %r12
 ; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %rax
 ; X64-NEXT:    movq %rsp, %rax
 ; X64-NEXT:    movq %r9, %r14
 ; X64-NEXT:    movq %r8, %r15
 ; X64-NEXT:    movq %rcx, %r12
 ; X64-NEXT:    movq %rdx, %r13
 ; X64-NEXT:    movq %rsi, %rbx
-; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    movq $-1, %rbp
 ; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    orq %rax, %rdi
 ; X64-NEXT:    movaps (%rdi), %xmm0
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_v4f32
+; X64-NEXT:  .Ltmp18:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp18, %rcx
+; X64-NEXT:    cmovneq %rbp, %rax
 ; X64-NEXT:    orq %rax, %rbx
 ; X64-NEXT:    movaps (%rbx), %xmm0
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_v2f64
+; X64-NEXT:  .Ltmp19:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp19, %rcx
+; X64-NEXT:    cmovneq %rbp, %rax
 ; X64-NEXT:    orq %rax, %r13
 ; X64-NEXT:    movaps (%r13), %xmm0
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_v16i8
+; X64-NEXT:  .Ltmp20:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp20, %rcx
+; X64-NEXT:    cmovneq %rbp, %rax
 ; X64-NEXT:    orq %rax, %r12
 ; X64-NEXT:    movaps (%r12), %xmm0
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_v8i16
+; X64-NEXT:  .Ltmp21:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp21, %rcx
+; X64-NEXT:    cmovneq %rbp, %rax
 ; X64-NEXT:    orq %rax, %r15
 ; X64-NEXT:    movaps (%r15), %xmm0
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_v4i32
+; X64-NEXT:  .Ltmp22:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp22, %rcx
+; X64-NEXT:    cmovneq %rbp, %rax
 ; X64-NEXT:    orq %rax, %r14
 ; X64-NEXT:    movaps (%r14), %xmm0
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink_v2i64
+; X64-NEXT:  .Ltmp23:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp23, %rcx
+; X64-NEXT:    cmovneq %rbp, %rax
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    addq $8, %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13
 ; X64-NEXT:    popq %r14
 ; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
 ;
 ; X64-LFENCE-LABEL: test_vec_loads:
@@ -902,13 +999,13 @@
 define void @test_deferred_hardening(i32* %ptr1, i32* %ptr2, i32 %x) nounwind {
 ; X64-LABEL: test_deferred_hardening:
 ; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %r15
 ; X64-NEXT:    pushq %r14
 ; X64-NEXT:    pushq %rbx
-; X64-NEXT:    pushq %rax
 ; X64-NEXT:    movq %rsp, %rax
 ; X64-NEXT:    movq %rsi, %r14
 ; X64-NEXT:    movq %rdi, %rbx
-; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    movq $-1, %r15
 ; X64-NEXT:    sarq $63, %rax
 ; X64-NEXT:    movl (%rdi), %edi
 ; X64-NEXT:    incl %edi
@@ -917,8 +1014,12 @@
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink
+; X64-NEXT:  .Ltmp24:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp24, %rcx
+; X64-NEXT:    cmovneq %r15, %rax
 ; X64-NEXT:    movl (%rbx), %ecx
 ; X64-NEXT:    movl (%r14), %edx
 ; X64-NEXT:    leal 1(%rcx,%rdx), %edi
@@ -926,16 +1027,24 @@
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink
+; X64-NEXT:  .Ltmp25:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp25, %rcx
+; X64-NEXT:    cmovneq %r15, %rax
 ; X64-NEXT:    movl (%rbx), %edi
 ; X64-NEXT:    shll $7, %edi
 ; X64-NEXT:    orl %eax, %edi
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink
+; X64-NEXT:  .Ltmp26:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp26, %rcx
+; X64-NEXT:    cmovneq %r15, %rax
 ; X64-NEXT:    movzwl (%rbx), %ecx
 ; X64-NEXT:    sarw $7, %cx
 ; X64-NEXT:    movzwl %cx, %edi
@@ -944,8 +1053,12 @@
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink
+; X64-NEXT:  .Ltmp27:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp27, %rcx
+; X64-NEXT:    cmovneq %r15, %rax
 ; X64-NEXT:    movzwl (%rbx), %ecx
 ; X64-NEXT:    rolw $9, %cx
 ; X64-NEXT:    movswl %cx, %edi
@@ -954,13 +1067,17 @@
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
 ; X64-NEXT:    callq sink
+; X64-NEXT:  .Ltmp28:
 ; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq -{{[0-9]+}}(%rsp), %rcx
 ; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    cmpq $.Ltmp28, %rcx
+; X64-NEXT:    cmovneq %r15, %rax
 ; X64-NEXT:    shlq $47, %rax
 ; X64-NEXT:    orq %rax, %rsp
-; X64-NEXT:    addq $8, %rsp
 ; X64-NEXT:    popq %rbx
 ; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
 ; X64-NEXT:    retq
 ;
 ; X64-LFENCE-LABEL: test_deferred_hardening: