Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -965,6 +965,11 @@
   if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
     return true;
   switch (MI.getOpcode()) {
+  case AArch64::HINT:
+    // CSDB hints are scheduling barriers.
+    if (MI.getOperand(0).getImm() == 0x14)
+      return true;
+    break;
   case AArch64::DSB:
   case AArch64::ISB:
     // DSB and ISB also are scheduling barriers.
Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
@@ -520,6 +520,14 @@
                    [(set GPR64:$Rd, (int_aarch64_space imm:$size, GPR64:$Rn))]>,
             Sched<[]>;
 
+let hasSideEffects = 1, isCodeGenOnly = 1 in {
+  def SpeculationSafeValueX
+      : Pseudo<(outs GPR64:$dst), (ins GPR64:$src), []>, Sched<[]>;
+  def SpeculationSafeValueW
+      : Pseudo<(outs GPR32:$dst), (ins GPR32:$src), []>, Sched<[]>;
+}
+
+
 //===----------------------------------------------------------------------===//
 // System instructions.
 //===----------------------------------------------------------------------===//
Index: llvm/trunk/lib/Target/AArch64/AArch64SpeculationHardening.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64SpeculationHardening.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64SpeculationHardening.cpp
@@ -16,17 +16,15 @@
 //   execution".
 // This pass is aimed at mitigating against SpectreV1-style vulnarabilities.
 //
-// At the moment, it implements the tracking of miss-speculation of control
-// flow into a taint register, but doesn't implement a mechanism yet to then
-// use that taint register to mask of vulnerable data in registers (something
-// for a follow-on improvement). Possible strategies to mask out vulnerable
-// data that can be implemented on top of this are:
-// - speculative load hardening to automatically mask of data loaded
-//   in registers.
-// - using intrinsics to mask of data in registers as indicated by the
-//   programmer (see https://lwn.net/Articles/759423/).
+// It also implements speculative load hardening, i.e. using the taint register
+// to automatically mask off loaded data.
 //
-// For AArch64, the following implementation choices are made below.
+// As a possible follow-on improvement, an intrinsics-based approach, as
+// explained at https://lwn.net/Articles/759423/, could also be implemented on
+// top of the current design.
+//
+// For AArch64, the following implementation choices are made to implement the
+// tracking of control flow miss-speculation into a taint register:
 // Some of these are different than the implementation choices made in
 // the similar pass implemented in X86SpeculativeLoadHardening.cpp, as
 // the instruction set characteristics result in different trade-offs.
@@ -65,6 +63,24 @@
 // - On function call boundaries, the miss-speculation state is transferred from
 //   the taint register X16 to be encoded in the SP register as value 0.
 //
+// For the aspect of automatically hardening loads, using the taint register,
+// (a.k.a. speculative load hardening, see
+// https://llvm.org/docs/SpeculativeLoadHardening.html), the following
+// implementation choices are made for AArch64:
+// - Many of the optimizations described at
+//   https://llvm.org/docs/SpeculativeLoadHardening.html to harden fewer
+//   loads haven't been implemented yet - but for some of them there are
+//   FIXMEs in the code.
+// - loads that load into general purpose (X or W) registers get hardened by
+//   masking the loaded data. For loads that load into other registers, the
+//   address loaded from gets hardened. It is expected that hardening the
+//   loaded data may be more efficient; but masking data in registers other
+//   than X or W is not easy and may result in being slower than just
+//   hardening the X address register loaded from.
+// - On AArch64, CSDB instructions are inserted between the masking of the
+//   register and its first use, to ensure there's no non-control-flow
+//   speculation that might undermine the hardening mechanism.
+//
 // Future extensions/improvements could be:
 // - Implement this functionality using full speculation barriers, akin to the
 //   x86-slh-lfence option. This may be more useful for the intrinsics-based
@@ -99,6 +115,10 @@
 
 #define AARCH64_SPECULATION_HARDENING_NAME "AArch64 speculation hardening pass"
 
+cl::opt<bool> HardenLoads("aarch64-slh-loads", cl::Hidden,
+                          cl::desc("Sanitize loads from memory."),
+                          cl::init(true));
+
 namespace {
 
 class AArch64SpeculationHardening : public MachineFunctionPass {
@@ -120,7 +140,10 @@
 
 private:
   unsigned MisspeculatingTaintReg;
+  unsigned MisspeculatingTaintReg32Bit;
   bool UseControlFlowSpeculationBarrier;
+  BitVector RegsNeedingCSDBBeforeUse;
+  BitVector RegsAlreadyMasked;
 
   bool functionUsesHardeningRegister(MachineFunction &MF) const;
   bool instrumentControlFlow(MachineBasicBlock &MBB);
@@ -134,6 +157,16 @@
   void insertRegToSPTaintPropagation(MachineBasicBlock *MBB,
                                      MachineBasicBlock::iterator MBBI,
                                      unsigned TmpReg) const;
+
+  bool slhLoads(MachineBasicBlock &MBB);
+  bool makeGPRSpeculationSafe(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MBBI,
+                              MachineInstr &MI, unsigned Reg);
+  bool lowerSpeculationSafeValuePseudos(MachineBasicBlock &MBB);
+  bool expandSpeculationSafeValue(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MBBI);
+  bool insertCSDB(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                  DebugLoc DL);
 };
 
 } // end anonymous namespace
@@ -330,23 +363,257 @@
   return false;
 }
 
+// Make GPR register Reg speculation-safe by putting it through the
+// SpeculationSafeValue pseudo instruction, if we can't prove that
+// the value in the register has already been hardened.
+bool AArch64SpeculationHardening::makeGPRSpeculationSafe(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineInstr &MI,
+    unsigned Reg) {
+  assert(AArch64::GPR32allRegClass.contains(Reg) ||
+         AArch64::GPR64allRegClass.contains(Reg));
+
+  // Loads cannot directly load a value into the SP (nor WSP).
+  // Therefore, if Reg is SP or WSP, it is because the instruction loads from
+  // the stack through the stack pointer.
+  //
+  // Since the stack pointer is never dynamically controllable, don't harden it.
+  if (Reg == AArch64::SP || Reg == AArch64::WSP)
+    return false;
+
+  // Do not harden the register again if already hardened before.
+  if (RegsAlreadyMasked[Reg])
+    return false;
+
+  const bool Is64Bit = AArch64::GPR64allRegClass.contains(Reg);
+  LLVM_DEBUG(dbgs() << "About to harden register : " << Reg << "\n");
+  BuildMI(MBB, MBBI, MI.getDebugLoc(),
+          TII->get(Is64Bit ? AArch64::SpeculationSafeValueX
+                           : AArch64::SpeculationSafeValueW))
+      .addDef(Reg)
+      .addUse(Reg);
+  RegsAlreadyMasked.set(Reg);
+  return true;
+}
+
+bool AArch64SpeculationHardening::slhLoads(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  LLVM_DEBUG(dbgs() << "slhLoads running on MBB: " << MBB);
+
+  RegsAlreadyMasked.reset();
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  MachineBasicBlock::iterator NextMBBI;
+  for (; MBBI != E; MBBI = NextMBBI) {
+    MachineInstr &MI = *MBBI;
+    NextMBBI = std::next(MBBI);
+    // Only harden loaded values or addresses used in loads.
+    if (!MI.mayLoad())
+      continue;
+
+    LLVM_DEBUG(dbgs() << "About to harden: " << MI);
+
+    // For general purpose register loads, harden the registers loaded into.
+    // For other loads, harden the address loaded from.
+    // Masking the loaded value is expected to result in less performance
+    // overhead, as the load can still execute speculatively in comparison to
+    // when the address loaded from gets masked. However, masking is only
+    // easy to do efficiently on GPR registers, so for loads into non-GPR
+    // registers (e.g. floating point loads), mask the address loaded from.
+    bool AllDefsAreGPR = llvm::all_of(MI.defs(), [&](MachineOperand &Op) {
+      return Op.isReg() && (AArch64::GPR32allRegClass.contains(Op.getReg()) ||
+                            AArch64::GPR64allRegClass.contains(Op.getReg()));
+    });
+    // FIXME: it might be a worthwhile optimization to not mask loaded
+    // values if all the registers involved in address calculation are already
+    // hardened, leading to this load not being able to execute on a
+    // miss-speculated path.
+    bool HardenLoadedData = AllDefsAreGPR;
+    bool HardenAddressLoadedFrom = !HardenLoadedData;
+
+    // First remove registers from RegsAlreadyMasked if their value is
+    // updated by this instruction - it makes them contain a new value that is
+    // not guaranteed to already have been masked.
+    for (MachineOperand Op : MI.defs())
+      for (MCRegAliasIterator AI(Op.getReg(), TRI, true); AI.isValid(); ++AI)
+        RegsAlreadyMasked.reset(*AI);
+
+    // FIXME: loads from the stack with an immediate offset from the stack
+    // pointer probably shouldn't be hardened, which could result in a
+    // significant optimization. See section "Don't check loads from
+    // compile-time constant stack offsets" in
+    // https://llvm.org/docs/SpeculativeLoadHardening.html
+
+    if (HardenLoadedData)
+      for (auto Def : MI.defs()) {
+        if (Def.isDead())
+          // Do not mask a register that is not used further.
+          continue;
+        // FIXME: For pre/post-increment addressing modes, the base register
+        // used in address calculation is also defined by this instruction.
+        // It might be a worthwhile optimization to not harden that
+        // base register increment/decrement when the increment/decrement is
+        // an immediate.
+        Modified |= makeGPRSpeculationSafe(MBB, NextMBBI, MI, Def.getReg());
+      }
+
+    if (HardenAddressLoadedFrom)
+      for (auto Use : MI.uses()) {
+        if (!Use.isReg())
+          continue;
+        unsigned Reg = Use.getReg();
+        // Some loads of floating point data have implicit defs/uses on a
+        // super register of that floating point data. Some examples:
+        //   $s0 = LDRSui $sp, 22, implicit-def $q0
+        //   $q0 = LD1i64 $q0, 1, renamable $x0
+        // We need to filter out these uses for non-GPR registers, which occur
+        // because the load partially fills a non-GPR register with the loaded
+        // data. Just skipping all non-GPR registers is safe (for now) as all
+        // AArch64 load instructions only use GPR registers to perform the
+        // address calculation. FIXME: However, that might change once we can
+        // produce SVE gather instructions.
+        if (!(AArch64::GPR32allRegClass.contains(Reg) ||
+              AArch64::GPR64allRegClass.contains(Reg)))
+          continue;
+        Modified |= makeGPRSpeculationSafe(MBB, MBBI, MI, Reg);
+      }
+  }
+  return Modified;
+}
+
+/// \brief If MBBI references a pseudo instruction that should be expanded
+/// here, do the expansion and return true. Otherwise return false.
+bool AArch64SpeculationHardening::expandSpeculationSafeValue(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+  bool Is64Bit = true;
+
+  switch (Opcode) {
+  default:
+    break;
+  case AArch64::SpeculationSafeValueW:
+    Is64Bit = false;
+    LLVM_FALLTHROUGH;
+  case AArch64::SpeculationSafeValueX:
+    // Just remove the SpeculationSafe pseudos if control flow
+    // miss-speculation isn't happening because we're already inserting
+    // barriers to guarantee that.
+    if (!UseControlFlowSpeculationBarrier) {
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      // Mark this register and all its aliasing registers as needing to be
+      // value speculation hardened before its next use, by using a CSDB
+      // barrier instruction.
+      for (MachineOperand Op : MI.defs())
+        for (MCRegAliasIterator AI(Op.getReg(), TRI, true); AI.isValid(); ++AI)
+          RegsNeedingCSDBBeforeUse.set(*AI);
+
+      // Mask off with taint state.
+      BuildMI(MBB, MBBI, MI.getDebugLoc(),
+              Is64Bit ? TII->get(AArch64::ANDXrs) : TII->get(AArch64::ANDWrs))
+          .addDef(DstReg)
+          .addUse(SrcReg, RegState::Kill)
+          .addUse(Is64Bit ? MisspeculatingTaintReg
+                          : MisspeculatingTaintReg32Bit)
+          .addImm(0);
+    }
+    MI.eraseFromParent();
+    return true;
+  }
+  return false;
+}
+
+bool AArch64SpeculationHardening::insertCSDB(MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator MBBI,
+                                             DebugLoc DL) {
+  assert(!UseControlFlowSpeculationBarrier && "No need to insert CSDBs when "
+                                              "control flow miss-speculation "
+                                              "is already blocked");
+  // Insert a data value speculation barrier (CSDB).
+  BuildMI(MBB, MBBI, DL, TII->get(AArch64::HINT)).addImm(0x14);
+  RegsNeedingCSDBBeforeUse.reset();
+  return true;
+}
+
+bool AArch64SpeculationHardening::lowerSpeculationSafeValuePseudos(
+    MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  RegsNeedingCSDBBeforeUse.reset();
+
+  // The following loop iterates over all instructions in the basic block,
+  // and performs 2 operations:
+  // 1. Insert a CSDB at this location if needed.
+  // 2. Expand the SpeculationSafeValue pseudo if the current instruction is
+  //    one.
+  //
+  // The insertion of the CSDB is done as late as possible (i.e. just before
+  // the use of a masked register), in the hope that that will reduce the
+  // total number of CSDBs in a block when there are multiple masked registers
+  // in the block.
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  DebugLoc DL;
+  while (MBBI != E) {
+    MachineInstr &MI = *MBBI;
+    DL = MI.getDebugLoc();
+    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+
+    // First check if a CSDB needs to be inserted due to earlier registers
+    // that were masked and that are used by the next instruction.
+    // Also emit the barrier on any potential control flow changes.
+    bool NeedToEmitBarrier = false;
+    if (RegsNeedingCSDBBeforeUse.any() && (MI.isCall() || MI.isTerminator()))
+      NeedToEmitBarrier = true;
+    if (!NeedToEmitBarrier)
+      for (MachineOperand Op : MI.uses())
+        if (Op.isReg() && RegsNeedingCSDBBeforeUse[Op.getReg()]) {
+          NeedToEmitBarrier = true;
+          break;
+        }
+
+    if (NeedToEmitBarrier)
+      Modified |= insertCSDB(MBB, MBBI, DL);
+
+    Modified |= expandSpeculationSafeValue(MBB, MBBI);
+
+    MBBI = NMBBI;
+  }
+
+  if (RegsNeedingCSDBBeforeUse.any())
+    Modified |= insertCSDB(MBB, MBBI, DL);
+
+  return Modified;
+}
+
 bool AArch64SpeculationHardening::runOnMachineFunction(MachineFunction &MF) {
   if (!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
     return false;
 
   MisspeculatingTaintReg = AArch64::X16;
+  MisspeculatingTaintReg32Bit = AArch64::W16;
   TII = MF.getSubtarget().getInstrInfo();
   TRI = MF.getSubtarget().getRegisterInfo();
+  RegsNeedingCSDBBeforeUse.resize(TRI->getNumRegs());
+  RegsAlreadyMasked.resize(TRI->getNumRegs());
+  UseControlFlowSpeculationBarrier = functionUsesHardeningRegister(MF);
+
   bool Modified = false;
 
-  UseControlFlowSpeculationBarrier = functionUsesHardeningRegister(MF);
+  // Step 1: Enable automatic insertion of SpeculationSafeValue.
+  if (HardenLoads) {
+    LLVM_DEBUG(
+        dbgs() << "***** AArch64SpeculationHardening - automatic insertion of "
+                  "SpeculationSafeValue intrinsics *****\n");
+    for (auto &MBB : MF)
+      Modified |= slhLoads(MBB);
+  }
 
-  // Instrument control flow speculation tracking, if requested.
+  // 2.a Add instrumentation code to function entry and exits.
   LLVM_DEBUG(
       dbgs()
       << "***** AArch64SpeculationHardening - track control flow *****\n");
 
-  // 1. Add instrumentation code to function entry and exits.
   SmallVector<MachineBasicBlock *, 2> EntryBlocks;
   EntryBlocks.push_back(&MF.front());
   for (const LandingPadInfo &LPI : MF.getLandingPads())
@@ -355,10 +622,16 @@
   insertSPToRegTaintPropagation(
       Entry, Entry->SkipPHIsLabelsAndDebug(Entry->begin()));
 
-  // 2. Add instrumentation code to every basic block.
+  // 2.b Add instrumentation code to every basic block.
   for (auto &MBB : MF)
     Modified |= instrumentControlFlow(MBB);
 
+  LLVM_DEBUG(dbgs() << "***** AArch64SpeculationHardening - Lowering "
+                       "SpeculationSafeValue Pseudos *****\n");
+  // Step 3: Lower SpeculationSafeValue pseudo instructions.
+  for (auto &MBB : MF)
+    Modified |= lowerSpeculationSafeValuePseudos(MBB);
+
   return Modified;
 }
Index: llvm/trunk/test/CodeGen/AArch64/speculation-hardening-loads.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/speculation-hardening-loads.ll
+++ llvm/trunk/test/CodeGen/AArch64/speculation-hardening-loads.ll
@@ -0,0 +1,157 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --dump-input-on-failure
+
+define i128 @ldp_single_csdb(i128* %p) speculative_load_hardening {
+entry:
+  %0 = load i128, i128* %p, align 16
+  ret i128 %0
+; CHECK-LABEL: ldp_single_csdb
+; CHECK: ldp x8, x1, [x0]
+; CHECK-NEXT: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: and x8, x8, x16
+; CHECK-NEXT: and x1, x1, x16
+; CHECK-NEXT: csdb
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+}
+
+define double @ld_double(double* %p) speculative_load_hardening {
+entry:
+  %0 = load double, double* %p, align 8
+  ret double %0
+; Checking that the address loaded from is masked for a floating point load.
+; CHECK-LABEL: ld_double
+; CHECK: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: and x0, x0, x16
+; CHECK-NEXT: csdb
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+}
+
+define i32 @csdb_emitted_for_subreg_use(i64* %p, i32 %b) speculative_load_hardening {
+entry:
+  %X = load i64, i64* %p, align 8
+  %X_trunc = trunc i64 %X to i32
+  %add = add i32 %b, %X_trunc
+  %iszero = icmp eq i64 %X, 0
+  %ret = select i1 %iszero, i32 %b, i32 %add
+  ret i32 %ret
+; Checking that a csdb is emitted before the masked value is used via subregister w8.
+; CHECK-LABEL: csdb_emitted_for_subreg_use
+; CHECK: ldr x8, [x0]
+; CHECK-NEXT: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: and x8, x8, x16
+; csdb instruction must occur before the add instruction with w8 as operand.
+; CHECK-NEXT: csdb
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: add w9, w1, w8
+; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: csel w0, w1, w9, eq
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+}
+
+define i64 @csdb_emitted_for_superreg_use(i32* %p, i64 %b) speculative_load_hardening {
+entry:
+  %X = load i32, i32* %p, align 4
+  %X_ext = zext i32 %X to i64
+  %add = add i64 %b, %X_ext
+  %iszero = icmp eq i32 %X, 0
+  %ret = select i1 %iszero, i64 %b, i64 %add
+  ret i64 %ret
+; Checking that a csdb is emitted before the masked value is used via superregister x8.
+; CHECK-LABEL: csdb_emitted_for_superreg_use
+; CHECK: ldr w8, [x0]
+; CHECK-NEXT: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: and w8, w8, w16
+; csdb instruction must occur before the add instruction with x8 as operand.
+; CHECK-NEXT: csdb
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: add x9, x1, x8
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: csel x0, x1, x9, eq
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+}
+
+define i64 @no_masking_with_full_control_flow_barriers(i64 %a, i64 %b, i64* %p) speculative_load_hardening {
+; CHECK-LABEL: no_masking_with_full_control_flow_barriers
+; CHECK: dsb sy
+; CHECK: isb
+entry:
+  %0 = tail call i64 asm "autia1716", "={x17},{x16},0"(i64 %b, i64 %a)
+  %X = load i64, i64* %p, align 8
+  %ret = add i64 %X, %0
+; CHECK-NOT: csdb
+; CHECK-NOT: and
+; CHECK: ret
+  ret i64 %ret
+}
+
+define void @f_implicitdef_vector_load(<4 x i32>* %dst, <2 x i32>* %src) speculative_load_hardening
+{
+entry:
+  %0 = load <2 x i32>, <2 x i32>* %src, align 8
+  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  store <4 x i32> %shuffle, <4 x i32>* %dst, align 4
+  ret void
+; CHECK-LABEL: f_implicitdef_vector_load
+; CHECK: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: and x1, x1, x16
+; CHECK-NEXT: csdb
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: mov v0.d[1], v0.d[0]
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+}
+
+define <2 x double> @f_usedefvectorload(double* %a, double* %b) speculative_load_hardening {
+entry:
+; CHECK-LABEL: f_usedefvectorload
+; CHECK: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: and x1, x1, x16
+; CHECK-NEXT: csdb
+; CHECK-NEXT: ld1 { v0.d }[0], [x1]
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+  %0 = load double, double* %b, align 16
+  %vld1_lane = insertelement <2 x double> <double 0.000000e+00, double 0.000000e+00>, double %0, i32 0
+  ret <2 x double> %vld1_lane
+}
+
+define i32 @deadload() speculative_load_hardening {
+entry:
+; CHECK-LABEL: deadload
+; CHECK: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr w8, [sp, #12]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+  %a = alloca i32, align 4
+  %val = load volatile i32, i32* %a, align 4
+  ret i32 undef
+}
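
A minimal way to experiment with the new behaviour is sketched below; it is illustrative only and not part of the patch. The function name and the expectations in the comments are assumptions inferred from the ldp_single_csdb test above; the sketch uses the hidden -aarch64-slh-loads option and the speculative_load_hardening attribute that this patch hooks up, to compare output with and without automatic load hardening.

; Illustrative sketch, not part of the committed test file.
; Compare the output with automatic load hardening enabled (the default) and
; disabled via the hidden option added in this patch:
;   llc -mtriple=aarch64-none-linux-gnu example.ll -o -
;   llc -mtriple=aarch64-none-linux-gnu -aarch64-slh-loads=false example.ll -o -
; With hardening enabled, the loaded GPR value should be masked with the taint
; register x16 and be followed by a csdb (compare ldp_single_csdb above); with
; -aarch64-slh-loads=false only the control flow miss-speculation tracking
; instrumentation (cmp sp/csetm x16 and the SP masking on return) remains.
define i64 @example_load(i64* %p) speculative_load_hardening {
entry:
  %val = load i64, i64* %p, align 8
  ret i64 %val
}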