Index: llvm/trunk/lib/Target/X86/X86SpeculativeLoadHardening.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ llvm/trunk/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -172,8 +172,8 @@
                       SmallDenseMap &AddrRegToHardenedReg);
   MachineInstr *
   sinkPostLoadHardenedInst(MachineInstr &MI,
-                           SmallPtrSetImpl &HardenedLoads);
-  bool canHardenPostLoad(MachineInstr &MI);
+                           SmallPtrSetImpl &HardenedInstrs);
+  bool canHardenRegister(unsigned Reg);
   void hardenPostLoad(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
   void checkReturnInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
   void checkCallInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
@@ -790,10 +790,152 @@
     // By default, assume that the instruction is not data invariant.
     return false;
 
-    // FIXME: For now, we just use a very boring, conservative set of unary
-    // instructions because we're mostly interested in handling simple
-    // transformations.
+  // Some target-independent operations that trivially lower to data-invariant
+  // instructions.
   case TargetOpcode::COPY:
+  case TargetOpcode::INSERT_SUBREG:
+  case TargetOpcode::SUBREG_TO_REG:
+    return true;
+
+  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
+  // However, they set flags and are perhaps the most surprisingly constant
+  // time operations so we call them out here separately.
+  case X86::IMUL16rr:
+  case X86::IMUL16rri8:
+  case X86::IMUL16rri:
+  case X86::IMUL32rr:
+  case X86::IMUL32rri8:
+  case X86::IMUL32rri:
+  case X86::IMUL64rr:
+  case X86::IMUL64rri32:
+  case X86::IMUL64rri8:
+
+  // Bit scanning and counting instructions that are somewhat surprisingly
+  // constant time as they scan across bits and do other fairly complex
+  // operations like popcnt, but are believed to be constant time on x86.
+  // However, these set flags.
+  case X86::BSF16rr:
+  case X86::BSF32rr:
+  case X86::BSF64rr:
+  case X86::BSR16rr:
+  case X86::BSR32rr:
+  case X86::BSR64rr:
+  case X86::LZCNT16rr:
+  case X86::LZCNT32rr:
+  case X86::LZCNT64rr:
+  case X86::POPCNT16rr:
+  case X86::POPCNT32rr:
+  case X86::POPCNT64rr:
+  case X86::TZCNT16rr:
+  case X86::TZCNT32rr:
+  case X86::TZCNT64rr:
+
+  // Bit manipulation instructions are effectively combinations of basic
+  // arithmetic ops, and should still execute in constant time. These also
+  // set flags.
+  case X86::BLCFILL32rr:
+  case X86::BLCFILL64rr:
+  case X86::BLCI32rr:
+  case X86::BLCI64rr:
+  case X86::BLCIC32rr:
+  case X86::BLCIC64rr:
+  case X86::BLCMSK32rr:
+  case X86::BLCMSK64rr:
+  case X86::BLCS32rr:
+  case X86::BLCS64rr:
+  case X86::BLSFILL32rr:
+  case X86::BLSFILL64rr:
+  case X86::BLSI32rr:
+  case X86::BLSI64rr:
+  case X86::BLSIC32rr:
+  case X86::BLSIC64rr:
+  case X86::BLSMSK32rr:
+  case X86::BLSMSK64rr:
+  case X86::BLSR32rr:
+  case X86::BLSR64rr:
+  case X86::TZMSK32rr:
+  case X86::TZMSK64rr:
+
+  // Bit extracting and clearing instructions should execute in constant time,
+  // and set flags.
+  case X86::BEXTR32rr:
+  case X86::BEXTR64rr:
+  case X86::BEXTRI32ri:
+  case X86::BEXTRI64ri:
+  case X86::BZHI32rr:
+  case X86::BZHI64rr:
+
+  // Basic arithmetic is constant time on the input but does set flags.
+  case X86::ADC8rr: case X86::ADC8ri:
+  case X86::ADC16rr: case X86::ADC16ri: case X86::ADC16ri8:
+  case X86::ADC32rr: case X86::ADC32ri: case X86::ADC32ri8:
+  case X86::ADC64rr: case X86::ADC64ri8: case X86::ADC64ri32:
+  case X86::ADD8rr: case X86::ADD8ri:
+  case X86::ADD16rr: case X86::ADD16ri: case X86::ADD16ri8:
+  case X86::ADD32rr: case X86::ADD32ri: case X86::ADD32ri8:
+  case X86::ADD64rr: case X86::ADD64ri8: case X86::ADD64ri32:
+  case X86::AND8rr: case X86::AND8ri:
+  case X86::AND16rr: case X86::AND16ri: case X86::AND16ri8:
+  case X86::AND32rr: case X86::AND32ri: case X86::AND32ri8:
+  case X86::AND64rr: case X86::AND64ri8: case X86::AND64ri32:
+  case X86::OR8rr: case X86::OR8ri:
+  case X86::OR16rr: case X86::OR16ri: case X86::OR16ri8:
+  case X86::OR32rr: case X86::OR32ri: case X86::OR32ri8:
+  case X86::OR64rr: case X86::OR64ri8: case X86::OR64ri32:
+  case X86::SBB8rr: case X86::SBB8ri:
+  case X86::SBB16rr: case X86::SBB16ri: case X86::SBB16ri8:
+  case X86::SBB32rr: case X86::SBB32ri: case X86::SBB32ri8:
+  case X86::SBB64rr: case X86::SBB64ri8: case X86::SBB64ri32:
+  case X86::SUB8rr: case X86::SUB8ri:
+  case X86::SUB16rr: case X86::SUB16ri: case X86::SUB16ri8:
+  case X86::SUB32rr: case X86::SUB32ri: case X86::SUB32ri8:
+  case X86::SUB64rr: case X86::SUB64ri8: case X86::SUB64ri32:
+  case X86::XOR8rr: case X86::XOR8ri:
+  case X86::XOR16rr: case X86::XOR16ri: case X86::XOR16ri8:
+  case X86::XOR32rr: case X86::XOR32ri: case X86::XOR32ri8:
+  case X86::XOR64rr: case X86::XOR64ri8: case X86::XOR64ri32:
+  // Arithmetic with just 32-bit and 64-bit variants and no immediates.
+  case X86::ADCX32rr: case X86::ADCX64rr:
+  case X86::ADOX32rr: case X86::ADOX64rr:
+  case X86::ANDN32rr: case X86::ANDN64rr:
+  // Just one operand for inc and dec.
+  case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r:
+  case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r:
+    // Check whether the EFLAGS implicit-def is dead. We assume that this will
+    // always find the implicit-def because this code should only be reached
+    // for instructions that do in fact implicitly def this.
+    if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
+      // If we would clobber EFLAGS that are used, just bail for now.
+      LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
+                 MI.dump(); dbgs() << "\n");
+      return false;
+    }
+
+    // Otherwise, fallthrough to handle these the same as instructions that
+    // don't set EFLAGS.
+    LLVM_FALLTHROUGH;
+
+  // Integer multiply w/o affecting flags is still believed to be constant
+  // time on x86. Called out separately as this is among the most surprising
+  // instructions to exhibit that behavior.
+  case X86::MULX32rr:
+  case X86::MULX64rr:
+
+  // Arithmetic instructions that are both constant time and don't set flags.
+  case X86::RORX32ri:
+  case X86::RORX64ri:
+  case X86::SARX32rr:
+  case X86::SARX64rr:
+  case X86::SHLX32rr:
+  case X86::SHLX64rr:
+  case X86::SHRX32rr:
+  case X86::SHRX64rr:
+
+  // LEA doesn't actually access memory, and its arithmetic is constant time.
+  case X86::LEA16r:
+  case X86::LEA32r:
+  case X86::LEA64_32r:
+  case X86::LEA64r:
     return true;
   }
 }
@@ -1121,7 +1263,9 @@
     // address registers, queue it up to be hardened post-load. Notably, even
     // once hardened this won't introduce a useful dependency that could prune
     // out subsequent loads.
-      if (EnablePostLoadHardening && canHardenPostLoad(MI) &&
+      if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
+          MI.getDesc().getNumDefs() == 1 && MI.getOperand(0).isReg() &&
+          canHardenRegister(MI.getOperand(0).getReg()) &&
           !HardenedAddrRegs.count(BaseReg) &&
           !HardenedAddrRegs.count(IndexReg)) {
         HardenPostLoad.insert(&MI);
@@ -1525,7 +1669,7 @@
 }
 
 MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
-    MachineInstr &InitialMI, SmallPtrSetImpl &HardenedLoads) {
+    MachineInstr &InitialMI, SmallPtrSetImpl &HardenedInstrs) {
   assert(isDataInvariantLoad(InitialMI) &&
          "Cannot get here with a non-invariant load!");
 
@@ -1540,9 +1684,19 @@
   MachineInstr *SingleUseMI = nullptr;
   for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
     // If we're already going to harden this use, it is data invariant and
-    // within our block and we just need to check that the use isn't in an
-    // address.
-    if (HardenedLoads.count(&UseMI)) {
+    // within our block.
+    if (HardenedInstrs.count(&UseMI)) {
+      if (!isDataInvariantLoad(UseMI)) {
+        // If we've already decided to harden a non-load, we must have sunk
+        // some other post-load hardened instruction to it and it must itself
+        // be data-invariant.
+        assert(isDataInvariant(UseMI) &&
+               "Data variant instruction being hardened!");
+        continue;
+      }
+
+      // Otherwise, this is a load and the load component can't be data
+      // invariant so check how this register is being used.
       const MCInstrDesc &Desc = UseMI.getDesc();
       int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
       assert(MemRefBeginIdx >= 0 &&
@@ -1581,7 +1735,7 @@
     // can harden.
     unsigned UseDefReg = UseMI.getOperand(0).getReg();
     if (!TRI->isVirtualRegister(UseDefReg) ||
-        !MRI->getRegClass(UseDefReg)->hasSubClassEq(&X86::GR64RegClass))
+        !canHardenRegister(UseDefReg))
       return {};
 
     SingleUseMI = &UseMI;
@@ -1603,23 +1757,30 @@
   return MI;
 }
 
-bool X86SpeculativeLoadHardeningPass::canHardenPostLoad(MachineInstr &MI) {
-  if (!isDataInvariantLoad(MI))
+bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
+  auto *RC = MRI->getRegClass(Reg);
+  int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
+  if (RegBytes > 8)
+    // We don't support post-load hardening of vectors.
     return false;
 
-  auto &DefOp = MI.getOperand(0);
-  unsigned OldDefReg = DefOp.getReg();
-
-  auto *DefRC = MRI->getRegClass(OldDefReg);
-  int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;
-  if (DefRegBytes > 8)
-    // We don't support post-load hardening of vectors.
+  // If this register class is explicitly constrained to a class that doesn't
+  // require REX prefix, we may not be able to satisfy that constraint when
+  // emitting the hardening instructions, so bail out here.
+  // FIXME: This seems like a pretty lame hack. The way this comes up is when we
+  // end up both with a NOREX and REX-only register as operands to the hardening
+  // instructions. It would be better to fix that code to handle this situation
+  // rather than hack around it in this way.
+  const TargetRegisterClass *NOREXRegClasses[] = {
+      &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
+      &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
+  if (RC == NOREXRegClasses[Log2_32(RegBytes)])
     return false;
 
   const TargetRegisterClass *GPRRegClasses[] = {
       &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
       &X86::GR64RegClass};
-  return DefRC->hasSuperClassEq(GPRRegClasses[Log2_32(DefRegBytes)]);
+  return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]);
 }
 
 // We can harden non-leaking loads into register without touching the address
@@ -1629,15 +1790,14 @@
 // execution and coercing them to one is sufficient.
 void X86SpeculativeLoadHardeningPass::hardenPostLoad(
     MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
-  assert(canHardenPostLoad(MI) &&
-         "Invalid instruction for post-load hardening!");
-
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc Loc = MI.getDebugLoc();
 
   // For all of these, the def'ed register operand is operand zero.
   auto &DefOp = MI.getOperand(0);
   unsigned OldDefReg = DefOp.getReg();
+  assert(canHardenRegister(OldDefReg) &&
+         "Cannot harden this instruction's defined register!");
 
   auto *DefRC = MRI->getRegClass(OldDefReg);
   int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;
Index: llvm/trunk/test/CodeGen/X86/speculative-load-hardening.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/speculative-load-hardening.ll
+++ llvm/trunk/test/CodeGen/X86/speculative-load-hardening.ll
@@ -862,3 +862,71 @@
   call void @sink_v2i64(<2 x i64> %x6)
   ret void
 }
+
+define void @test_deferred_hardening(i32* %ptr1, i32* %ptr2, i32 %x) nounwind {
+; X64-LABEL: test_deferred_hardening:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq %rsi, %r14
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movl (%rdi), %edi
+; X64-NEXT:    incl %edi
+; X64-NEXT:    imull %edx, %edi
+; X64-NEXT:    orl %eax, %edi
+; X64-NEXT:    shlq $47, %rax
+; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    callq sink
+; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movl (%rbx), %ecx
+; X64-NEXT:    movl (%r14), %edx
+; X64-NEXT:    leal 1(%rcx,%rdx), %edi
+; X64-NEXT:    orl %eax, %edi
+; X64-NEXT:    shlq $47, %rax
+; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    callq sink
+; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    shlq $47, %rax
+; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r14
+; X64-NEXT:    retq
+;
+; X64-LFENCE-LABEL: test_deferred_hardening:
+; X64-LFENCE:       # %bb.0: # %entry
+; X64-LFENCE-NEXT:    pushq %r14
+; X64-LFENCE-NEXT:    pushq %rbx
+; X64-LFENCE-NEXT:    pushq %rax
+; X64-LFENCE-NEXT:    movq %rsi, %r14
+; X64-LFENCE-NEXT:    movq %rdi, %rbx
+; X64-LFENCE-NEXT:    movl (%rdi), %edi
+; X64-LFENCE-NEXT:    incl %edi
+; X64-LFENCE-NEXT:    imull %edx, %edi
+; X64-LFENCE-NEXT:    callq sink
+; X64-LFENCE-NEXT:    movl (%rbx), %eax
+; X64-LFENCE-NEXT:    movl (%r14), %ecx
+; X64-LFENCE-NEXT:    leal 1(%rax,%rcx), %edi
+; X64-LFENCE-NEXT:    callq sink
+; X64-LFENCE-NEXT:    addq $8, %rsp
+; X64-LFENCE-NEXT:    popq %rbx
+; X64-LFENCE-NEXT:    popq %r14
+; X64-LFENCE-NEXT:    retq
+entry:
+  %a1 = load i32, i32* %ptr1
+  %a2 = add i32 %a1, 1
+  %a3 = mul i32 %a2, %x
+  call void @sink(i32 %a3)
+  %b1 = load i32, i32* %ptr1
+  %b2 = add i32 %b1, 1
+  %b3 = load i32, i32* %ptr2
+  %b4 = add i32 %b2, %b3
+  call void @sink(i32 %b4)
+  ret void
+}
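
A note on the transformation the test above is checking: post-load (deferred) hardening boils down to OR-ing the all-ones/all-zeros predicate state into the value at the end of a data-invariant chain, which is where the single "orl %eax, %edi" before each call comes from. The sketch below shows that one step in isolation, assuming it lives next to the pass inside the X86 backend and uses the same MachineInstr/BuildMI APIs; the helper name hardenValueInReg and the assumption that the value sits in a virtual GR64 register are illustrative, not part of the patch.

// Minimal sketch, not the pass itself: fold the speculation predicate state
// into a value so that a misspeculated path only ever observes all-ones.
// Assumes ValueReg is a virtual GR64 register and StateReg holds the
// predicate state (all-ones when misspeculating, all-zeros otherwise).
#include "X86InstrInfo.h"
#include "X86RegisterInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

static unsigned hardenValueInReg(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator InsertPt,
                                 const DebugLoc &Loc, unsigned ValueReg,
                                 unsigned StateReg, const X86InstrInfo &TII,
                                 MachineRegisterInfo &MRI) {
  // Build the OR into a fresh vreg; we are still in SSA form here, so the
  // tied-operand constraint on OR64rr is reconciled by the two-address pass.
  unsigned HardenedReg = MRI.createVirtualRegister(&X86::GR64RegClass);
  MachineInstr *OrI =
      BuildMI(MBB, InsertPt, Loc, TII.get(X86::OR64rr), HardenedReg)
          .addReg(ValueReg)
          .addReg(StateReg);
  // The OR clobbers EFLAGS; nothing reads the flags here, so mark the
  // implicit def dead (the same EFLAGS concern isDataInvariant checks above).
  OrI->findRegisterDefOperand(X86::EFLAGS)->setIsDead(true);
  return HardenedReg;
}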
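
One small reading aid for canHardenRegister: the NOREXRegClasses and GPRRegClasses arrays are indexed with Log2_32(RegBytes), which works because the only scalar GPR widths are 1, 2, 4, and 8 bytes, mapping to indices 0 through 3. A tiny standalone illustration follows; the string table is hypothetical and only the indexing mirrors the pass.

// Illustration of the Log2_32(RegBytes) indexing used by canHardenRegister:
// 1, 2, 4 and 8 byte registers select indices 0, 1, 2 and 3 respectively.
#include "llvm/Support/MathExtras.h"
#include <cassert>

static const char *gprClassNameForBytes(int RegBytes) {
  static const char *const Names[] = {"GR8", "GR16", "GR32", "GR64"};
  assert((RegBytes == 1 || RegBytes == 2 || RegBytes == 4 || RegBytes == 8) &&
         "post-load hardening only deals with scalar GPR widths");
  return Names[llvm::Log2_32(RegBytes)]; // Log2_32(1) == 0 ... Log2_32(8) == 3
}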