Index: llvm/trunk/lib/Target/X86/X86SpeculativeLoadHardening.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ llvm/trunk/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -172,8 +172,8 @@
                       SmallDenseMap &AddrRegToHardenedReg);
   MachineInstr *
   sinkPostLoadHardenedInst(MachineInstr &MI,
-                           SmallPtrSetImpl &HardenedLoads);
-  bool canHardenPostLoad(MachineInstr &MI);
+                           SmallPtrSetImpl &HardenedInstrs);
+  bool canHardenRegister(unsigned Reg);
   void hardenPostLoad(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
   void checkReturnInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
   void checkCallInstr(MachineInstr &MI, MachineSSAUpdater &PredStateSSA);
@@ -790,10 +790,152 @@
     // By default, assume that the instruction is not data invariant.
     return false;
 
-    // FIXME: For now, we just use a very boring, conservative set of unary
-    // instructions because we're mostly interested in handling simple
-    // transformations.
+  // Some target-independent operations that trivially lower to data-invariant
+  // instructions.
   case TargetOpcode::COPY:
+  case TargetOpcode::INSERT_SUBREG:
+  case TargetOpcode::SUBREG_TO_REG:
+    return true;
+
+  // On x86 it is believed that imul is constant time w.r.t. the loaded data.
+  // However, they set flags and are perhaps the most surprisingly constant
+  // time operations so we call them out here separately.
+  case X86::IMUL16rr:
+  case X86::IMUL16rri8:
+  case X86::IMUL16rri:
+  case X86::IMUL32rr:
+  case X86::IMUL32rri8:
+  case X86::IMUL32rri:
+  case X86::IMUL64rr:
+  case X86::IMUL64rri32:
+  case X86::IMUL64rri8:
+
+  // Bit scanning and counting instructions that are somewhat surprisingly
+  // constant time as they scan across bits and do other fairly complex
+  // operations like popcnt, but are believed to be constant time on x86.
+  // However, these set flags.
+  case X86::BSF16rr:
+  case X86::BSF32rr:
+  case X86::BSF64rr:
+  case X86::BSR16rr:
+  case X86::BSR32rr:
+  case X86::BSR64rr:
+  case X86::LZCNT16rr:
+  case X86::LZCNT32rr:
+  case X86::LZCNT64rr:
+  case X86::POPCNT16rr:
+  case X86::POPCNT32rr:
+  case X86::POPCNT64rr:
+  case X86::TZCNT16rr:
+  case X86::TZCNT32rr:
+  case X86::TZCNT64rr:
+
+  // Bit manipulation instructions are effectively combinations of basic
+  // arithmetic ops, and should still execute in constant time. These also
+  // set flags.
+  case X86::BLCFILL32rr:
+  case X86::BLCFILL64rr:
+  case X86::BLCI32rr:
+  case X86::BLCI64rr:
+  case X86::BLCIC32rr:
+  case X86::BLCIC64rr:
+  case X86::BLCMSK32rr:
+  case X86::BLCMSK64rr:
+  case X86::BLCS32rr:
+  case X86::BLCS64rr:
+  case X86::BLSFILL32rr:
+  case X86::BLSFILL64rr:
+  case X86::BLSI32rr:
+  case X86::BLSI64rr:
+  case X86::BLSIC32rr:
+  case X86::BLSIC64rr:
+  case X86::BLSMSK32rr:
+  case X86::BLSMSK64rr:
+  case X86::BLSR32rr:
+  case X86::BLSR64rr:
+  case X86::TZMSK32rr:
+  case X86::TZMSK64rr:
+
+  // Bit extracting and clearing instructions should execute in constant time,
+  // and set flags.
+  case X86::BEXTR32rr:
+  case X86::BEXTR64rr:
+  case X86::BEXTRI32ri:
+  case X86::BEXTRI64ri:
+  case X86::BZHI32rr:
+  case X86::BZHI64rr:
+
+  // Basic arithmetic is constant time on the input but does set flags.
+  case X86::ADC8rr: case X86::ADC8ri:
+  case X86::ADC16rr: case X86::ADC16ri: case X86::ADC16ri8:
+  case X86::ADC32rr: case X86::ADC32ri: case X86::ADC32ri8:
+  case X86::ADC64rr: case X86::ADC64ri8: case X86::ADC64ri32:
+  case X86::ADD8rr: case X86::ADD8ri:
+  case X86::ADD16rr: case X86::ADD16ri: case X86::ADD16ri8:
+  case X86::ADD32rr: case X86::ADD32ri: case X86::ADD32ri8:
+  case X86::ADD64rr: case X86::ADD64ri8: case X86::ADD64ri32:
+  case X86::AND8rr: case X86::AND8ri:
+  case X86::AND16rr: case X86::AND16ri: case X86::AND16ri8:
+  case X86::AND32rr: case X86::AND32ri: case X86::AND32ri8:
+  case X86::AND64rr: case X86::AND64ri8: case X86::AND64ri32:
+  case X86::OR8rr: case X86::OR8ri:
+  case X86::OR16rr: case X86::OR16ri: case X86::OR16ri8:
+  case X86::OR32rr: case X86::OR32ri: case X86::OR32ri8:
+  case X86::OR64rr: case X86::OR64ri8: case X86::OR64ri32:
+  case X86::SBB8rr: case X86::SBB8ri:
+  case X86::SBB16rr: case X86::SBB16ri: case X86::SBB16ri8:
+  case X86::SBB32rr: case X86::SBB32ri: case X86::SBB32ri8:
+  case X86::SBB64rr: case X86::SBB64ri8: case X86::SBB64ri32:
+  case X86::SUB8rr: case X86::SUB8ri:
+  case X86::SUB16rr: case X86::SUB16ri: case X86::SUB16ri8:
+  case X86::SUB32rr: case X86::SUB32ri: case X86::SUB32ri8:
+  case X86::SUB64rr: case X86::SUB64ri8: case X86::SUB64ri32:
+  case X86::XOR8rr: case X86::XOR8ri:
+  case X86::XOR16rr: case X86::XOR16ri: case X86::XOR16ri8:
+  case X86::XOR32rr: case X86::XOR32ri: case X86::XOR32ri8:
+  case X86::XOR64rr: case X86::XOR64ri8: case X86::XOR64ri32:
+  // Arithmetic with just 32-bit and 64-bit variants and no immediates.
+  case X86::ADCX32rr: case X86::ADCX64rr:
+  case X86::ADOX32rr: case X86::ADOX64rr:
+  case X86::ANDN32rr: case X86::ANDN64rr:
+  // Just one operand for inc and dec.
+  case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r:
+  case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r:
+    // Check whether the EFLAGS implicit-def is dead. We assume that this will
+    // always find the implicit-def because this code should only be reached
+    // for instructions that do in fact implicitly def this.
+    if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
+      // If we would clobber EFLAGS that are used, just bail for now.
+      LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
+                 MI.dump(); dbgs() << "\n");
+      return false;
+    }
+
+    // Otherwise, fallthrough to handle these the same as instructions that
+    // don't set EFLAGS.
+    LLVM_FALLTHROUGH;
+
+  // Integer multiply w/o affecting flags is still believed to be constant
+  // time on x86. Called out separately as this is among the most surprising
+  // instructions to exhibit that behavior.
+  case X86::MULX32rr:
+  case X86::MULX64rr:
+
+  // Arithmetic instructions that are both constant time and don't set flags.
+  case X86::RORX32ri:
+  case X86::RORX64ri:
+  case X86::SARX32rr:
+  case X86::SARX64rr:
+  case X86::SHLX32rr:
+  case X86::SHLX64rr:
+  case X86::SHRX32rr:
+  case X86::SHRX64rr:
+
+  // LEA doesn't actually access memory, and its arithmetic is constant time.
+  case X86::LEA16r:
+  case X86::LEA32r:
+  case X86::LEA64_32r:
+  case X86::LEA64r:
     return true;
   }
 }
@@ -1121,7 +1263,9 @@
     // address registers, queue it up to be hardened post-load. Notably, even
     // once hardened this won't introduce a useful dependency that could prune
     // out subsequent loads.
-      if (EnablePostLoadHardening && canHardenPostLoad(MI) &&
+      if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
+          MI.getDesc().getNumDefs() == 1 && MI.getOperand(0).isReg() &&
+          canHardenRegister(MI.getOperand(0).getReg()) &&
           !HardenedAddrRegs.count(BaseReg) &&
           !HardenedAddrRegs.count(IndexReg)) {
         HardenPostLoad.insert(&MI);
@@ -1525,7 +1669,7 @@
 }
 
 MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
-    MachineInstr &InitialMI, SmallPtrSetImpl &HardenedLoads) {
+    MachineInstr &InitialMI, SmallPtrSetImpl &HardenedInstrs) {
   assert(isDataInvariantLoad(InitialMI) &&
          "Cannot get here with a non-invariant load!");
 
@@ -1540,9 +1684,19 @@
   MachineInstr *SingleUseMI = nullptr;
   for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
     // If we're already going to harden this use, it is data invariant and
-    // within our block and we just need to check that the use isn't in an
-    // address.
-    if (HardenedLoads.count(&UseMI)) {
+    // within our block.
+    if (HardenedInstrs.count(&UseMI)) {
+      if (!isDataInvariantLoad(UseMI)) {
+        // If we've already decided to harden a non-load, we must have sunk
+        // some other post-load hardened instruction to it and it must itself
+        // be data-invariant.
+        assert(isDataInvariant(UseMI) &&
+               "Data variant instruction being hardened!");
+        continue;
+      }
+
+      // Otherwise, this is a load and the load component can't be data
+      // invariant so check how this register is being used.
       const MCInstrDesc &Desc = UseMI.getDesc();
       int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
       assert(MemRefBeginIdx >= 0 &&
@@ -1581,7 +1735,7 @@
     // can harden.
     unsigned UseDefReg = UseMI.getOperand(0).getReg();
     if (!TRI->isVirtualRegister(UseDefReg) ||
-        !MRI->getRegClass(UseDefReg)->hasSubClassEq(&X86::GR64RegClass))
+        !canHardenRegister(UseDefReg))
       return {};
 
     SingleUseMI = &UseMI;
@@ -1603,23 +1757,30 @@
   return MI;
 }
 
-bool X86SpeculativeLoadHardeningPass::canHardenPostLoad(MachineInstr &MI) {
-  if (!isDataInvariantLoad(MI))
+bool X86SpeculativeLoadHardeningPass::canHardenRegister(unsigned Reg) {
+  auto *RC = MRI->getRegClass(Reg);
+  int RegBytes = TRI->getRegSizeInBits(*RC) / 8;
+  if (RegBytes > 8)
+    // We don't support post-load hardening of vectors.
     return false;
 
-  auto &DefOp = MI.getOperand(0);
-  unsigned OldDefReg = DefOp.getReg();
-
-  auto *DefRC = MRI->getRegClass(OldDefReg);
-  int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;
-  if (DefRegBytes > 8)
-    // We don't support post-load hardening of vectors.
+  // If this register class is explicitly constrained to a class that doesn't
+  // require REX prefix, we may not be able to satisfy that constraint when
+  // emitting the hardening instructions, so bail out here.
+  // FIXME: This seems like a pretty lame hack. The way this comes up is when we
+  // end up both with a NOREX and REX-only register as operands to the hardening
+  // instructions. It would be better to fix that code to handle this situation
+  // rather than hack around it in this way.
+  const TargetRegisterClass *NOREXRegClasses[] = {
+      &X86::GR8_NOREXRegClass, &X86::GR16_NOREXRegClass,
+      &X86::GR32_NOREXRegClass, &X86::GR64_NOREXRegClass};
+  if (RC == NOREXRegClasses[Log2_32(RegBytes)])
     return false;
 
   const TargetRegisterClass *GPRRegClasses[] = {
       &X86::GR8RegClass, &X86::GR16RegClass, &X86::GR32RegClass,
       &X86::GR64RegClass};
-  return DefRC->hasSuperClassEq(GPRRegClasses[Log2_32(DefRegBytes)]);
+  return RC->hasSuperClassEq(GPRRegClasses[Log2_32(RegBytes)]);
 }
 
 // We can harden non-leaking loads into register without touching the address
@@ -1629,15 +1790,14 @@
 // execution and coercing them to one is sufficient.
 void X86SpeculativeLoadHardeningPass::hardenPostLoad(
     MachineInstr &MI, MachineSSAUpdater &PredStateSSA) {
-  assert(canHardenPostLoad(MI) &&
-         "Invalid instruction for post-load hardening!");
-
   MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc Loc = MI.getDebugLoc();
 
   // For all of these, the def'ed register operand is operand zero.
   auto &DefOp = MI.getOperand(0);
   unsigned OldDefReg = DefOp.getReg();
+  assert(canHardenRegister(OldDefReg) &&
+         "Cannot harden this instruction's defined register!");
 
   auto *DefRC = MRI->getRegClass(OldDefReg);
   int DefRegBytes = TRI->getRegSizeInBits(*DefRC) / 8;
Index: llvm/trunk/test/CodeGen/X86/speculative-load-hardening.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/speculative-load-hardening.ll
+++ llvm/trunk/test/CodeGen/X86/speculative-load-hardening.ll
@@ -862,3 +862,71 @@
   call void @sink_v2i64(<2 x i64> %x6)
   ret void
 }
+
+define void @test_deferred_hardening(i32* %ptr1, i32* %ptr2, i32 %x) nounwind {
+; X64-LABEL: test_deferred_hardening:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    movq %rsi, %r14
+; X64-NEXT:    movq %rdi, %rbx
+; X64-NEXT:    movq $-1, %rcx
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movl (%rdi), %edi
+; X64-NEXT:    incl %edi
+; X64-NEXT:    imull %edx, %edi
+; X64-NEXT:    orl %eax, %edi
+; X64-NEXT:    shlq $47, %rax
+; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    callq sink
+; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    movl (%rbx), %ecx
+; X64-NEXT:    movl (%r14), %edx
+; X64-NEXT:    leal 1(%rcx,%rdx), %edi
+; X64-NEXT:    orl %eax, %edi
+; X64-NEXT:    shlq $47, %rax
+; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    callq sink
+; X64-NEXT:    movq %rsp, %rax
+; X64-NEXT:    sarq $63, %rax
+; X64-NEXT:    shlq $47, %rax
+; X64-NEXT:    orq %rax, %rsp
+; X64-NEXT:    addq $8, %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r14
+; X64-NEXT:    retq
+;
+; X64-LFENCE-LABEL: test_deferred_hardening:
+; X64-LFENCE:       # %bb.0: # %entry
+; X64-LFENCE-NEXT:    pushq %r14
+; X64-LFENCE-NEXT:    pushq %rbx
+; X64-LFENCE-NEXT:    pushq %rax
+; X64-LFENCE-NEXT:    movq %rsi, %r14
+; X64-LFENCE-NEXT:    movq %rdi, %rbx
+; X64-LFENCE-NEXT:    movl (%rdi), %edi
+; X64-LFENCE-NEXT:    incl %edi
+; X64-LFENCE-NEXT:    imull %edx, %edi
+; X64-LFENCE-NEXT:    callq sink
+; X64-LFENCE-NEXT:    movl (%rbx), %eax
+; X64-LFENCE-NEXT:    movl (%r14), %ecx
+; X64-LFENCE-NEXT:    leal 1(%rax,%rcx), %edi
+; X64-LFENCE-NEXT:    callq sink
+; X64-LFENCE-NEXT:    addq $8, %rsp
+; X64-LFENCE-NEXT:    popq %rbx
+; X64-LFENCE-NEXT:    popq %r14
+; X64-LFENCE-NEXT:    retq
+entry:
+  %a1 = load i32, i32* %ptr1
+  %a2 = add i32 %a1, 1
+  %a3 = mul i32 %a2, %x
+  call void @sink(i32 %a3)
+  %b1 = load i32, i32* %ptr1
+  %b2 = add i32 %b1, 1
+  %b3 = load i32, i32* %ptr2
+  %b4 = add i32 %b2, %b3
+  call void @sink(i32 %b4)
+  ret void
+}
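
A note on the transformation the test above is checking: post-load (deferred) hardening boils down to OR-ing the all-ones/all-zeros predicate state into the value at the end of a data-invariant chain, which is where the single "orl %eax, %edi" before each call comes from. The sketch below shows that one step in isolation, assuming it lives next to the pass inside the X86 backend and uses the same MachineInstr/BuildMI APIs; the helper name hardenValueInReg and the assumption that the value sits in a virtual GR64 register are illustrative, not part of the patch.

// Minimal sketch, not the pass itself: fold the speculation predicate state
// into a value so that a misspeculated path only ever observes all-ones.
// Assumes ValueReg is a virtual GR64 register and StateReg holds the
// predicate state (all-ones when misspeculating, all-zeros otherwise).
#include "X86InstrInfo.h"
#include "X86RegisterInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

static unsigned hardenValueInReg(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator InsertPt,
                                 const DebugLoc &Loc, unsigned ValueReg,
                                 unsigned StateReg, const X86InstrInfo &TII,
                                 MachineRegisterInfo &MRI) {
  // Build the OR into a fresh vreg; we are still in SSA form here, so the
  // tied-operand constraint on OR64rr is reconciled by the two-address pass.
  unsigned HardenedReg = MRI.createVirtualRegister(&X86::GR64RegClass);
  MachineInstr *OrI =
      BuildMI(MBB, InsertPt, Loc, TII.get(X86::OR64rr), HardenedReg)
          .addReg(ValueReg)
          .addReg(StateReg);
  // The OR clobbers EFLAGS; nothing reads the flags here, so mark the
  // implicit def dead (the same EFLAGS concern isDataInvariant checks above).
  OrI->findRegisterDefOperand(X86::EFLAGS)->setIsDead(true);
  return HardenedReg;
}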
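
One small reading aid for canHardenRegister: the NOREXRegClasses and GPRRegClasses arrays are indexed with Log2_32(RegBytes), which works because the only scalar GPR widths are 1, 2, 4, and 8 bytes, mapping to indices 0 through 3. A tiny standalone illustration follows; the string table is hypothetical and only the indexing mirrors the pass.

// Illustration of the Log2_32(RegBytes) indexing used by canHardenRegister:
// 1, 2, 4 and 8 byte registers select indices 0, 1, 2 and 3 respectively.
#include "llvm/Support/MathExtras.h"
#include <cassert>

static const char *gprClassNameForBytes(int RegBytes) {
  static const char *const Names[] = {"GR8", "GR16", "GR32", "GR64"};
  assert((RegBytes == 1 || RegBytes == 2 || RegBytes == 4 || RegBytes == 8) &&
         "post-load hardening only deals with scalar GPR widths");
  return Names[llvm::Log2_32(RegBytes)]; // Log2_32(1) == 0 ... Log2_32(8) == 3
}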