Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -965,6 +965,11 @@
   if (TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF))
     return true;
   switch (MI.getOpcode()) {
+  case AArch64::HINT:
+    // CSDB hints are scheduling barriers.
+    if (MI.getOperand(0).getImm() == 0x14)
+      return true;
+    break;
   case AArch64::DSB:
   case AArch64::ISB:
     // DSB and ISB also are scheduling barriers.
Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td
@@ -520,6 +520,14 @@
                    [(set GPR64:$Rd, (int_aarch64_space imm:$size, GPR64:$Rn))]>,
             Sched<[]>;
 
+let hasSideEffects = 1, isCodeGenOnly = 1 in {
+  def SpeculationSafeValueX
+      : Pseudo<(outs GPR64:$dst), (ins GPR64:$src), []>, Sched<[]>;
+  def SpeculationSafeValueW
+      : Pseudo<(outs GPR32:$dst), (ins GPR32:$src), []>, Sched<[]>;
+}
+
+
 //===----------------------------------------------------------------------===//
 // System instructions.
 //===----------------------------------------------------------------------===//
Index: llvm/trunk/lib/Target/AArch64/AArch64SpeculationHardening.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64SpeculationHardening.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64SpeculationHardening.cpp
@@ -16,17 +16,15 @@
 //   execution".
 // This pass is aimed at mitigating against SpectreV1-style vulnarabilities.
 //
-// At the moment, it implements the tracking of miss-speculation of control
-// flow into a taint register, but doesn't implement a mechanism yet to then
-// use that taint register to mask of vulnerable data in registers (something
-// for a follow-on improvement). Possible strategies to mask out vulnerable
-// data that can be implemented on top of this are:
-// - speculative load hardening to automatically mask of data loaded
-//   in registers.
-// - using intrinsics to mask of data in registers as indicated by the
-//   programmer (see https://lwn.net/Articles/759423/).
+// It also implements speculative load hardening, i.e. using the taint register
+// to automatically mask off loaded data.
 //
-// For AArch64, the following implementation choices are made below.
+// As a possible follow-on improvement, an intrinsics-based approach, as
+// explained at https://lwn.net/Articles/759423/, could also be implemented on
+// top of the current design.
+//
+// For AArch64, the following implementation choices are made to implement the
+// tracking of control flow miss-speculation into a taint register:
 // Some of these are different than the implementation choices made in
 // the similar pass implemented in X86SpeculativeLoadHardening.cpp, as
 // the instruction set characteristics result in different trade-offs.
@@ -65,6 +63,24 @@
 // - On function call boundaries, the miss-speculation state is transferred from
 //   the taint register X16 to be encoded in the SP register as value 0.
 //
+// For the aspect of automatically hardening loads, using the taint register,
+// (a.k.a. speculative load hardening, see
+// https://llvm.org/docs/SpeculativeLoadHardening.html), the following
+// implementation choices are made for AArch64:
+// - Many of the optimizations described at
+//   https://llvm.org/docs/SpeculativeLoadHardening.html to harden fewer
+//   loads haven't been implemented yet - but for some of them there are
+//   FIXMEs in the code.
+// - loads that load into general purpose (X or W) registers get hardened by
+//   masking the loaded data. For loads that load into other registers, the
+//   address loaded from gets hardened. It is expected that hardening the
+//   loaded data may be more efficient; but masking data in registers other
+//   than X or W is not easy and may result in being slower than just
+//   hardening the X address register loaded from.
+// - On AArch64, CSDB instructions are inserted between the masking of the
+//   register and its first use, to ensure there's no non-control-flow
+//   speculation that might undermine the hardening mechanism.
+//
 // Future extensions/improvements could be:
 // - Implement this functionality using full speculation barriers, akin to the
 //   x86-slh-lfence option. This may be more useful for the intrinsics-based
@@ -99,6 +115,10 @@
 
 #define AARCH64_SPECULATION_HARDENING_NAME "AArch64 speculation hardening pass"
 
+cl::opt<bool> HardenLoads("aarch64-slh-loads", cl::Hidden,
+                          cl::desc("Sanitize loads from memory."),
+                          cl::init(true));
+
 namespace {
 
 class AArch64SpeculationHardening : public MachineFunctionPass {
@@ -120,7 +140,10 @@
 
 private:
   unsigned MisspeculatingTaintReg;
+  unsigned MisspeculatingTaintReg32Bit;
   bool UseControlFlowSpeculationBarrier;
+  BitVector RegsNeedingCSDBBeforeUse;
+  BitVector RegsAlreadyMasked;
 
   bool functionUsesHardeningRegister(MachineFunction &MF) const;
   bool instrumentControlFlow(MachineBasicBlock &MBB);
@@ -134,6 +157,16 @@
   void insertRegToSPTaintPropagation(MachineBasicBlock *MBB,
                                      MachineBasicBlock::iterator MBBI,
                                      unsigned TmpReg) const;
+
+  bool slhLoads(MachineBasicBlock &MBB);
+  bool makeGPRSpeculationSafe(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MBBI,
+                              MachineInstr &MI, unsigned Reg);
+  bool lowerSpeculationSafeValuePseudos(MachineBasicBlock &MBB);
+  bool expandSpeculationSafeValue(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MBBI);
+  bool insertCSDB(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                  DebugLoc DL);
 };
 
 } // end anonymous namespace
@@ -330,23 +363,257 @@
   return false;
 }
 
+// Make GPR register Reg speculation-safe by putting it through the
+// SpeculationSafeValue pseudo instruction, if we can't prove that
+// the value in the register has already been hardened.
+bool AArch64SpeculationHardening::makeGPRSpeculationSafe(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineInstr &MI,
+    unsigned Reg) {
+  assert(AArch64::GPR32allRegClass.contains(Reg) ||
+         AArch64::GPR64allRegClass.contains(Reg));
+
+  // Loads cannot directly load a value into the SP (nor WSP).
+  // Therefore, if Reg is SP or WSP, it is because the instruction loads from
+  // the stack through the stack pointer.
+  //
+  // Since the stack pointer is never dynamically controllable, don't harden it.
+  if (Reg == AArch64::SP || Reg == AArch64::WSP)
+    return false;
+
+  // Do not harden the register again if already hardened before.
+  if (RegsAlreadyMasked[Reg])
+    return false;
+
+  const bool Is64Bit = AArch64::GPR64allRegClass.contains(Reg);
+  LLVM_DEBUG(dbgs() << "About to harden register : " << Reg << "\n");
+  BuildMI(MBB, MBBI, MI.getDebugLoc(),
+          TII->get(Is64Bit ? AArch64::SpeculationSafeValueX
+                           : AArch64::SpeculationSafeValueW))
+      .addDef(Reg)
+      .addUse(Reg);
+  RegsAlreadyMasked.set(Reg);
+  return true;
+}
+
+bool AArch64SpeculationHardening::slhLoads(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  LLVM_DEBUG(dbgs() << "slhLoads running on MBB: " << MBB);
+
+  RegsAlreadyMasked.reset();
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  MachineBasicBlock::iterator NextMBBI;
+  for (; MBBI != E; MBBI = NextMBBI) {
+    MachineInstr &MI = *MBBI;
+    NextMBBI = std::next(MBBI);
+    // Only harden loaded values or addresses used in loads.
+    if (!MI.mayLoad())
+      continue;
+
+    LLVM_DEBUG(dbgs() << "About to harden: " << MI);
+
+    // For general purpose register loads, harden the registers loaded into.
+    // For other loads, harden the address loaded from.
+    // Masking the loaded value is expected to result in less performance
+    // overhead, as the load can still execute speculatively in comparison to
+    // when the address loaded from gets masked. However, masking is only
+    // easy to do efficiently on GPR registers, so for loads into non-GPR
+    // registers (e.g. floating point loads), mask the address loaded from.
+    bool AllDefsAreGPR = llvm::all_of(MI.defs(), [&](MachineOperand &Op) {
+      return Op.isReg() && (AArch64::GPR32allRegClass.contains(Op.getReg()) ||
+                            AArch64::GPR64allRegClass.contains(Op.getReg()));
+    });
+    // FIXME: it might be a worthwhile optimization to not mask loaded
+    // values if all the registers involved in address calculation are already
+    // hardened, leading to this load not being able to execute on a
+    // miss-speculated path.
+    bool HardenLoadedData = AllDefsAreGPR;
+    bool HardenAddressLoadedFrom = !HardenLoadedData;
+
+    // First remove registers from RegsAlreadyMasked if their value is
+    // updated by this instruction - it makes them contain a new value that is
+    // not guaranteed to already have been masked.
+    for (MachineOperand Op : MI.defs())
+      for (MCRegAliasIterator AI(Op.getReg(), TRI, true); AI.isValid(); ++AI)
+        RegsAlreadyMasked.reset(*AI);
+
+    // FIXME: loads from the stack with an immediate offset from the stack
+    // pointer probably shouldn't be hardened, which could result in a
+    // significant optimization. See section "Don't check loads from
+    // compile-time constant stack offsets" in
+    // https://llvm.org/docs/SpeculativeLoadHardening.html
+
+    if (HardenLoadedData)
+      for (auto Def : MI.defs()) {
+        if (Def.isDead())
+          // Do not mask a register that is not used further.
+          continue;
+        // FIXME: For pre/post-increment addressing modes, the base register
+        // used in address calculation is also defined by this instruction.
+        // It might be a worthwhile optimization to not harden that
+        // base register increment/decrement when the increment/decrement is
+        // an immediate.
+        Modified |= makeGPRSpeculationSafe(MBB, NextMBBI, MI, Def.getReg());
+      }
+
+    if (HardenAddressLoadedFrom)
+      for (auto Use : MI.uses()) {
+        if (!Use.isReg())
+          continue;
+        unsigned Reg = Use.getReg();
+        // Some loads of floating point data have implicit defs/uses on a
+        // super register of that floating point data. Some examples:
+        //   $s0 = LDRSui $sp, 22, implicit-def $q0
+        //   $q0 = LD1i64 $q0, 1, renamable $x0
+        // We need to filter out these uses for non-GPR registers, which occur
+        // because the load partially fills a non-GPR register with the loaded
+        // data. Just skipping all non-GPR registers is safe (for now) as all
+        // AArch64 load instructions only use GPR registers to perform the
+        // address calculation. FIXME: However, that might change once we can
+        // produce SVE gather instructions.
+        if (!(AArch64::GPR32allRegClass.contains(Reg) ||
+              AArch64::GPR64allRegClass.contains(Reg)))
+          continue;
+        Modified |= makeGPRSpeculationSafe(MBB, MBBI, MI, Reg);
+      }
+  }
+  return Modified;
+}
+
+/// \brief If MBBI references a pseudo instruction that should be expanded
+/// here, do the expansion and return true. Otherwise return false.
+bool AArch64SpeculationHardening::expandSpeculationSafeValue(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) {
+  MachineInstr &MI = *MBBI;
+  unsigned Opcode = MI.getOpcode();
+  bool Is64Bit = true;
+
+  switch (Opcode) {
+  default:
+    break;
+  case AArch64::SpeculationSafeValueW:
+    Is64Bit = false;
+    LLVM_FALLTHROUGH;
+  case AArch64::SpeculationSafeValueX:
+    // Just remove the SpeculationSafe pseudos if control flow
+    // miss-speculation isn't happening because we're already inserting
+    // barriers to guarantee that.
+    if (!UseControlFlowSpeculationBarrier) {
+      unsigned DstReg = MI.getOperand(0).getReg();
+      unsigned SrcReg = MI.getOperand(1).getReg();
+      // Mark this register and all its aliasing registers as needing to be
+      // value speculation hardened before its next use, by using a CSDB
+      // barrier instruction.
+      for (MachineOperand Op : MI.defs())
+        for (MCRegAliasIterator AI(Op.getReg(), TRI, true); AI.isValid(); ++AI)
+          RegsNeedingCSDBBeforeUse.set(*AI);
+
+      // Mask off with taint state.
+      BuildMI(MBB, MBBI, MI.getDebugLoc(),
+              Is64Bit ? TII->get(AArch64::ANDXrs) : TII->get(AArch64::ANDWrs))
+          .addDef(DstReg)
+          .addUse(SrcReg, RegState::Kill)
+          .addUse(Is64Bit ? MisspeculatingTaintReg
+                          : MisspeculatingTaintReg32Bit)
+          .addImm(0);
+    }
+    MI.eraseFromParent();
+    return true;
+  }
+  return false;
+}
+
+bool AArch64SpeculationHardening::insertCSDB(MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator MBBI,
+                                             DebugLoc DL) {
+  assert(!UseControlFlowSpeculationBarrier && "No need to insert CSDBs when "
+                                              "control flow miss-speculation "
+                                              "is already blocked");
+  // Insert a data value speculation barrier (CSDB).
+  BuildMI(MBB, MBBI, DL, TII->get(AArch64::HINT)).addImm(0x14);
+  RegsNeedingCSDBBeforeUse.reset();
+  return true;
+}
+
+bool AArch64SpeculationHardening::lowerSpeculationSafeValuePseudos(
+    MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  RegsNeedingCSDBBeforeUse.reset();
+
+  // The following loop iterates over all instructions in the basic block,
+  // and performs 2 operations:
+  // 1. Insert a CSDB at this location if needed.
+  // 2. Expand the SpeculationSafeValue pseudo if the current instruction is
+  //    one.
+  //
+  // The insertion of the CSDB is done as late as possible (i.e. just before
+  // the use of a masked register), in the hope that that will reduce the
+  // total number of CSDBs in a block when there are multiple masked registers
+  // in the block.
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  DebugLoc DL;
+  while (MBBI != E) {
+    MachineInstr &MI = *MBBI;
+    DL = MI.getDebugLoc();
+    MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+
+    // First check if a CSDB needs to be inserted due to earlier registers
+    // that were masked and that are used by the next instruction.
+    // Also emit the barrier on any potential control flow changes.
+    bool NeedToEmitBarrier = false;
+    if (RegsNeedingCSDBBeforeUse.any() && (MI.isCall() || MI.isTerminator()))
+      NeedToEmitBarrier = true;
+    if (!NeedToEmitBarrier)
+      for (MachineOperand Op : MI.uses())
+        if (Op.isReg() && RegsNeedingCSDBBeforeUse[Op.getReg()]) {
+          NeedToEmitBarrier = true;
+          break;
+        }
+
+    if (NeedToEmitBarrier)
+      Modified |= insertCSDB(MBB, MBBI, DL);
+
+    Modified |= expandSpeculationSafeValue(MBB, MBBI);
+
+    MBBI = NMBBI;
+  }
+
+  if (RegsNeedingCSDBBeforeUse.any())
+    Modified |= insertCSDB(MBB, MBBI, DL);
+
+  return Modified;
+}
+
 bool AArch64SpeculationHardening::runOnMachineFunction(MachineFunction &MF) {
   if (!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
     return false;
 
   MisspeculatingTaintReg = AArch64::X16;
+  MisspeculatingTaintReg32Bit = AArch64::W16;
   TII = MF.getSubtarget().getInstrInfo();
   TRI = MF.getSubtarget().getRegisterInfo();
+  RegsNeedingCSDBBeforeUse.resize(TRI->getNumRegs());
+  RegsAlreadyMasked.resize(TRI->getNumRegs());
+  UseControlFlowSpeculationBarrier = functionUsesHardeningRegister(MF);
+
   bool Modified = false;
 
-  UseControlFlowSpeculationBarrier = functionUsesHardeningRegister(MF);
+  // Step 1: Enable automatic insertion of SpeculationSafeValue.
+  if (HardenLoads) {
+    LLVM_DEBUG(
+        dbgs() << "***** AArch64SpeculationHardening - automatic insertion of "
+                  "SpeculationSafeValue intrinsics *****\n");
+    for (auto &MBB : MF)
+      Modified |= slhLoads(MBB);
+  }
 
-  // Instrument control flow speculation tracking, if requested.
+  // 2.a Add instrumentation code to function entry and exits.
   LLVM_DEBUG(
       dbgs()
       << "***** AArch64SpeculationHardening - track control flow *****\n");
 
-  // 1. Add instrumentation code to function entry and exits.
   SmallVector<MachineBasicBlock *, 2> EntryBlocks;
   EntryBlocks.push_back(&MF.front());
   for (const LandingPadInfo &LPI : MF.getLandingPads())
@@ -355,10 +622,16 @@
   insertSPToRegTaintPropagation(
       Entry, Entry->SkipPHIsLabelsAndDebug(Entry->begin()));
 
-  // 2. Add instrumentation code to every basic block.
+  // 2.b Add instrumentation code to every basic block.
   for (auto &MBB : MF)
     Modified |= instrumentControlFlow(MBB);
 
+  LLVM_DEBUG(dbgs() << "***** AArch64SpeculationHardening - Lowering "
+                       "SpeculationSafeValue Pseudos *****\n");
+  // Step 3: Lower SpeculationSafeValue pseudo instructions.
+  for (auto &MBB : MF)
+    Modified |= lowerSpeculationSafeValuePseudos(MBB);
+
   return Modified;
 }
Index: llvm/trunk/test/CodeGen/AArch64/speculation-hardening-loads.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/speculation-hardening-loads.ll
+++ llvm/trunk/test/CodeGen/AArch64/speculation-hardening-loads.ll
@@ -0,0 +1,157 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu | FileCheck %s --dump-input-on-failure
+
+define i128 @ldp_single_csdb(i128* %p) speculative_load_hardening {
+entry:
+  %0 = load i128, i128* %p, align 16
+  ret i128 %0
+; CHECK-LABEL: ldp_single_csdb
+; CHECK: ldp x8, x1, [x0]
+; CHECK-NEXT: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: and x8, x8, x16
+; CHECK-NEXT: and x1, x1, x16
+; CHECK-NEXT: csdb
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+}
+
+define double @ld_double(double* %p) speculative_load_hardening {
+entry:
+  %0 = load double, double* %p, align 8
+  ret double %0
+; Checking that the address loaded from is masked for a floating point load.
+; CHECK-LABEL: ld_double
+; CHECK: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: and x0, x0, x16
+; CHECK-NEXT: csdb
+; CHECK-NEXT: ldr d0, [x0]
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+}
+
+define i32 @csdb_emitted_for_subreg_use(i64* %p, i32 %b) speculative_load_hardening {
+entry:
+  %X = load i64, i64* %p, align 8
+  %X_trunc = trunc i64 %X to i32
+  %add = add i32 %b, %X_trunc
+  %iszero = icmp eq i64 %X, 0
+  %ret = select i1 %iszero, i32 %b, i32 %add
+  ret i32 %ret
+; Checking that a csdb is emitted before the masked value is used via subregister w8.
+; CHECK-LABEL: csdb_emitted_for_subreg_use
+; CHECK: ldr x8, [x0]
+; CHECK-NEXT: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: and x8, x8, x16
+; csdb instruction must occur before the add instruction with w8 as operand.
+; CHECK-NEXT: csdb
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: add w9, w1, w8
+; CHECK-NEXT: cmp x8, #0
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: csel w0, w1, w9, eq
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+}
+
+define i64 @csdb_emitted_for_superreg_use(i32* %p, i64 %b) speculative_load_hardening {
+entry:
+  %X = load i32, i32* %p, align 4
+  %X_ext = zext i32 %X to i64
+  %add = add i64 %b, %X_ext
+  %iszero = icmp eq i32 %X, 0
+  %ret = select i1 %iszero, i64 %b, i64 %add
+  ret i64 %ret
+; Checking that a csdb is emitted before the masked value is used via superregister x8.
+; CHECK-LABEL: csdb_emitted_for_superreg_use
+; CHECK: ldr w8, [x0]
+; CHECK-NEXT: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: and w8, w8, w16
+; csdb instruction must occur before the add instruction with x8 as operand.
+; CHECK-NEXT: csdb
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: add x9, x1, x8
+; CHECK-NEXT: cmp w8, #0
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: csel x0, x1, x9, eq
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+}
+
+define i64 @no_masking_with_full_control_flow_barriers(i64 %a, i64 %b, i64* %p) speculative_load_hardening {
+; CHECK-LABEL: no_masking_with_full_control_flow_barriers
+; CHECK: dsb sy
+; CHECK: isb
+entry:
+  %0 = tail call i64 asm "autia1716", "={x17},{x16},0"(i64 %b, i64 %a)
+  %X = load i64, i64* %p, align 8
+  %ret = add i64 %X, %0
+; CHECK-NOT: csdb
+; CHECK-NOT: and
+; CHECK: ret
+  ret i64 %ret
+}
+
+define void @f_implicitdef_vector_load(<4 x i32>* %dst, <2 x i32>* %src) speculative_load_hardening
+{
+entry:
+  %0 = load <2 x i32>, <2 x i32>* %src, align 8
+  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  store <4 x i32> %shuffle, <4 x i32>* %dst, align 4
+  ret void
+; CHECK-LABEL: f_implicitdef_vector_load
+; CHECK: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: and x1, x1, x16
+; CHECK-NEXT: csdb
+; CHECK-NEXT: ldr d0, [x1]
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: mov v0.d[1], v0.d[0]
+; CHECK-NEXT: str q0, [x0]
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+}
+
+define <2 x double> @f_usedefvectorload(double* %a, double* %b) speculative_load_hardening {
+entry:
+; CHECK-LABEL: f_usedefvectorload
+; CHECK: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: and x1, x1, x16
+; CHECK-NEXT: csdb
+; CHECK-NEXT: ld1 { v0.d }[0], [x1]
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+  %0 = load double, double* %b, align 16
+  %vld1_lane = insertelement <2 x double> <double 0.000000e+00, double 0.000000e+00>, double %0, i32 0
+  ret <2 x double> %vld1_lane
+}
+
+define i32 @deadload() speculative_load_hardening {
+entry:
+; CHECK-LABEL: deadload
+; CHECK: cmp sp, #0
+; CHECK-NEXT: csetm x16, ne
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr w8, [sp, #12]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: mov x17, sp
+; CHECK-NEXT: and x17, x17, x16
+; CHECK-NEXT: mov sp, x17
+; CHECK-NEXT: ret
+  %a = alloca i32, align 4
+  %val = load volatile i32, i32* %a, align 4
+  ret i32 undef
+}
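
A minimal way to experiment with the new behaviour is sketched below; it is illustrative only and not part of the patch. The function name and the expectations in the comments are assumptions inferred from the ldp_single_csdb test above; the sketch uses the hidden -aarch64-slh-loads option and the speculative_load_hardening attribute that this patch hooks up, to compare output with and without automatic load hardening.

; Illustrative sketch, not part of the committed test file.
; Compare the output with automatic load hardening enabled (the default) and
; disabled via the hidden option added in this patch:
;   llc -mtriple=aarch64-none-linux-gnu example.ll -o -
;   llc -mtriple=aarch64-none-linux-gnu -aarch64-slh-loads=false example.ll -o -
; With hardening enabled, the loaded GPR value should be masked with the taint
; register x16 and be followed by a csdb (compare ldp_single_csdb above); with
; -aarch64-slh-loads=false only the control flow miss-speculation tracking
; instrumentation (cmp sp/csetm x16 and the SP masking on return) remains.
define i64 @example_load(i64* %p) speculative_load_hardening {
entry:
  %val = load i64, i64* %p, align 8
  ret i64 %val
}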