Index: llvm/lib/Target/AArch64/AArch64.h =================================================================== --- llvm/lib/Target/AArch64/AArch64.h +++ llvm/lib/Target/AArch64/AArch64.h @@ -39,6 +39,7 @@ FunctionPass *createAArch64StorePairSuppressPass(); FunctionPass *createAArch64ExpandPseudoPass(); FunctionPass *createAArch64SLSHardeningPass(); +FunctionPass *createAArch64IndirectThunks(); FunctionPass *createAArch64SpeculationHardeningPass(); FunctionPass *createAArch64LoadStoreOptimizationPass(); FunctionPass *createAArch64SIMDInstrOptPass(); Index: llvm/lib/Target/AArch64/AArch64.td =================================================================== --- llvm/lib/Target/AArch64/AArch64.td +++ llvm/lib/Target/AArch64/AArch64.td @@ -464,6 +464,9 @@ def FeatureHardenSlsRetBr : SubtargetFeature<"harden-sls-retbr", "HardenSlsRetBr", "true", "Harden against straight line speculation across RET and BR instructions">; +def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr", + "HardenSlsBlr", "true", + "Harden against straight line speculation across BLR instructions">; //===----------------------------------------------------------------------===// // AArch64 Processors supported. Index: llvm/lib/Target/AArch64/AArch64FastISel.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -3270,7 +3270,7 @@ // Issue the call. MachineInstrBuilder MIB; if (Subtarget->useSmallAddressing()) { - const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL); + const MCInstrDesc &II = TII.get(Addr.getReg() ? getBLRCallOpcode(*MF) : AArch64::BL); MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II); if (Symbol) MIB.addSym(Symbol, 0); @@ -3303,7 +3303,7 @@ if (!CallReg) return false; - const MCInstrDesc &II = TII.get(AArch64::BLR); + const MCInstrDesc &II = TII.get(getBLRCallOpcode(*MF)); CallReg = constrainOperandRegClass(II, CallReg, 0); MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(CallReg); } Index: llvm/lib/Target/AArch64/AArch64FrameLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1126,7 +1126,7 @@ .setMIFlag(MachineInstr::FrameSetup); } - BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR)) + BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF))) .addReg(AArch64::X16, RegState::Kill) .addReg(AArch64::X15, RegState::Implicit | RegState::Define) .addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead) Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -397,6 +397,9 @@ return false; } +/// Return opcode to be used for indirect calls. 
+unsigned getBLRCallOpcode(const MachineFunction &MF);
+
 // struct TSFlags {
 #define TSFLAG_ELEMENT_SIZE_TYPE(X) (X) // 3-bits
 #define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 4-bit
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -6092,7 +6092,9 @@
 
   } else if (LastInstrOpcode == AArch64::BL ||
-             (LastInstrOpcode == AArch64::BLR && !HasBTI)) {
+             ((LastInstrOpcode == AArch64::BLR ||
+               LastInstrOpcode == AArch64::BLRNoIP) &&
+              !HasBTI)) {
     // FIXME: Do we need to check if the code after this uses the value of LR?
     FrameID = MachineOutlinerThunk;
     NumBytesToCreateFrame = 0;
@@ -6409,7 +6411,8 @@
   // as a tail-call. Whitelist the call instructions we know about so we
   // don't get unexpected results with call pseudo-instructions.
   auto UnknownCallOutlineType = outliner::InstrType::Illegal;
-  if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
+  if (MI.getOpcode() == AArch64::BLR ||
+      MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
     UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
 
   if (!Callee)
@@ -6557,7 +6560,8 @@
     if (Call->getOpcode() == AArch64::BL) {
       TailOpcode = AArch64::TCRETURNdi;
     } else {
-      assert(Call->getOpcode() == AArch64::BLR);
+      assert(Call->getOpcode() == AArch64::BLR ||
+             Call->getOpcode() == AArch64::BLRNoIP);
       TailOpcode = AArch64::TCRETURNriALL;
     }
     MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
@@ -6893,6 +6897,13 @@
   return get(Opc).TSFlags & AArch64::ElementSizeMask;
 }
 
+unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
+  if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
+    return AArch64::BLRNoIP;
+  else
+    return AArch64::BLR;
+}
+
 #define GET_INSTRINFO_HELPERS
 #define GET_INSTRMAP_INFO
 #include "AArch64GenInstrInfo.inc"
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -589,6 +589,8 @@
 def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
 def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
+def SLSBLRMitigation : Predicate<[{ MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
+def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
 // Toggles patterns which aren't beneficial in GlobalISel when we aren't
 // optimizing. This allows us to selectively use patterns without impacting
 // SelectionDAG's behaviour.
@@ -2020,9 +2022,19 @@ def : InstAlias<"ret", (RET LR)>; let isCall = 1, Defs = [LR], Uses = [SP] in { -def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>; + def BLR : BranchReg<0b0001, "blr", []>; + def BLRNoIP : Pseudo<(outs), (ins GPR64noip:$Rn), []>, + Sched<[WriteBrReg]>, + PseudoInstExpansion<(BLR GPR64:$Rn)>; } // isCall +def : Pat<(AArch64call GPR64:$Rn), + (BLR GPR64:$Rn)>, + Requires<[NoSLSBLRMitigation]>; +def : Pat<(AArch64call GPR64noip:$Rn), + (BLRNoIP GPR64noip:$Rn)>, + Requires<[SLSBLRMitigation]>; + let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>; } // isBranch, isTerminator, isBarrier, isIndirectBranch Index: llvm/lib/Target/AArch64/AArch64SLSHardening.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64SLSHardening.cpp +++ llvm/lib/Target/AArch64/AArch64SLSHardening.cpp @@ -16,6 +16,7 @@ #include "Utils/AArch64BaseInfo.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/IndirectThunks.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" @@ -57,9 +58,9 @@ private: bool hardenReturnsAndBRs(MachineBasicBlock &MBB) const; - void insertSpeculationBarrier(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MBBI, - DebugLoc DL) const; + bool hardenBLRs(MachineBasicBlock &MBB) const; + MachineBasicBlock &ConvertBLRToBL(MachineBasicBlock &MBB, + MachineBasicBlock::iterator) const; }; } // end anonymous namespace @@ -69,20 +70,26 @@ INITIALIZE_PASS(AArch64SLSHardening, "aarch64-sls-hardening", AARCH64_SLS_HARDENING_NAME, false, false) -void AArch64SLSHardening::insertSpeculationBarrier( - MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - DebugLoc DL) const { +static void insertSpeculationBarrier(const AArch64Subtarget *ST, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + DebugLoc DL, + bool AlwaysUseISBDSB = false) { assert(MBBI != MBB.begin() && "Must not insert SpeculationBarrierEndBB as only instruction in MBB."); assert(std::prev(MBBI)->isBarrier() && "SpeculationBarrierEndBB must only follow unconditional control flow " "instructions."); assert(std::prev(MBBI)->isTerminator() && - "SpeculatoinBarrierEndBB must only follow terminators."); - if (ST->hasSB()) - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SpeculationBarrierSBEndBB)); - else - BuildMI(MBB, MBBI, DL, TII->get(AArch64::SpeculationBarrierISBDSBEndBB)); + "SpeculationBarrierEndBB must only follow terminators."); + const TargetInstrInfo *TII = ST->getInstrInfo(); + unsigned BarrierOpc = ST->hasSB() && !AlwaysUseISBDSB + ? 
AArch64::SpeculationBarrierSBEndBB
+                            : AArch64::SpeculationBarrierISBDSBEndBB;
+  if (MBBI == MBB.end() ||
+      (MBBI->getOpcode() != AArch64::SpeculationBarrierSBEndBB &&
+       MBBI->getOpcode() != AArch64::SpeculationBarrierISBDSBEndBB))
+    BuildMI(MBB, MBBI, DL, TII->get(BarrierOpc));
 }
 
 bool AArch64SLSHardening::runOnMachineFunction(MachineFunction &MF) {
@@ -91,12 +98,30 @@
   TRI = MF.getSubtarget().getRegisterInfo();
 
   bool Modified = false;
-  for (auto &MBB : MF)
+  for (auto &MBB : MF) {
     Modified |= hardenReturnsAndBRs(MBB);
+    Modified |= hardenBLRs(MBB);
+  }
   return Modified;
 }
 
+static bool isBLR(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AArch64::BLR:
+  case AArch64::BLRNoIP:
+    return true;
+  case AArch64::BLRAA:
+  case AArch64::BLRAB:
+  case AArch64::BLRAAZ:
+  case AArch64::BLRABZ:
+    llvm_unreachable("Currently, LLVM's code generator does not support "
+                     "producing BLRA* instructions. Therefore, there's no "
+                     "support in this pass for those instructions.");
+  }
+  return false;
+}
+
 bool AArch64SLSHardening::hardenReturnsAndBRs(MachineBasicBlock &MBB) const {
   if (!ST->hardenSlsRetBr())
     return false;
@@ -108,7 +133,244 @@
     NextMBBI = std::next(MBBI);
     if (MI.isReturn() || isIndirectBranchOpcode(MI.getOpcode())) {
       assert(MI.isTerminator());
-      insertSpeculationBarrier(MBB, std::next(MBBI), MI.getDebugLoc());
+      insertSpeculationBarrier(ST, MBB, std::next(MBBI), MI.getDebugLoc());
+      Modified = true;
+    }
+  }
+  return Modified;
+}
+
+static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_";
+
+static std::array<const char *, 29> SLSBLRThunkNames{
+    "__llvm_slsblr_thunk_x0",  "__llvm_slsblr_thunk_x1",
+    "__llvm_slsblr_thunk_x2",  "__llvm_slsblr_thunk_x3",
+    "__llvm_slsblr_thunk_x4",  "__llvm_slsblr_thunk_x5",
+    "__llvm_slsblr_thunk_x6",  "__llvm_slsblr_thunk_x7",
+    "__llvm_slsblr_thunk_x8",  "__llvm_slsblr_thunk_x9",
+    "__llvm_slsblr_thunk_x10", "__llvm_slsblr_thunk_x11",
+    "__llvm_slsblr_thunk_x12", "__llvm_slsblr_thunk_x13",
+    "__llvm_slsblr_thunk_x14", "__llvm_slsblr_thunk_x15",
+    // X16 and X17 are deliberately missing, as the mitigation requires those
+    // registers to not be used in BLR. See comment in ConvertBLRToBL for more
+    // details.
+    "__llvm_slsblr_thunk_x18", "__llvm_slsblr_thunk_x19",
+    "__llvm_slsblr_thunk_x20", "__llvm_slsblr_thunk_x21",
+    "__llvm_slsblr_thunk_x22", "__llvm_slsblr_thunk_x23",
+    "__llvm_slsblr_thunk_x24", "__llvm_slsblr_thunk_x25",
+    "__llvm_slsblr_thunk_x26", "__llvm_slsblr_thunk_x27",
+    "__llvm_slsblr_thunk_x28", "__llvm_slsblr_thunk_x29",
+    // X30 is deliberately missing, for similar reasons as X16 and X17 are
+    // missing.
+    "__llvm_slsblr_thunk_x31",
+};
+static std::array<unsigned, 29> SLSBLRThunkRegs{
+    AArch64::X0,  AArch64::X1,  AArch64::X2,  AArch64::X3,  AArch64::X4,
+    AArch64::X5,  AArch64::X6,  AArch64::X7,  AArch64::X8,  AArch64::X9,
+    AArch64::X10, AArch64::X11, AArch64::X12, AArch64::X13, AArch64::X14,
+    AArch64::X15, AArch64::X18, AArch64::X19, AArch64::X20, AArch64::X21,
+    AArch64::X22, AArch64::X23, AArch64::X24, AArch64::X25, AArch64::X26,
+    AArch64::X27, AArch64::X28, AArch64::FP,  AArch64::XZR};
+
+namespace {
+struct SLSBLRThunkInserter : ThunkInserter<SLSBLRThunkInserter> {
+  const char *getThunkPrefix() { return SLSBLRNamePrefix; }
+  bool mayUseThunk(const MachineFunction &MF) {
+    // FIXME: This could also check if there are any BLRs in the function
+    // to more accurately reflect if a thunk will be needed.
+    return MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr();
+  }
+  void insertThunks(MachineModuleInfo &MMI);
+  void populateThunk(MachineFunction &MF);
+};
+} // namespace
+
+void SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI) {
+  // FIXME: It probably would be possible to filter which thunks to produce
+  // based on which registers are actually used in BLR instructions in this
+  // function. But would that be a worthwhile optimization?
+  for (StringRef Name : SLSBLRThunkNames)
+    createThunkFunction(MMI, Name);
+}
+
+void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) {
+  // FIXME: How to better communicate Register number, rather than through
+  // name and lookup table?
+  assert(MF.getName().startswith(getThunkPrefix()));
+  int Index = -1;
+  for (int i = 0; i < (int)SLSBLRThunkNames.size(); ++i)
+    if (MF.getName() == SLSBLRThunkNames[i]) {
+      Index = i;
+      break;
+    }
+  assert(Index != -1);
+  Register ThunkReg = SLSBLRThunkRegs[Index];
+
+  const TargetInstrInfo *TII =
+      MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+  // Grab the entry MBB and erase any other blocks. O0 codegen appears to
+  // generate two bbs for the entry block.
+  MachineBasicBlock *Entry = &MF.front();
+  Entry->clear();
+  while (MF.size() > 1)
+    MF.erase(std::next(MF.begin()));
+
+  // These thunks need to consist of the following instructions:
+  //  __llvm_slsblr_thunk_xN:
+  //      BR xN
+  //      barrierInsts
+  Entry->addLiveIn(ThunkReg);
+  BuildMI(Entry, DebugLoc(), TII->get(AArch64::BR)).addReg(ThunkReg);
+  // Make sure the thunks do not make use of the SB extension in case there is
+  // a function somewhere that will call to it that for some reason disabled
+  // the SB extension locally on that function, even though it's enabled for
+  // the module otherwise. Therefore set AlwaysUseISBDSB to true.
+  insertSpeculationBarrier(&MF.getSubtarget<AArch64Subtarget>(), *Entry,
+                           Entry->end(), DebugLoc(), true /*AlwaysUseISBDSB*/);
+}
+
+MachineBasicBlock &
+AArch64SLSHardening::ConvertBLRToBL(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI) const {
+  // Transform a BLR to a BL as follows:
+  // Before:
+  //   |-----------------------------|
+  //   | ...                         |
+  //   | instI                       |
+  //   | BLR xN                      |
+  //   | instJ                       |
+  //   | ...                         |
+  //   |-----------------------------|
+  //
+  // After:
+  //   |-----------------------------|
+  //   | ...                         |
+  //   | instI                       |
+  //   | BL __llvm_slsblr_thunk_xN   |
+  //   | instJ                       |
+  //   | ...                         |
+  //   |-----------------------------|
+  //
+  //   __llvm_slsblr_thunk_xN:
+  //   |-----------------------------|
+  //   | BR xN                       |
+  //   | barrierInsts                |
+  //   |-----------------------------|
+  //
+  // The __llvm_slsblr_thunk_xN thunks are created by the SLSBLRThunkInserter.
+  // This function merely needs to transform BLR xN into BL
+  // __llvm_slsblr_thunk_xN.
+  //
+  // Since linkers are allowed to clobber X16 and X17 on function calls, the
+  // above mitigation only works if the original BLR instruction was not
+  // BLR X16 nor BLR X17. Code generation before must make sure that no BLR
+  // X16|X17 was produced if the mitigation is enabled.
+
+  MachineInstr &BLR = *MBBI;
+  assert(isBLR(BLR));
+  unsigned BLOpcode;
+  Register Reg;
+  bool RegIsKilled;
+  switch (BLR.getOpcode()) {
+  case AArch64::BLR:
+  case AArch64::BLRNoIP:
+    BLOpcode = AArch64::BL;
+    Reg = BLR.getOperand(0).getReg();
+    assert(Reg != AArch64::X16 && Reg != AArch64::X17 && Reg != AArch64::LR);
+    RegIsKilled = BLR.getOperand(0).isKill();
+    break;
+  case AArch64::BLRAA:
+  case AArch64::BLRAB:
+  case AArch64::BLRAAZ:
+  case AArch64::BLRABZ:
+    llvm_unreachable("BLRA instructions cannot yet be produced by LLVM, "
+                     "therefore there is no need to support them for now.");
+  default:
+    llvm_unreachable("unhandled BLR");
+  }
+  DebugLoc DL = BLR.getDebugLoc();
+
+  // If we'd like to support also BLRAA and BLRAB instructions, we'd need
+  // a lot more different kinds of thunks.
+  // For example, a
+  //
+  //    BLRAA xN, xM
+  //
+  // instruction probably would need to be transformed to something like:
+  //
+  //    BL __llvm_slsblraa_thunk_x<N>_x<M>
+  //
+  //  __llvm_slsblraa_thunk_x<N>_x<M>:
+  //      BRAA x<N>, x<M>
+  //      barrierInsts
+  //
+  // Given that about 30 different values of N are possible and about 30
+  // different values of M are possible in the above, with the current way
+  // of producing indirect thunks, we'd be producing about 30 times 30, i.e.
+  // about 900 thunks (where most might not be actually called). This would
+  // multiply further by two to support both BLRAA and BLRAB variants of those
+  // instructions.
+  // If we'd want to support this, we'd probably need to look into a different
+  // way to produce thunk functions, based on which variants are actually
+  // needed, rather than producing all possible variants.
+  // So far, LLVM never produces BLRA* instructions, so let's leave this
+  // for the future when LLVM can start producing BLRA* instructions.
+  MachineFunction &MF = *MBBI->getMF();
+  MCContext &Context = MBB.getParent()->getContext();
+  MCSymbol *Sym = Context.getOrCreateSymbol("__llvm_slsblr_thunk_x" +
+                                            utostr(Reg - AArch64::X0));
+
+  MachineInstr *BL = BuildMI(MBB, MBBI, DL, TII->get(BLOpcode)).addSym(Sym);
+
+  // Now copy the implicit operands from BLR to BL and copy other necessary
+  // info.
+  // However, both BLR and BL instructions implicitly use SP and implicitly
+  // define LR. Blindly copying implicit operands would result in the SP and LR
+  // operands being present multiple times. While this may not be too much of
+  // an issue, let's avoid that for cleanliness, by removing those implicit
+  // operands from the BL created above before we copy over all implicit
+  // operands from the BLR.
+  int ImpLROpIdx = -1;
+  int ImpSPOpIdx = -1;
+  for (unsigned OpIdx = BL->getNumExplicitOperands();
+       OpIdx < BL->getNumOperands(); OpIdx++) {
+    MachineOperand Op = BL->getOperand(OpIdx);
+    if (!Op.isReg())
+      continue;
+    if (Op.getReg() == AArch64::LR && Op.isDef())
+      ImpLROpIdx = OpIdx;
+    if (Op.getReg() == AArch64::SP && !Op.isDef())
+      ImpSPOpIdx = OpIdx;
+  }
+  assert(ImpLROpIdx != -1);
+  assert(ImpSPOpIdx != -1);
+  int FirstOpIdxToRemove = std::max(ImpLROpIdx, ImpSPOpIdx);
+  int SecondOpIdxToRemove = std::min(ImpLROpIdx, ImpSPOpIdx);
+  BL->RemoveOperand(FirstOpIdxToRemove);
+  BL->RemoveOperand(SecondOpIdxToRemove);
+  // Now copy over the implicit operands from the original BLR
+  BL->copyImplicitOps(MF, BLR);
+  MF.moveCallSiteInfo(&BLR, BL);
+  // Also add the register called in the BLR as being used in the called thunk.
+  BL->addOperand(MachineOperand::CreateReg(Reg, false /*isDef*/, true /*isImp*/,
+                                           RegIsKilled /*isKill*/));
+  // Remove BLR instruction
+  MBB.erase(MBBI);
+
+  return MBB;
+}
+
+bool AArch64SLSHardening::hardenBLRs(MachineBasicBlock &MBB) const {
+  if (!ST->hardenSlsBlr())
+    return false;
+  bool Modified = false;
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  MachineBasicBlock::iterator NextMBBI;
+  for (; MBBI != E; MBBI = NextMBBI) {
+    MachineInstr &MI = *MBBI;
+    NextMBBI = std::next(MBBI);
+    if (isBLR(MI)) {
+      ConvertBLRToBL(MBB, MBBI);
       Modified = true;
     }
   }
@@ -118,3 +380,60 @@
 FunctionPass *llvm::createAArch64SLSHardeningPass() {
   return new AArch64SLSHardening();
 }
+
+namespace {
+class AArch64IndirectThunks : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AArch64IndirectThunks() : MachineFunctionPass(ID) {}
+
+  StringRef getPassName() const override { return "AArch64 Indirect Thunks"; }
+
+  bool doInitialization(Module &M) override;
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    MachineFunctionPass::getAnalysisUsage(AU);
+    AU.addRequired<MachineModuleInfoWrapperPass>();
+    AU.addPreserved<MachineModuleInfoWrapperPass>();
+  }
+
+private:
+  std::tuple<SLSBLRThunkInserter> TIs;
+
+  // FIXME: When LLVM moves to C++17, these can become folds
+  template <typename... ThunkInserterT>
+  static void initTIs(Module &M,
+                      std::tuple<ThunkInserterT...> &ThunkInserters) {
+    (void)std::initializer_list<int>{
+        (std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...};
+  }
+  template <typename... ThunkInserterT>
+  static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
+                     std::tuple<ThunkInserterT...> &ThunkInserters) {
+    bool Modified = false;
+    (void)std::initializer_list<int>{
+        Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...};
+    return Modified;
+  }
+};
+
+} // end anonymous namespace
+
+char AArch64IndirectThunks::ID = 0;
+
+FunctionPass *llvm::createAArch64IndirectThunks() {
+  return new AArch64IndirectThunks();
+}
+
+bool AArch64IndirectThunks::doInitialization(Module &M) {
+  initTIs(M, TIs);
+  return false;
+}
+
+bool AArch64IndirectThunks::runOnMachineFunction(MachineFunction &MF) {
+  LLVM_DEBUG(dbgs() << getPassName() << '\n');
+  auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+  return runTIs(MMI, MF, TIs);
+}
Index: llvm/lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -211,6 +211,7 @@
   bool UseEL3ForTP = false;
   bool AllowTaggedGlobals = false;
   bool HardenSlsRetBr = false;
+  bool HardenSlsBlr = false;
   uint8_t MaxInterleaveFactor = 2;
   uint8_t VectorInsertExtractBaseCost = 3;
   uint16_t CacheLineSize = 0;
@@ -365,6 +366,7 @@
   }
 
   bool hardenSlsRetBr() const { return HardenSlsRetBr; }
+  bool hardenSlsBlr() const { return HardenSlsBlr; }
 
   bool useEL1ForTP() const { return UseEL1ForTP; }
   bool useEL2ForTP() const { return UseEL2ForTP; }
Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -636,6 +636,7 @@
   // info.
addPass(createAArch64SpeculationHardeningPass()); + addPass(createAArch64IndirectThunks()); addPass(createAArch64SLSHardeningPass()); if (TM->getOptLevel() != CodeGenOpt::None) { Index: llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp =================================================================== --- llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -773,17 +773,17 @@ return true; } -static unsigned getCallOpcode(const Function &CallerF, bool IsIndirect, +static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect, bool IsTailCall) { if (!IsTailCall) - return IsIndirect ? AArch64::BLR : AArch64::BL; + return IsIndirect ? getBLRCallOpcode(CallerF) : AArch64::BL; if (!IsIndirect) return AArch64::TCRETURNdi; // When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use // x16 or x17. - if (CallerF.hasFnAttribute("branch-target-enforcement")) + if (CallerF.getFunction().hasFnAttribute("branch-target-enforcement")) return AArch64::TCRETURNriBTI; return AArch64::TCRETURNri; @@ -819,7 +819,7 @@ if (!IsSibCall) CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN); - unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), true); + unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); MIB.add(Info.Callee); @@ -979,7 +979,7 @@ // Create a temporarily-floating call instruction so we can add the implicit // uses of arg registers. - unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), false); + unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false); auto MIB = MIRBuilder.buildInstrNoInsert(Opc); MIB.add(Info.Callee); Index: llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp =================================================================== --- llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -2890,7 +2890,7 @@ // TLS calls preserve all registers except those that absolutely must be // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be // silly). 
-  MIB.buildInstr(AArch64::BLR, {}, {Load})
+  MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
       .addDef(AArch64::X0, RegState::Implicit)
       .addRegMask(TRI.getTLSCallPreservedMask());
Index: llvm/test/CodeGen/AArch64/O0-pipeline.ll
===================================================================
--- llvm/test/CodeGen/AArch64/O0-pipeline.ll
+++ llvm/test/CodeGen/AArch64/O0-pipeline.ll
@@ -55,6 +55,7 @@
 ; CHECK-NEXT: Post-RA pseudo instruction expansion pass
 ; CHECK-NEXT: AArch64 pseudo instruction expansion pass
 ; CHECK-NEXT: AArch64 speculation hardening pass
+; CHECK-NEXT: AArch64 Indirect Thunks
 ; CHECK-NEXT: AArch64 sls hardening pass
 ; CHECK-NEXT: Analyze Machine Code For Garbage Collection
 ; CHECK-NEXT: Insert fentry calls
Index: llvm/test/CodeGen/AArch64/O3-pipeline.ll
===================================================================
--- llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -178,6 +178,7 @@
 ; CHECK-NEXT: AArch64 pseudo instruction expansion pass
 ; CHECK-NEXT: AArch64 load / store optimization pass
 ; CHECK-NEXT: AArch64 speculation hardening pass
+; CHECK-NEXT: AArch64 Indirect Thunks
 ; CHECK-NEXT: AArch64 sls hardening pass
 ; CHECK-NEXT: MachineDominator Tree Construction
 ; CHECK-NEXT: Machine Natural Loop Construction
Index: llvm/test/CodeGen/AArch64/speculation-hardening-sls-blr.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/speculation-hardening-sls-blr.mir
@@ -0,0 +1,58 @@
+# RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu \
+# RUN:     -start-before aarch64-sls-hardening \
+# RUN:     -stop-after aarch64-sls-hardening -o - %s \
+# RUN:   | FileCheck %s --check-prefixes=CHECK
+
+# Check that the BLR SLS hardening transforms a BLR into a BL with operands as
+# expected.
+--- |
+  $__llvm_slsblr_thunk_x8 = comdat any
+  @a = dso_local local_unnamed_addr global i32 (...)* null, align 8
+  @b = dso_local local_unnamed_addr global i32 0, align 4
+
+  define dso_local void @fn1() local_unnamed_addr "target-features"="+harden-sls-blr" {
+  entry:
+    %0 = load i32 ()*, i32 ()** bitcast (i32 (...)** @a to i32 ()**), align 8
+    %call = tail call i32 %0() nounwind
+    store i32 %call, i32* @b, align 4
+    ret void
+  }
+
+  ; Function Attrs: naked nounwind
+  define linkonce_odr hidden void @__llvm_slsblr_thunk_x8() naked nounwind comdat {
+  entry:
+    ret void
+  }
+...
+---
+name:            fn1
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: fn1
+  bb.0.entry:
+    liveins: $lr
+
+    early-clobber $sp = frame-setup STRXpre killed $lr, $sp, -16 ; :: (store 8 into %stack.0)
+    frame-setup CFI_INSTRUCTION def_cfa_offset 16
+    frame-setup CFI_INSTRUCTION offset $w30, -16
+    renamable $x8 = ADRP target-flags(aarch64-page) @a
+    renamable $x8 = LDRXui killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @a :: (dereferenceable load 8 from `i32 ()** bitcast (i32 (...)** @a to i32 ()**)`)
+    BLRNoIP killed renamable $x8, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0
+    ; CHECK: BL <mcsymbol __llvm_slsblr_thunk_x8>, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp, implicit-def $w0, implicit killed $x8
+    renamable $x8 = ADRP target-flags(aarch64-page) @b
+    STRWui killed renamable $w0, killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @b :: (store 4 into @b)
+    early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 16 ; :: (load 8 from %stack.0)
+    RET undef $lr
+
+
+...
+--- +name: __llvm_slsblr_thunk_x8 +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x8 + + BR $x8 + SpeculationBarrierISBDSBEndBB +... Index: llvm/test/CodeGen/AArch64/speculation-hardening-sls.ll =================================================================== --- llvm/test/CodeGen/AArch64/speculation-hardening-sls.ll +++ llvm/test/CodeGen/AArch64/speculation-hardening-sls.ll @@ -1,5 +1,6 @@ -; RUN: llc -mattr=harden-sls-retbr -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,ISBDSB -; RUN: llc -mattr=harden-sls-retbr -mattr=+sb -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,SB +; RUN: llc -mattr=harden-sls-retbr,harden-sls-blr -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,HARDEN,ISBDSB +; RUN: llc -mattr=harden-sls-retbr,harden-sls-blr -mattr=+sb -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,HARDEN,SB +; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,NOHARDEN ; Function Attrs: norecurse nounwind readnone @@ -24,33 +25,39 @@ ; ISBDSB-NEXT: dsb sy ; ISBDSB-NEXT: isb ; SB-NEXT: {{ sb$}} +; CHECK-NEXT: .Lfunc_end } @__const.indirect_branch.ptr = private unnamed_addr constant [2 x i8*] [i8* blockaddress(@indirect_branch, %return), i8* blockaddress(@indirect_branch, %l2)], align 8 ; Function Attrs: norecurse nounwind readnone define dso_local i32 @indirect_branch(i32 %a, i32 %b, i32 %i) { +; CHECK-LABEL: indirect_branch: entry: %idxprom = sext i32 %i to i64 %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @__const.indirect_branch.ptr, i64 0, i64 %idxprom %0 = load i8*, i8** %arrayidx, align 8 indirectbr i8* %0, [label %return, label %l2] +; CHECK: br x +; ISBDSB-NEXT: dsb sy +; ISBDSB-NEXT: isb +; SB-NEXT: {{ sb$}} l2: ; preds = %entry br label %return +; CHECK: {{ret$}} +; ISBDSB-NEXT: dsb sy +; ISBDSB-NEXT: isb +; SB-NEXT: {{ sb$}} return: ; preds = %entry, %l2 %retval.0 = phi i32 [ 1, %l2 ], [ 0, %entry ] ret i32 %retval.0 -; CHECK-LABEL: indirect_branch: -; CHECK: br x -; ISBDSB-NEXT: dsb sy -; ISBDSB-NEXT: isb -; SB-NEXT: {{ sb$}} ; CHECK: {{ret$}} ; ISBDSB-NEXT: dsb sy ; ISBDSB-NEXT: isb ; SB-NEXT: {{ sb$}} +; CHECK-NEXT: .Lfunc_end } ; Check that RETAA and RETAB instructions are also protected as expected. @@ -61,6 +68,7 @@ ; ISBDSB-NEXT: dsb sy ; ISBDSB-NEXT: isb ; SB-NEXT: {{ sb$}} +; CHECK-NEXT: .Lfunc_end ret i32 %a } @@ -71,6 +79,7 @@ ; ISBDSB-NEXT: dsb sy ; ISBDSB-NEXT: isb ; SB-NEXT: {{ sb$}} +; CHECK-NEXT: .Lfunc_end ret i32 %a } @@ -102,3 +111,72 @@ ; SB-NEXT: {{ sb$}} ; CHECK-NEXT: .Lfunc_end } + +define dso_local i32 @indirect_call( +i32 (...)* nocapture %f1, i32 (...)* nocapture %f2) { +entry: +; CHECK-LABEL: indirect_call: + %callee.knr.cast = bitcast i32 (...)* %f1 to i32 ()* + %call = tail call i32 %callee.knr.cast() +; HARDEN: bl {{__llvm_slsblr_thunk_x[0-9]+$}} + %callee.knr.cast1 = bitcast i32 (...)* %f2 to i32 ()* + %call2 = tail call i32 %callee.knr.cast1() +; HARDEN: bl {{__llvm_slsblr_thunk_x[0-9]+$}} + %add = add nsw i32 %call2, %call + ret i32 %add +; CHECK: .Lfunc_end +} + +; verify calling through a function pointer. 
+@a = dso_local local_unnamed_addr global i32 (...)* null, align 8 +@b = dso_local local_unnamed_addr global i32 0, align 4 +define dso_local void @indirect_call_global() local_unnamed_addr { +; CHECK-LABEL: indirect_call_global: +entry: + %0 = load i32 ()*, i32 ()** bitcast (i32 (...)** @a to i32 ()**), align 8 + %call = tail call i32 %0() nounwind +; HARDEN: bl {{__llvm_slsblr_thunk_x[0-9]+$}} + store i32 %call, i32* @b, align 4 + ret void +; CHECK: .Lfunc_end +} + +; Verify that neither x16 nor x17 are used when the BLR mitigation is enabled, +; as a linker is allowed to clobber x16 or x17 on calls, which would break the +; correct execution of the code sequence produced by the mitigation. +; The below test carefully increases register pressure to persuade code +; generation to produce a BLR x16. Yes, that is a bit fragile. +define i64 @check_x16(i64 (i8*, i64, i64, i64, i64, i64, i64, i64)** nocapture readonly %fp, i64 (i8*, i64, i64, i64, i64, i64, i64, i64)** nocapture readonly %fp2) "target-features"="+neon,+reserve-x10,+reserve-x11,+reserve-x12,+reserve-x13,+reserve-x14,+reserve-x15,+reserve-x18,+reserve-x20,+reserve-x21,+reserve-x22,+reserve-x23,+reserve-x24,+reserve-x25,+reserve-x26,+reserve-x27,+reserve-x28,+reserve-x30,+reserve-x9" { +entry: +; CHECK-LABEL: check_x16: + %0 = load i64 (i8*, i64, i64, i64, i64, i64, i64, i64)*, i64 (i8*, i64, i64, i64, i64, i64, i64, i64)** %fp, align 8 + %1 = bitcast i64 (i8*, i64, i64, i64, i64, i64, i64, i64)** %fp2 to i8** + %2 = load i8*, i8** %1, align 8 + %call = call i64 %0(i8* %2, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0) + %3 = load i64 (i8*, i64, i64, i64, i64, i64, i64, i64)*, i64 (i8*, i64, i64, i64, i64, i64, i64, i64)** %fp2, align 8 + %4 = bitcast i64 (i8*, i64, i64, i64, i64, i64, i64, i64)** %fp to i8** + %5 = load i8*, i8** %4, align 8;, !tbaa !2 + %call1 = call i64 %3(i8* %5, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0) +; NOHARDEN: blr x16 +; ISBDSB-NOT: bl __llvm_slsblr_thunk_x16 +; SB-NOT: bl __llvm_slsblr_thunk_x16 +; CHECK + %add = add nsw i64 %call1, %call + ret i64 %add +; CHECK: .Lfunc_end +} + +; HARDEN-label: __llvm_slsblr_thunk_x0: +; HARDEN: br x0 +; ISBDSB-NEXT: dsb sy +; ISBDSB-NEXT: isb +; SB-NEXT: dsb sy +; SB-NEXT: isb +; HARDEN-NEXT: .Lfunc_end +; HARDEN-label: __llvm_slsblr_thunk_x19: +; HARDEN: br x19 +; ISBDSB-NEXT: dsb sy +; ISBDSB-NEXT: isb +; SB-NEXT: dsb sy +; SB-NEXT: isb +; HARDEN-NEXT: .Lfunc_end
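
For reference, this is the code shape the BLR mitigation aims for; the snippet below is an illustrative sketch based on the tests above (it is not output produced by this patch, and the concrete register, x8 here, depends on register allocation). When compiling with -mattr=+harden-sls-blr, an indirect call through a function pointer no longer uses BLR directly; it calls the per-register thunk instead, and the thunk always ends in the DSB+ISB barrier sequence, even when the SB extension is available:

    // indirect call through @a, hardened:
    adrp    x8, a
    ldr     x8, [x8, :lo12:a]
    bl      __llvm_slsblr_thunk_x8      // instead of: blr x8

    __llvm_slsblr_thunk_x8:
    br      x8
    dsb     sy                          // thunks always use DSB+ISB,
    isb                                 // never the SB instruction

Because linkers may clobber x16 and x17 on calls (veneers), and the thunk relies on the called register surviving the BL, code generation with the mitigation enabled only selects BLRNoIP (operand class GPR64noip), so no __llvm_slsblr_thunk_x16/x17 is ever needed, as the check_x16 test above verifies.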