diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -767,6 +767,10 @@
 // ptr1 = tagp(ptr0, baseptr, tag_offset) returns a pointer where
 // * address is the address in ptr0
 // * tag is a function of (tag in baseptr, tag_offset).
+// ** Beware, this is not the same function as implemented by the ADDG instruction!
+// Backend optimizations may change tag_offset; the only guarantee is that calls
+// to tagp with the same pair of (baseptr, tag_offset) will produce pointers
+// with the same tag value, assuming the set of excluded tags has not changed.
 // Address bits in baseptr and tag bits in ptr0 are ignored.
 // When offset between ptr0 and baseptr is a compile time constant, this can be emitted as
 // ADDG ptr1, baseptr, (ptr0 - baseptr), tag_offset
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1070,9 +1070,13 @@
   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
     return;
 
-  // Set tagged base pointer to the bottom of the stack frame.
+  // Set tagged base pointer to the requested stack slot.
   // Ideally it should match SP value after prologue.
-  AFI->setTaggedBasePointerOffset(MFI.getStackSize());
+  Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
+  if (TBPI)
+    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
+  else
+    AFI->setTaggedBasePointerOffset(MFI.getStackSize());
 
   const StackOffset &SVEStackSize = getSVEStackSize(MF);
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -128,10 +128,13 @@
   /// that must be forwarded to every musttail call.
   SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
 
-  // Offset from SP-at-entry to the tagged base pointer.
-  // Tagged base pointer is set up to point to the first (lowest address) tagged
-  // stack slot.
-  unsigned TaggedBasePointerOffset = 0;
+  /// FrameIndex for the tagged base pointer.
+  Optional<int> TaggedBasePointerIndex;
+
+  /// Offset from SP-at-entry to the tagged base pointer.
+  /// Tagged base pointer is set up to point to the first (lowest address)
+  /// tagged stack slot.
+  unsigned TaggedBasePointerOffset;
 
   /// OutliningStyle denotes, if a function was outined, how it was outlined,
   /// e.g. Tail Call, Thunk, or Function if none apply.
@@ -343,6 +346,11 @@
     return ForwardedMustTailRegParms;
   }
 
+  Optional<int> getTaggedBasePointerIndex() const {
+    return TaggedBasePointerIndex;
+  }
+  void setTaggedBasePointerIndex(int Index) { TaggedBasePointerIndex = Index; }
+
   unsigned getTaggedBasePointerOffset() const {
     return TaggedBasePointerOffset;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
--- a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
@@ -13,7 +13,6 @@
 #include "AArch64InstrInfo.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -50,6 +49,12 @@
            "apply unchecked-ld-st when the target is definitely within range"),
         clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st")));
 
+static cl::opt<bool>
+    ClFirstSlot("stack-tagging-first-slot-opt", cl::Hidden, cl::init(true),
+                cl::ZeroOrMore,
+                cl::desc("Apply first slot optimization for stack tagging "
+                         "(eliminate ADDG Rt, Rn, 0, 0)."));
+
 namespace {
 
 class AArch64StackTaggingPreRA : public MachineFunctionPass {
@@ -71,6 +76,7 @@
   bool mayUseUncheckedLoadStore();
   void uncheckUsesOf(unsigned TaggedReg, int FI);
   void uncheckLoadsAndStores();
+  Optional<int> findFirstSlotCandidate();
 
   bool runOnMachineFunction(MachineFunction &Func) override;
 
   StringRef getPassName() const override {
@@ -197,6 +203,141 @@
   }
 }
 
+struct SlotWithTag {
+  int FI;
+  int Tag;
+  SlotWithTag(int FI, int Tag) : FI(FI), Tag(Tag) {}
+  explicit SlotWithTag(const MachineInstr &MI)
+      : FI(MI.getOperand(1).getIndex()), Tag(MI.getOperand(4).getImm()) {}
+  bool operator==(const SlotWithTag &Other) const {
+    return FI == Other.FI && Tag == Other.Tag;
+  }
+};
+
+namespace llvm {
+template <> struct DenseMapInfo<SlotWithTag> {
+  static inline SlotWithTag getEmptyKey() { return {-2, -2}; }
+  static inline SlotWithTag getTombstoneKey() { return {-3, -3}; }
+  static unsigned getHashValue(const SlotWithTag &V) {
+    return hash_combine(DenseMapInfo<int>::getHashValue(V.FI),
+                        DenseMapInfo<int>::getHashValue(V.Tag));
+  }
+  static bool isEqual(const SlotWithTag &A, const SlotWithTag &B) {
+    return A == B;
+  }
+};
+} // namespace llvm
+
+static bool isSlotPreAllocated(MachineFrameInfo *MFI, int FI) {
+  return MFI->getUseLocalStackAllocationBlock() &&
+         MFI->isObjectPreAllocated(FI);
+}
+
+// Pin one of the tagged slots to offset 0 from the tagged base pointer.
+// This would make its address available in a virtual register (IRG's def), as
+// opposed to requiring an ADDG instruction to materialize. This effectively
+// eliminates a vreg (by replacing it with direct uses of IRG, which is usually
+// live almost everywhere anyway), and therefore needs to happen before
+// regalloc.
+Optional<int> AArch64StackTaggingPreRA::findFirstSlotCandidate() {
+  // Find the best (FI, Tag) pair to pin to offset 0.
+  // Looking at the possible uses of a tagged address, the advantage of pinning
+  // is:
+  // - COPY to physical register.
+  //   Does not matter, this would trade a MOV instruction for an ADDG.
+  // - ST*G matter, but those mostly appear near the function prologue where all
+  //   the tagged addresses need to be materialized anyway; also, counting ST*G
+  //   uses would overweight large allocas that require more than one ST*G
+  //   instruction.
+  // - Load/Store instructions in the address operand do not require a tagged
+  //   pointer, so they also do not benefit. These operands have already been
+  //   eliminated (see uncheckLoadsAndStores) so all remaining load/store
+  //   instructions count.
+  // - Any other instruction may benefit from being pinned to offset 0.
+  LLVM_DEBUG(dbgs() << "AArch64StackTaggingPreRA::findFirstSlotCandidate\n");
+  if (!ClFirstSlot)
+    return None;
+
+  DenseMap<SlotWithTag, int> RetagScore;
+  SlotWithTag MaxScoreST{-1, -1};
+  int MaxScore = -1;
+  for (auto *I : ReTags) {
+    SlotWithTag ST{*I};
+    if (isSlotPreAllocated(MFI, ST.FI))
+      continue;
+
+    Register RetagReg = I->getOperand(0).getReg();
+    if (!Register::isVirtualRegister(RetagReg))
+      continue;
+
+    int Score = 0;
+    SmallVector<Register, 8> WorkList;
+    WorkList.push_back(RetagReg);
+
+    while (!WorkList.empty()) {
+      Register UseReg = WorkList.back();
+      WorkList.pop_back();
+      for (auto &UseI : MRI->use_instructions(UseReg)) {
+        unsigned Opcode = UseI.getOpcode();
+        if (Opcode == AArch64::STGOffset || Opcode == AArch64::ST2GOffset ||
+            Opcode == AArch64::STZGOffset || Opcode == AArch64::STZ2GOffset ||
+            Opcode == AArch64::STGPi || Opcode == AArch64::STGloop ||
+            Opcode == AArch64::STZGloop || Opcode == AArch64::STGloop_wback ||
+            Opcode == AArch64::STZGloop_wback)
+          continue;
+        if (UseI.isCopy()) {
+          Register DstReg = UseI.getOperand(0).getReg();
+          if (Register::isVirtualRegister(DstReg))
+            WorkList.push_back(DstReg);
+          continue;
+        }
+        LLVM_DEBUG(dbgs() << "[" << ST.FI << ":" << ST.Tag << "] use of %"
+                          << Register::virtReg2Index(UseReg) << " in " << UseI
+                          << "\n");
+        Score++;
+      }
+    }
+
+    int TotalScore = RetagScore[ST] += Score;
+    if (TotalScore > MaxScore ||
+        (TotalScore == MaxScore && ST.FI > MaxScoreST.FI)) {
+      MaxScore = TotalScore;
+      MaxScoreST = ST;
+    }
+  }
+
+  if (MaxScoreST.FI < 0)
+    return None;
+
+  // If FI's tag is already 0, we are done.
+  if (MaxScoreST.Tag == 0)
+    return MaxScoreST.FI;
+
+  // Otherwise, find a random victim pair (FI, Tag) where Tag == 0.
+  SlotWithTag SwapST{-1, -1};
+  for (auto *I : ReTags) {
+    SlotWithTag ST{*I};
+    if (ST.Tag == 0) {
+      SwapST = ST;
+      break;
+    }
+  }
+
+  // Swap tags between the victim and the highest scoring pair.
+  // If SwapST is still (-1, -1), that's fine, too - we'll simply take tag 0 for
+  // the highest score slot without changing anything else.
+  for (auto *&I : ReTags) {
+    SlotWithTag ST{*I};
+    MachineOperand &TagOp = I->getOperand(4);
+    if (ST == MaxScoreST) {
+      TagOp.setImm(0);
+    } else if (ST == SwapST) {
+      TagOp.setImm(MaxScoreST.Tag);
+    }
+  }
+  return MaxScoreST.FI;
+}
+
 bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
   MF = &Func;
   MRI = &MF->getRegInfo();
@@ -225,11 +366,35 @@
     }
   }
 
+  // Take over from SSP. It does nothing for tagged slots, and should not really
+  // have been enabled in the first place.
+  for (int FI : TaggedSlots)
+    MFI->setObjectSSPLayout(FI, MachineFrameInfo::SSPLK_None);
+
   if (ReTags.empty())
     return false;
 
   if (mayUseUncheckedLoadStore())
     uncheckLoadsAndStores();
 
+  // Find a slot that is used with zero tag offset, like ADDG #fi, 0.
+  // If the base tagged pointer is set up to the address of this slot,
+  // the ADDG instruction can be eliminated.
+  Optional<int> BaseSlot = findFirstSlotCandidate();
+  if (BaseSlot)
+    AFI->setTaggedBasePointerIndex(*BaseSlot);
+
+  for (auto *I : ReTags) {
+    int FI = I->getOperand(1).getIndex();
+    int Tag = I->getOperand(4).getImm();
+    Register Base = I->getOperand(3).getReg();
+    if (Tag == 0 && FI == BaseSlot) {
+      BuildMI(*I->getParent(), I, {}, TII->get(AArch64::COPY),
+              I->getOperand(0).getReg())
+          .addReg(Base);
+      I->eraseFromParent();
+    }
+  }
+
   return true;
 }
diff --git a/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll b/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll
--- a/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll
+++ b/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll
@@ -3,12 +3,32 @@
 define i8* @small_alloca() {
 entry:
 ; CHECK-LABEL: small_alloca:
+; CHECK: irg x0, sp{{$}}
+; CHECK: ret
+  %a = alloca i8, align 16
+  %q = call i8* @llvm.aarch64.irg.sp(i64 0)
+  %q1 = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)
+  ret i8* %q1
+}
+
+@sink = global i8* null, align 8
+
+; Check that IRG is pinned to %b because the store instruction needs
+; the address in a non-fixed physical register and can benefit from it
+; being equal to the base tagged pointer.
+define i8* @small_allocas() {
+entry:
+; CHECK-LABEL: small_allocas:
 ; CHECK: irg [[R:x[0-9]+]], sp{{$}}
-; CHECK-NEXT: addg x0, [[R]], #0, #1
+; CHECK: addg x0, [[R]], #16, #1
+; CHECK: str [[R]], {{.*}}sink
 ; CHECK: ret
   %a = alloca i8, align 16
+  %b = alloca i8, align 16
   %q = call i8* @llvm.aarch64.irg.sp(i64 0)
   %q1 = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)
+  %q2 = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %q, i64 2)
+  store i8* %q2, i8** @sink, align 8
   ret i8* %q1
 }
@@ -16,16 +36,15 @@
 define void @huge_allocas() {
 entry:
 ; CHECK-LABEL: huge_allocas:
-; CHECK: irg [[R:x[0-9]+]], sp{{$}}
-; CHECK: add [[TMP:x[0-9]+]], [[R]], #3088
+; CHECK: irg x1, sp{{$}}
+; CHECK: add [[TMP:x[0-9]+]], x1, #3088
 ; CHECK: addg x0, [[TMP]], #1008, #1
-; CHECK: addg x1, [[R]], #0, #2
 ; CHECK: bl use2
   %a = alloca i8, i64 4096, align 16
   %b = alloca i8, i64 4096, align 16
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
   %a_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %base, i64 1)
-  %b_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %base, i64 2)
+  %b_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %base, i64 0)
   call void @use2(i8* %a_t, i8* %b_t)
   ret void
 }
@@ -37,8 +56,7 @@
 ; CHECK-LABEL: realign:
 ; CHECK: mov x29, sp
 ; CHECK: and sp, x{{[0-9]*}}, #0xffffffffffffffc0
-; CHECK: irg [[R:x[0-9]+]], sp{{$}}
-; CHECK: addg x0, [[R]], #0, #1
+; CHECK: irg x0, sp{{$}}
 ; CHECK: bl use
   %a = alloca i8, i64 4096, align 64
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
@@ -52,10 +70,9 @@
 define void @dynamic_alloca(i64 %size) {
 entry:
 ; CHECK-LABEL: dynamic_alloca:
-; CHECK: sub [[R:x[0-9]+]], x29, #[[OFS:[0-9]+]]
-; CHECK: irg [[R]], [[R]]
-; CHECK: addg x1, [[R]], #0, #1
-; CHECK: sub x0, x29, #[[OFS]]
+; CHECK: sub x1, x29, #[[OFS:[0-9]+]]
+; CHECK: irg x1, x1
+; CHECK-DAG: sub x0, x29, #[[OFS]]
 ; CHECK: bl use2
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
   %a = alloca i128, i64 %size, align 16
@@ -74,9 +91,9 @@
 ; CHECK-LABEL: dynamic_alloca_and_realign:
 ; CHECK: and sp, x{{.*}}, #0xffffffffffffffc0
 ; CHECK: mov x19, sp
-; CHECK: irg [[R:x[0-9]+]], x19
-; CHECK: addg x1, [[R]], #[[OFS:[0-9]+]], #1
-; CHECK: add x0, x19, #[[OFS]]
+; CHECK: add x1, x19, #[[OFS:[0-9]+]]
+; CHECK: irg x1, x1
+; CHECK-DAG: add x0, x19, #[[OFS]]
 ; CHECK: bl use2
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
   %a = alloca i128, i64 %size, align 64
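
Note: as a self-contained illustration of the new behavior, the small_allocas pattern above boils down to the module below (a sketch; the RUN invocation is assumed to be something like llc -mtriple=aarch64 -mattr=+mte, matching the rest of irg_sp_tagp.ll). Both allocas share one IRG base. The return of %q1 is only a COPY into a fixed physical register, which the scoring ignores, while the store of %q2 needs the tagged address of %b in an ordinary register, so %b scores higher: its tagp is given tag offset 0, the tagged base pointer is pinned to the %b slot (the "ADDG Rt, Rn, 0, 0" disappears in favor of the IRG def itself), and only %a still needs an ADDG, now at address offset #16 and tag offset #1.

declare i8* @llvm.aarch64.irg.sp(i64)
declare i8* @llvm.aarch64.tagp.p0i8(i8*, i8*, i64)

@sink = global i8* null, align 8

define i8* @small_allocas() {
entry:
  ; Two tagged slots sharing a single IRG base pointer.
  %a = alloca i8, align 16
  %b = alloca i8, align 16
  %q = call i8* @llvm.aarch64.irg.sp(i64 0)
  %q1 = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)
  %q2 = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %q, i64 2)
  ; The escaping store makes %b the highest-scoring slot.
  store i8* %q2, i8** @sink, align 8
  ret i8* %q1
}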