diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -767,6 +767,10 @@
 // ptr1 = tagp(ptr0, baseptr, tag_offset) returns a pointer where
 // * address is the address in ptr0
 // * tag is a function of (tag in baseptr, tag_offset).
+// ** Beware, this is not the same function as implemented by the ADDG instruction!
+// Backend optimizations may change tag_offset; the only guarantee is that calls
+// to tagp with the same pair of (baseptr, tag_offset) will produce pointers
+// with the same tag value, assuming the set of excluded tags has not changed.
 // Address bits in baseptr and tag bits in ptr0 are ignored.
 // When offset between ptr0 and baseptr is a compile time constant, this can be emitted as
 // ADDG ptr1, baseptr, (ptr0 - baseptr), tag_offset
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -113,6 +113,10 @@
     return StackId != TargetStackID::SVEVector;
   }
 
+  void
+  orderFrameObjects(const MachineFunction &MF,
+                    SmallVectorImpl<int> &ObjectsToAllocate) const override;
+
 private:
   bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
                                       uint64_t StackBumpBytes) const;
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -176,6 +176,10 @@
     cl::desc("merge settag instruction in function epilog"), cl::init(true),
     cl::Hidden);
 
+static cl::opt<bool> OrderFrameObjects("aarch64-order-frame-objects",
+                                       cl::desc("sort stack allocations"),
+                                       cl::init(true), cl::Hidden);
+
 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
 
 /// Returns the argument pop size.
@@ -1058,9 +1062,13 @@
   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
     return;
 
-  // Set tagged base pointer to the bottom of the stack frame.
+  // Set tagged base pointer to the requested stack slot.
   // Ideally it should match SP value after prologue.
-  AFI->setTaggedBasePointerOffset(MFI.getStackSize());
+  Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
+  if (TBPI)
+    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
+  else
+    AFI->setTaggedBasePointerOffset(MFI.getStackSize());
 
   const StackOffset &SVEStackSize = getSVEStackSize(MF);
 
@@ -3223,3 +3231,162 @@
   return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
                  getStackAlign());
 }
+
+namespace {
+struct FrameObject {
+  bool IsValid = false;
+  // Index of the object in MFI.
+  int ObjectIndex = 0;
+  // Group ID this object belongs to.
+  int GroupIndex = -1;
+  // This object should be placed first (closest to SP).
+  bool ObjectFirst = false;
+  // This object's group (which always contains the object with
+  // ObjectFirst==true) should be placed first.
+  bool GroupFirst = false;
+};
+
+class GroupBuilder {
+  SmallVector<int, 8> CurrentMembers;
+  int NextGroupIndex = 0;
+  std::vector<FrameObject> &Objects;
+
+public:
+  GroupBuilder(std::vector<FrameObject> &Objects) : Objects(Objects) {}
+  void AddMember(int Index) { CurrentMembers.push_back(Index); }
+  void EndCurrentGroup() {
+    if (CurrentMembers.size() > 1) {
+      // Create a new group with the current member list. This might remove
+      // them from their pre-existing groups. That's OK, dealing with
+      // overlapping groups is too hard and unlikely to make a difference.
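+      // (A "group" here is a run of slots tagged by adjacent ST*G
+      // instructions; keeping its members contiguous on the stack is what
+      // lets the backend merge their tag-setting loops later, see
+      // settag-merge-order.ll.)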
+      LLVM_DEBUG(dbgs() << "group:");
+      for (int Index : CurrentMembers) {
+        Objects[Index].GroupIndex = NextGroupIndex;
+        LLVM_DEBUG(dbgs() << " " << Index);
+      }
+      LLVM_DEBUG(dbgs() << "\n");
+      NextGroupIndex++;
+    }
+    CurrentMembers.clear();
+  }
+};
+
+bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
+  // Objects at a lower index are closer to FP; objects at a higher index are
+  // closer to SP.
+  //
+  // For consistency in our comparison, all invalid objects are placed
+  // at the end. This also allows us to stop walking when we hit the
+  // first invalid item after it's all sorted.
+  //
+  // The "first" object goes first (closest to SP), followed by the members of
+  // the "first" group.
+  //
+  // The rest are sorted by the group index to keep the groups together.
+  // Higher numbered groups are more likely to be around longer (i.e. untagged
+  // in the function epilogue and not at some earlier point). Place them closer
+  // to SP.
+  //
+  // If all else is equal, sort by the object index to keep the objects in the
+  // original order.
+  return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
+                         A.ObjectIndex) <
+         std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
+                         B.ObjectIndex);
+}
+} // namespace
+
+void AArch64FrameLowering::orderFrameObjects(
+    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
+  if (!OrderFrameObjects || ObjectsToAllocate.empty())
+    return;
+
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
+  for (auto &Obj : ObjectsToAllocate) {
+    FrameObjects[Obj].IsValid = true;
+    FrameObjects[Obj].ObjectIndex = Obj;
+  }
+
+  // Identify stack slots that are tagged at the same time.
+  GroupBuilder GB(FrameObjects);
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      if (MI.isDebugInstr())
+        continue;
+      int OpIndex;
+      switch (MI.getOpcode()) {
+      case AArch64::STGloop:
+      case AArch64::STZGloop:
+        OpIndex = 3;
+        break;
+      case AArch64::STGOffset:
+      case AArch64::STZGOffset:
+      case AArch64::ST2GOffset:
+      case AArch64::STZ2GOffset:
+        OpIndex = 1;
+        break;
+      default:
+        OpIndex = -1;
+      }
+
+      int TaggedFI = -1;
+      if (OpIndex >= 0) {
+        const MachineOperand &MO = MI.getOperand(OpIndex);
+        if (MO.isFI()) {
+          int FI = MO.getIndex();
+          if (FI >= 0 && FI < MFI.getObjectIndexEnd() &&
+              FrameObjects[FI].IsValid)
+            TaggedFI = FI;
+        }
+      }
+
+      // If this is a stack tagging instruction for a slot that is not part of
+      // a group yet, either start a new group or add it to the current one.
+      if (TaggedFI >= 0)
+        GB.AddMember(TaggedFI);
+      else
+        GB.EndCurrentGroup();
+    }
+    // Groups should never span multiple basic blocks.
+    GB.EndCurrentGroup();
+  }
+
+  // If the function's tagged base pointer is pinned to a stack slot, we want
+  // to put that slot first when possible. This will likely place it at SP + 0,
+  // and save one instruction when generating the base pointer because IRG does
+  // not allow an immediate offset.
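+  // For example, when the pinned slot ends up at SP + 0, the base pointer is
+  // just the IRG result ("irg x0, sp") and no trailing "addg xN, xM, #0, #tag"
+  // is needed (see the small_alloca test in irg_sp_tagp.ll).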
+  const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
+  Optional<int> TBPI = AFI.getTaggedBasePointerIndex();
+  if (TBPI) {
+    FrameObjects[*TBPI].ObjectFirst = true;
+    FrameObjects[*TBPI].GroupFirst = true;
+    int FirstGroupIndex = FrameObjects[*TBPI].GroupIndex;
+    if (FirstGroupIndex >= 0)
+      for (FrameObject &Object : FrameObjects)
+        if (Object.GroupIndex == FirstGroupIndex)
+          Object.GroupFirst = true;
+  }
+
+  llvm::stable_sort(FrameObjects, FrameObjectCompare);
+
+  int i = 0;
+  for (auto &Obj : FrameObjects) {
+    // All invalid items are sorted at the end, so it's safe to stop.
+    if (!Obj.IsValid)
+      break;
+    ObjectsToAllocate[i++] = Obj.ObjectIndex;
+  }
+
+  LLVM_DEBUG(dbgs() << "Final frame order:\n"; for (auto &Obj
+                                                    : FrameObjects) {
+    if (!Obj.IsValid)
+      break;
+    dbgs() << " " << Obj.ObjectIndex << ": group " << Obj.GroupIndex;
+    if (Obj.ObjectFirst)
+      dbgs() << ", first";
+    if (Obj.GroupFirst)
+      dbgs() << ", group-first";
+    dbgs() << "\n";
+  });
+}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -128,10 +128,13 @@
   /// that must be forwarded to every musttail call.
   SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
 
-  // Offset from SP-at-entry to the tagged base pointer.
-  // Tagged base pointer is set up to point to the first (lowest address)
-  // tagged stack slot.
-  unsigned TaggedBasePointerOffset = 0;
+  /// FrameIndex for the tagged base pointer.
+  Optional<int> TaggedBasePointerIndex;
+
+  /// Offset from SP-at-entry to the tagged base pointer.
+  /// Tagged base pointer is set up to point to the first (lowest address)
+  /// tagged stack slot.
+  unsigned TaggedBasePointerOffset;
 
   /// OutliningStyle denotes, if a function was outlined, how it was outlined,
   /// e.g. Tail Call, Thunk, or Function if none apply.
@@ -343,6 +346,13 @@
     return ForwardedMustTailRegParms;
   }
 
+  Optional<int> getTaggedBasePointerIndex() const {
+    return TaggedBasePointerIndex;
+  }
+  void setTaggedBasePointerIndex(int Index) {
+    TaggedBasePointerIndex = Index;
+  }
+
   unsigned getTaggedBasePointerOffset() const {
     return TaggedBasePointerOffset;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
--- a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
@@ -13,7 +13,6 @@
 #include "AArch64InstrInfo.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -50,6 +49,12 @@
         "apply unchecked-ld-st when the target is definitely within range"),
     clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st")));
 
+static cl::opt<bool>
+    ClFirstSlot("stack-tagging-first-slot-opt", cl::Hidden, cl::init(true),
+                cl::ZeroOrMore,
+                cl::desc("Apply first slot optimization for stack tagging "
+                         "(eliminate ADDG Rt, Rn, 0, 0)."));
+
 namespace {
 
 class AArch64StackTaggingPreRA : public MachineFunctionPass {
@@ -71,6 +76,7 @@
   bool mayUseUncheckedLoadStore();
   void uncheckUsesOf(unsigned TaggedReg, int FI);
   void uncheckLoadsAndStores();
+  Optional<int> findFirstSlotCandidate();
 
   bool runOnMachineFunction(MachineFunction &Func) override;
 
   StringRef getPassName() const override {
@@ -197,6 +203,141 @@
   }
 }
 
+struct SlotWithTag {
+  int FI;
+  int Tag;
+  SlotWithTag(int FI, int Tag) : FI(FI), Tag(Tag) {}
+  explicit SlotWithTag(const MachineInstr &MI)
+      : FI(MI.getOperand(1).getIndex()), Tag(MI.getOperand(4).getImm()) {}
+  bool operator==(const SlotWithTag &Other) const {
+    return FI == Other.FI && Tag == Other.Tag;
+  }
+};
+
+namespace llvm {
+template <> struct DenseMapInfo<SlotWithTag> {
+  static inline SlotWithTag getEmptyKey() { return {-2, -2}; }
+  static inline SlotWithTag getTombstoneKey() { return {-3, -3}; }
+  static unsigned getHashValue(const SlotWithTag &V) {
+    return hash_combine(DenseMapInfo<int>::getHashValue(V.FI),
+                        DenseMapInfo<int>::getHashValue(V.Tag));
+  }
+  static bool isEqual(const SlotWithTag &A, const SlotWithTag &B) {
+    return A == B;
+  }
+};
+} // namespace llvm
+
+static bool isSlotPreAllocated(MachineFrameInfo *MFI, int FI) {
+  return MFI->getUseLocalStackAllocationBlock() &&
+         MFI->isObjectPreAllocated(FI);
+}
+
+// Pin one of the tagged slots to offset 0 from the tagged base pointer.
+// This would make its address available in a virtual register (IRG's def), as
+// opposed to requiring an ADDG instruction to materialize. This effectively
+// eliminates a vreg (by replacing it with direct uses of IRG, which is usually
+// live almost everywhere anyway), and therefore needs to happen before
+// regalloc.
+Optional<int> AArch64StackTaggingPreRA::findFirstSlotCandidate() {
+  // Find the best (FI, Tag) pair to pin to offset 0.
+  // Looking at the possible uses of a tagged address, the advantage of pinning
+  // is:
+  // - COPY to physical register.
+  //   Does not matter, this would trade a MOV instruction for an ADDG.
+  // - ST*G instructions matter, but those mostly appear near the function
+  //   prologue where all the tagged addresses need to be materialized anyway;
+  //   also, counting ST*G uses would overweight large allocas that require
+  //   more than one ST*G instruction.
+  // - Load/Store instructions in the address operand do not require a tagged
+  //   pointer, so they also do not benefit. These operands have already been
+  //   eliminated (see uncheckLoadsAndStores) so all remaining load/store
+  //   instructions count.
+  // - Any other instruction may benefit from being pinned to offset 0.
+  LLVM_DEBUG(dbgs() << "AArch64StackTaggingPreRA::findFirstSlotCandidate\n");
+  if (!ClFirstSlot)
+    return None;
+
+  DenseMap<SlotWithTag, int> RetagScore;
+  SlotWithTag MaxScoreST{-1, -1};
+  int MaxScore = -1;
+  for (auto *I : ReTags) {
+    SlotWithTag ST{*I};
+    if (isSlotPreAllocated(MFI, ST.FI))
+      continue;
+
+    Register RetagReg = I->getOperand(0).getReg();
+    if (!Register::isVirtualRegister(RetagReg))
+      continue;
+
+    int Score = 0;
+    SmallVector<Register, 8> WorkList;
+    WorkList.push_back(RetagReg);
+
+    while (!WorkList.empty()) {
+      Register UseReg = WorkList.back();
+      WorkList.pop_back();
+      for (auto &UseI : MRI->use_instructions(UseReg)) {
+        unsigned Opcode = UseI.getOpcode();
+        if (Opcode == AArch64::STGOffset || Opcode == AArch64::ST2GOffset ||
+            Opcode == AArch64::STZGOffset || Opcode == AArch64::STZ2GOffset ||
+            Opcode == AArch64::STGPi || Opcode == AArch64::STGloop ||
+            Opcode == AArch64::STZGloop || Opcode == AArch64::STGloop_wback ||
+            Opcode == AArch64::STZGloop_wback)
+          continue;
+        if (UseI.isCopy()) {
+          Register DstReg = UseI.getOperand(0).getReg();
+          if (Register::isVirtualRegister(DstReg))
+            WorkList.push_back(DstReg);
+          continue;
+        }
+        LLVM_DEBUG(dbgs() << "[" << ST.FI << ":" << ST.Tag << "] use of %"
+                          << Register::virtReg2Index(UseReg) << " in " << UseI
+                          << "\n");
+        Score++;
+      }
+    }
+
+    int TotalScore = RetagScore[ST] += Score;
+    if (TotalScore > MaxScore ||
+        (TotalScore == MaxScore && ST.FI > MaxScoreST.FI)) {
+      MaxScore = TotalScore;
+      MaxScoreST = ST;
+    }
+  }
+
+  if (MaxScoreST.FI < 0)
+    return None;
+
+  // If FI's tag is already 0, we are done.
+  if (MaxScoreST.Tag == 0)
+    return MaxScoreST.FI;
+
+  // Otherwise, find a random victim pair (FI, Tag) where Tag == 0.
+  SlotWithTag SwapST{-1, -1};
+  for (auto *I : ReTags) {
+    SlotWithTag ST{*I};
+    if (ST.Tag == 0) {
+      SwapST = ST;
+      break;
+    }
+  }
+
+  // Swap tags between the victim and the highest scoring pair.
+  // If SwapST is still (-1, -1), that's fine, too - we'll simply use tag 0 for
+  // the highest-score slot without changing anything else.
+  for (auto *I : ReTags) {
+    SlotWithTag ST{*I};
+    MachineOperand &TagOp = I->getOperand(4);
+    if (ST == MaxScoreST) {
+      TagOp.setImm(0);
+    } else if (ST == SwapST) {
+      TagOp.setImm(MaxScoreST.Tag);
+    }
+  }
+  return MaxScoreST.FI;
+}
+
 bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
   MF = &Func;
   MRI = &MF->getRegInfo();
@@ -225,11 +366,35 @@
     }
   }
 
+  // Take over from SSP. It does nothing for tagged slots, and should not
+  // really have been enabled in the first place.
+  for (int FI : TaggedSlots)
+    MFI->setObjectSSPLayout(FI, MachineFrameInfo::SSPLK_None);
+
   if (ReTags.empty())
     return false;
 
   if (mayUseUncheckedLoadStore())
     uncheckLoadsAndStores();
 
+  // Find a slot that is used with zero tag offset, like ADDG #fi, 0.
+  // If the base tagged pointer is set up to the address of this slot,
+  // the ADDG instruction can be eliminated.
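+  // When such a slot is found, the loop below rewrites its zero-offset
+  // re-tagging instruction into a plain COPY of the tagged base pointer.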
+  Optional<int> BaseSlot = findFirstSlotCandidate();
+  if (BaseSlot)
+    AFI->setTaggedBasePointerIndex(*BaseSlot);
+
+  for (auto *I : ReTags) {
+    int FI = I->getOperand(1).getIndex();
+    int Tag = I->getOperand(4).getImm();
+    Register Base = I->getOperand(3).getReg();
+    if (Tag == 0 && FI == BaseSlot) {
+      BuildMI(*I->getParent(), I, {}, TII->get(AArch64::COPY),
+              I->getOperand(0).getReg())
+          .addReg(Base);
+      I->eraseFromParent();
+    }
+  }
+
   return true;
 }
diff --git a/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll b/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll
--- a/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll
+++ b/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll
@@ -3,12 +3,32 @@
 define i8* @small_alloca() {
 entry:
 ; CHECK-LABEL: small_alloca:
+; CHECK: irg  x0, sp{{$}}
+; CHECK: ret
+  %a = alloca i8, align 16
+  %q = call i8* @llvm.aarch64.irg.sp(i64 0)
+  %q1 = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)
+  ret i8* %q1
+}
+
+@sink = global i8* null, align 8
+
+; Check that IRG is pinned to %b because the store instruction needs
+; the address in a non-fixed physical register and can benefit from it
+; being equal to the base tagged pointer.
+define i8* @small_allocas() {
+entry:
+; CHECK-LABEL: small_allocas:
 ; CHECK: irg  [[R:x[0-9]+]], sp{{$}}
-; CHECK-NEXT: addg x0, [[R]], #0, #1
+; CHECK: addg x0, [[R]], #16, #1
+; CHECK: str  [[R]], {{.*}}sink
 ; CHECK: ret
   %a = alloca i8, align 16
+  %b = alloca i8, align 16
   %q = call i8* @llvm.aarch64.irg.sp(i64 0)
   %q1 = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)
+  %q2 = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %q, i64 2)
+  store i8* %q2, i8** @sink, align 8
   ret i8* %q1
 }
 
@@ -16,16 +36,15 @@
 define void @huge_allocas() {
 entry:
 ; CHECK-LABEL: huge_allocas:
-; CHECK: irg  [[R:x[0-9]+]], sp{{$}}
-; CHECK: add  [[TMP:x[0-9]+]], [[R]], #3088
+; CHECK: irg  x1, sp{{$}}
+; CHECK: add  [[TMP:x[0-9]+]], x1, #3088
 ; CHECK: addg x0, [[TMP]], #1008, #1
-; CHECK: addg x1, [[R]], #0, #2
 ; CHECK: bl   use2
   %a = alloca i8, i64 4096, align 16
   %b = alloca i8, i64 4096, align 16
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
   %a_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %base, i64 1)
-  %b_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %base, i64 2)
+  %b_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %base, i64 0)
   call void @use2(i8* %a_t, i8* %b_t)
   ret void
 }
@@ -37,8 +56,7 @@
 ; CHECK-LABEL: realign:
 ; CHECK: mov  x29, sp
 ; CHECK: and  sp, x{{[0-9]*}}, #0xffffffffffffffc0
-; CHECK: irg  [[R:x[0-9]+]], sp{{$}}
-; CHECK: addg x0, [[R]], #0, #1
+; CHECK: irg  x0, sp{{$}}
 ; CHECK: bl   use
   %a = alloca i8, i64 4096, align 64
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
@@ -52,10 +70,9 @@
 define void @dynamic_alloca(i64 %size) {
 entry:
 ; CHECK-LABEL: dynamic_alloca:
-; CHECK: sub  [[R:x[0-9]+]], x29, #[[OFS:[0-9]+]]
-; CHECK: irg  [[R]], [[R]]
-; CHECK: addg x1, [[R]], #0, #1
-; CHECK: sub  x0, x29, #[[OFS]]
+; CHECK: sub  x1, x29, #[[OFS:[0-9]+]]
+; CHECK: irg  x1, x1
+; CHECK-DAG: sub  x0, x29, #[[OFS]]
 ; CHECK: bl   use2
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
   %a = alloca i128, i64 %size, align 16
@@ -74,9 +91,9 @@
 ; CHECK-LABEL: dynamic_alloca_and_realign:
 ; CHECK: and  sp, x{{.*}}, #0xffffffffffffffc0
 ; CHECK: mov  x19, sp
-; CHECK: irg  [[R:x[0-9]+]], x19
-; CHECK: addg x1, [[R]], #[[OFS:[0-9]+]], #1
-; CHECK: add  x0, x19, #[[OFS]]
+; CHECK: add  x1, x19, #[[OFS:[0-9]+]]
+; CHECK: irg  x1, x1
+; CHECK-DAG: add  x0, x19, #[[OFS]]
 ; CHECK: bl   use2
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
   %a = alloca i128, i64 %size, align 64
diff --git a/llvm/test/CodeGen/AArch64/settag-merge-order.ll b/llvm/test/CodeGen/AArch64/settag-merge-order.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/settag-merge-order.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -mtriple=aarch64 -mattr=+mte -aarch64-order-frame-objects=1 | FileCheck %s
+
+declare void @use(i8* %p)
+declare void @llvm.aarch64.settag(i8* %p, i64 %a)
+declare void @llvm.aarch64.settag.zero(i8* %p, i64 %a)
+
+; Two loops of size 256; the second loop updates SP.
+; After frame reordering, two loops can be merged into one.
+define void @stg128_128_gap_128_128() {
+entry:
+; CHECK-LABEL: stg128_128_gap_128_128:
+; CHECK: mov  x8, #512
+; CHECK: st2g sp, [sp], #32
+; CHECK: sub  x8, x8, #32
+; CHECK: cbnz x8,
+; CHECK: ret
+  %a = alloca i8, i32 128, align 16
+  %a2 = alloca i8, i32 128, align 16
+  %b = alloca i8, i32 32, align 16
+  %c = alloca i8, i32 128, align 16
+  %c2 = alloca i8, i32 128, align 16
+  call void @use(i8* %b)
+  call void @llvm.aarch64.settag(i8* %a, i64 128)
+  call void @llvm.aarch64.settag(i8* %a2, i64 128)
+  call void @llvm.aarch64.settag(i8* %c, i64 128)
+  call void @llvm.aarch64.settag(i8* %c2, i64 128)
+  ret void
+}
+
+define void @stg2(i1 %flag) {
+entry:
+; CHECK-LABEL: stg2:
+  %a = alloca i8, i32 160, align 16
+  %a2 = alloca i8, i32 160, align 16
+  %b = alloca i8, i32 32, align 16
+  %c = alloca i8, i32 128, align 16
+  %c2 = alloca i8, i32 128, align 16
+  call void @use(i8* %b)
+  br i1 %flag, label %if.then, label %if.else
+
+if.then:
+; CHECK: mov  x8, #320
+; CHECK: st2g x9, [x9], #32
+; CHECK: sub  x8, x8, #32
+; CHECK: cbnz x8,
+  call void @llvm.aarch64.settag(i8* %a, i64 160)
+  call void @llvm.aarch64.settag(i8* %a2, i64 160)
+  br label %if.end
+
+if.else:
+; CHECK: mov  x8, #256
+; CHECK: st2g x9, [x9], #32
+; CHECK: sub  x8, x8, #32
+; CHECK: cbnz x8,
+  call void @llvm.aarch64.settag(i8* %c, i64 128)
+  call void @llvm.aarch64.settag(i8* %c2, i64 128)
+  br label %if.end
+
+if.end:
+; CHECK: mov  x8, #576
+; CHECK: st2g sp, [sp], #32
+; CHECK: sub  x8, x8, #32
+; CHECK: cbnz x8,
+  call void @llvm.aarch64.settag(i8* %a, i64 160)
+  call void @llvm.aarch64.settag(i8* %a2, i64 160)
+  call void @llvm.aarch64.settag(i8* %c, i64 128)
+  call void @llvm.aarch64.settag(i8* %c2, i64 128)
+
+; CHECK: ret
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/settag-merge.ll b/llvm/test/CodeGen/AArch64/settag-merge.ll
--- a/llvm/test/CodeGen/AArch64/settag-merge.ll
+++ b/llvm/test/CodeGen/AArch64/settag-merge.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64 -mattr=+mte | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+mte -aarch64-order-frame-objects=0 | FileCheck %s
 
 declare void @use(i8* %p)
 declare void @llvm.aarch64.settag(i8* %p, i64 %a)