diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -767,6 +767,10 @@
 // ptr1 = tagp(ptr0, baseptr, tag_offset) returns a pointer where
 // * address is the address in ptr0
 // * tag is a function of (tag in baseptr, tag_offset).
+// ** Beware, this is not the same function as implemented by the ADDG instruction!
+// Backend optimizations may change tag_offset; the only guarantee is that calls
+// to tagp with the same pair of (baseptr, tag_offset) will produce pointers
+// with the same tag value, assuming the set of excluded tags has not changed.
 // Address bits in baseptr and tag bits in ptr0 are ignored.
 // When offset between ptr0 and baseptr is a compile time constant, this can be emitted as
 // ADDG ptr1, baseptr, (ptr0 - baseptr), tag_offset
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1070,9 +1070,13 @@
   if (MF.getFunction().getCallingConv() == CallingConv::GHC)
     return;
 
-  // Set tagged base pointer to the bottom of the stack frame.
+  // Set tagged base pointer to the requested stack slot.
   // Ideally it should match SP value after prologue.
-  AFI->setTaggedBasePointerOffset(MFI.getStackSize());
+  Optional<int> TBPI = AFI->getTaggedBasePointerIndex();
+  if (TBPI)
+    AFI->setTaggedBasePointerOffset(-MFI.getObjectOffset(*TBPI));
+  else
+    AFI->setTaggedBasePointerOffset(MFI.getStackSize());
 
   const StackOffset &SVEStackSize = getSVEStackSize(MF);
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -128,10 +128,13 @@
   /// that must be forwarded to every musttail call.
   SmallVector<ForwardedRegister, 1> ForwardedMustTailRegParms;
 
-  // Offset from SP-at-entry to the tagged base pointer.
-  // Tagged base pointer is set up to point to the first (lowest address) tagged
-  // stack slot.
-  unsigned TaggedBasePointerOffset = 0;
+  /// FrameIndex for the tagged base pointer.
+  Optional<int> TaggedBasePointerIndex;
+
+  /// Offset from SP-at-entry to the tagged base pointer.
+  /// Tagged base pointer is set up to point to the first (lowest address)
+  /// tagged stack slot.
+  unsigned TaggedBasePointerOffset;
 
   /// OutliningStyle denotes, if a function was outined, how it was outlined,
   /// e.g. Tail Call, Thunk, or Function if none apply.
@@ -343,6 +346,11 @@
     return ForwardedMustTailRegParms;
   }
 
+  Optional<int> getTaggedBasePointerIndex() const {
+    return TaggedBasePointerIndex;
+  }
+  void setTaggedBasePointerIndex(int Index) { TaggedBasePointerIndex = Index; }
+
   unsigned getTaggedBasePointerOffset() const {
     return TaggedBasePointerOffset;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
--- a/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp
@@ -13,7 +13,6 @@
 #include "AArch64InstrInfo.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -50,6 +49,12 @@
            "apply unchecked-ld-st when the target is definitely within range"),
         clEnumValN(UncheckedAlways, "always", "always apply unchecked-ld-st")));
 
+static cl::opt<bool>
+    ClFirstSlot("stack-tagging-first-slot-opt", cl::Hidden, cl::init(true),
+                cl::ZeroOrMore,
+                cl::desc("Apply first slot optimization for stack tagging "
+                         "(eliminate ADDG Rt, Rn, 0, 0)."));
+
 namespace {
 
 class AArch64StackTaggingPreRA : public MachineFunctionPass {
@@ -71,6 +76,7 @@
   bool mayUseUncheckedLoadStore();
   void uncheckUsesOf(unsigned TaggedReg, int FI);
   void uncheckLoadsAndStores();
+  Optional<int> findFirstSlotCandidate();
 
   bool runOnMachineFunction(MachineFunction &Func) override;
 
   StringRef getPassName() const override {
@@ -197,6 +203,141 @@
   }
 }
 
+struct SlotWithTag {
+  int FI;
+  int Tag;
+  SlotWithTag(int FI, int Tag) : FI(FI), Tag(Tag) {}
+  explicit SlotWithTag(const MachineInstr &MI)
+      : FI(MI.getOperand(1).getIndex()), Tag(MI.getOperand(4).getImm()) {}
+  bool operator==(const SlotWithTag &Other) const {
+    return FI == Other.FI && Tag == Other.Tag;
+  }
+};
+
+namespace llvm {
+template <> struct DenseMapInfo<SlotWithTag> {
+  static inline SlotWithTag getEmptyKey() { return {-2, -2}; }
+  static inline SlotWithTag getTombstoneKey() { return {-3, -3}; }
+  static unsigned getHashValue(const SlotWithTag &V) {
+    return hash_combine(DenseMapInfo<int>::getHashValue(V.FI),
+                        DenseMapInfo<int>::getHashValue(V.Tag));
+  }
+  static bool isEqual(const SlotWithTag &A, const SlotWithTag &B) {
+    return A == B;
+  }
+};
+} // namespace llvm
+
+static bool isSlotPreAllocated(MachineFrameInfo *MFI, int FI) {
+  return MFI->getUseLocalStackAllocationBlock() &&
+         MFI->isObjectPreAllocated(FI);
+}
+
+// Pin one of the tagged slots to offset 0 from the tagged base pointer.
+// This would make its address available in a virtual register (IRG's def), as
+// opposed to requiring an ADDG instruction to materialize. This effectively
+// eliminates a vreg (by replacing it with direct uses of IRG, which is usually
+// live almost everywhere anyway), and therefore needs to happen before
+// regalloc.
+Optional<int> AArch64StackTaggingPreRA::findFirstSlotCandidate() {
+  // Find the best (FI, Tag) pair to pin to offset 0.
+  // Looking at the possible uses of a tagged address, the advantage of pinning
+  // is:
+  // - COPY to physical register.
+  //   Does not matter, this would trade a MOV instruction for an ADDG.
+  // - ST*G matter, but those mostly appear near the function prologue where all
+  //   the tagged addresses need to be materialized anyway; also, counting ST*G
+  //   uses would overweight large allocas that require more than one ST*G
+  //   instruction.
+  // - Load/Store instructions in the address operand do not require a tagged
+  //   pointer, so they also do not benefit. These operands have already been
+  //   eliminated (see uncheckLoadsAndStores) so all remaining load/store
+  //   instructions count.
+  // - Any other instruction may benefit from being pinned to offset 0.
+  LLVM_DEBUG(dbgs() << "AArch64StackTaggingPreRA::findFirstSlotCandidate\n");
+  if (!ClFirstSlot)
+    return None;
+
+  DenseMap<SlotWithTag, int> RetagScore;
+  SlotWithTag MaxScoreST{-1, -1};
+  int MaxScore = -1;
+  for (auto *I : ReTags) {
+    SlotWithTag ST{*I};
+    if (isSlotPreAllocated(MFI, ST.FI))
+      continue;
+
+    Register RetagReg = I->getOperand(0).getReg();
+    if (!Register::isVirtualRegister(RetagReg))
+      continue;
+
+    int Score = 0;
+    SmallVector<Register, 8> WorkList;
+    WorkList.push_back(RetagReg);
+
+    while (!WorkList.empty()) {
+      Register UseReg = WorkList.back();
+      WorkList.pop_back();
+      for (auto &UseI : MRI->use_instructions(UseReg)) {
+        unsigned Opcode = UseI.getOpcode();
+        if (Opcode == AArch64::STGOffset || Opcode == AArch64::ST2GOffset ||
+            Opcode == AArch64::STZGOffset || Opcode == AArch64::STZ2GOffset ||
+            Opcode == AArch64::STGPi || Opcode == AArch64::STGloop ||
+            Opcode == AArch64::STZGloop || Opcode == AArch64::STGloop_wback ||
+            Opcode == AArch64::STZGloop_wback)
+          continue;
+        if (UseI.isCopy()) {
+          Register DstReg = UseI.getOperand(0).getReg();
+          if (Register::isVirtualRegister(DstReg))
+            WorkList.push_back(DstReg);
+          continue;
+        }
+        LLVM_DEBUG(dbgs() << "[" << ST.FI << ":" << ST.Tag << "] use of %"
+                          << Register::virtReg2Index(UseReg) << " in " << UseI
+                          << "\n");
+        Score++;
+      }
+    }
+
+    int TotalScore = RetagScore[ST] += Score;
+    if (TotalScore > MaxScore ||
+        (TotalScore == MaxScore && ST.FI > MaxScoreST.FI)) {
+      MaxScore = TotalScore;
+      MaxScoreST = ST;
+    }
+  }
+
+  if (MaxScoreST.FI < 0)
+    return None;
+
+  // If FI's tag is already 0, we are done.
+  if (MaxScoreST.Tag == 0)
+    return MaxScoreST.FI;
+
+  // Otherwise, find a random victim pair (FI, Tag) where Tag == 0.
+  SlotWithTag SwapST{-1, -1};
+  for (auto *I : ReTags) {
+    SlotWithTag ST{*I};
+    if (ST.Tag == 0) {
+      SwapST = ST;
+      break;
+    }
+  }
+
+  // Swap tags between the victim and the highest scoring pair.
+  // If SwapST is still (-1, -1), that's fine, too - we'll simply take tag 0 for
+  // the highest score slot without changing anything else.
+  for (auto *&I : ReTags) {
+    SlotWithTag ST{*I};
+    MachineOperand &TagOp = I->getOperand(4);
+    if (ST == MaxScoreST) {
+      TagOp.setImm(0);
+    } else if (ST == SwapST) {
+      TagOp.setImm(MaxScoreST.Tag);
+    }
+  }
+  return MaxScoreST.FI;
+}
+
 bool AArch64StackTaggingPreRA::runOnMachineFunction(MachineFunction &Func) {
   MF = &Func;
   MRI = &MF->getRegInfo();
@@ -225,11 +366,35 @@
     }
   }
 
+  // Take over from SSP. It does nothing for tagged slots, and should not really
+  // have been enabled in the first place.
+  for (int FI : TaggedSlots)
+    MFI->setObjectSSPLayout(FI, MachineFrameInfo::SSPLK_None);
+
   if (ReTags.empty())
     return false;
 
   if (mayUseUncheckedLoadStore())
     uncheckLoadsAndStores();
 
+  // Find a slot that is used with zero tag offset, like ADDG #fi, 0.
+  // If the base tagged pointer is set up to the address of this slot,
+  // the ADDG instruction can be eliminated.
+  Optional<int> BaseSlot = findFirstSlotCandidate();
+  if (BaseSlot)
+    AFI->setTaggedBasePointerIndex(*BaseSlot);
+
+  for (auto *I : ReTags) {
+    int FI = I->getOperand(1).getIndex();
+    int Tag = I->getOperand(4).getImm();
+    Register Base = I->getOperand(3).getReg();
+    if (Tag == 0 && FI == BaseSlot) {
+      BuildMI(*I->getParent(), I, {}, TII->get(AArch64::COPY),
+              I->getOperand(0).getReg())
+          .addReg(Base);
+      I->eraseFromParent();
+    }
+  }
+
   return true;
 }
diff --git a/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll b/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll
--- a/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll
+++ b/llvm/test/CodeGen/AArch64/irg_sp_tagp.ll
@@ -3,12 +3,32 @@
 define i8* @small_alloca() {
 entry:
 ; CHECK-LABEL: small_alloca:
+; CHECK: irg x0, sp{{$}}
+; CHECK: ret
+  %a = alloca i8, align 16
+  %q = call i8* @llvm.aarch64.irg.sp(i64 0)
+  %q1 = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)
+  ret i8* %q1
+}
+
+@sink = global i8* null, align 8
+
+; Check that IRG is pinned to %b because the store instruction needs
+; the address in a non-fixed physical register and can benefit from it
+; being equal to the base tagged pointer.
+define i8* @small_allocas() {
+entry:
+; CHECK-LABEL: small_allocas:
 ; CHECK: irg [[R:x[0-9]+]], sp{{$}}
-; CHECK-NEXT: addg x0, [[R]], #0, #1
+; CHECK: addg x0, [[R]], #16, #1
+; CHECK: str [[R]], {{.*}}sink
 ; CHECK: ret
   %a = alloca i8, align 16
+  %b = alloca i8, align 16
   %q = call i8* @llvm.aarch64.irg.sp(i64 0)
   %q1 = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)
+  %q2 = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %q, i64 2)
+  store i8* %q2, i8** @sink, align 8
   ret i8* %q1
 }
@@ -16,16 +36,15 @@
 define void @huge_allocas() {
 entry:
 ; CHECK-LABEL: huge_allocas:
-; CHECK: irg [[R:x[0-9]+]], sp{{$}}
-; CHECK: add [[TMP:x[0-9]+]], [[R]], #3088
+; CHECK: irg x1, sp{{$}}
+; CHECK: add [[TMP:x[0-9]+]], x1, #3088
 ; CHECK: addg x0, [[TMP]], #1008, #1
-; CHECK: addg x1, [[R]], #0, #2
 ; CHECK: bl use2
   %a = alloca i8, i64 4096, align 16
   %b = alloca i8, i64 4096, align 16
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
   %a_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %base, i64 1)
-  %b_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %base, i64 2)
+  %b_t = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %base, i64 0)
   call void @use2(i8* %a_t, i8* %b_t)
   ret void
 }
@@ -37,8 +56,7 @@
 ; CHECK-LABEL: realign:
 ; CHECK: mov x29, sp
 ; CHECK: and sp, x{{[0-9]*}}, #0xffffffffffffffc0
-; CHECK: irg [[R:x[0-9]+]], sp{{$}}
-; CHECK: addg x0, [[R]], #0, #1
+; CHECK: irg x0, sp{{$}}
 ; CHECK: bl use
   %a = alloca i8, i64 4096, align 64
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
@@ -52,10 +70,9 @@
 define void @dynamic_alloca(i64 %size) {
 entry:
 ; CHECK-LABEL: dynamic_alloca:
-; CHECK: sub [[R:x[0-9]+]], x29, #[[OFS:[0-9]+]]
-; CHECK: irg [[R]], [[R]]
-; CHECK: addg x1, [[R]], #0, #1
-; CHECK: sub x0, x29, #[[OFS]]
+; CHECK: sub x1, x29, #[[OFS:[0-9]+]]
+; CHECK: irg x1, x1
+; CHECK-DAG: sub x0, x29, #[[OFS]]
 ; CHECK: bl use2
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
   %a = alloca i128, i64 %size, align 16
@@ -74,9 +91,9 @@
 ; CHECK-LABEL: dynamic_alloca_and_realign:
 ; CHECK: and sp, x{{.*}}, #0xffffffffffffffc0
 ; CHECK: mov x19, sp
-; CHECK: irg [[R:x[0-9]+]], x19
-; CHECK: addg x1, [[R]], #[[OFS:[0-9]+]], #1
-; CHECK: add x0, x19, #[[OFS]]
+; CHECK: add x1, x19, #[[OFS:[0-9]+]]
+; CHECK: irg x1, x1
+; CHECK-DAG: add x0, x19, #[[OFS]]
 ; CHECK: bl use2
   %base = call i8* @llvm.aarch64.irg.sp(i64 0)
   %a = alloca i128, i64 %size, align 64
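
Note: as a self-contained illustration of the new behavior, the small_allocas pattern above boils down to the module below (a sketch; the RUN invocation is assumed to be something like llc -mtriple=aarch64 -mattr=+mte, matching the rest of irg_sp_tagp.ll). Both allocas share one IRG base. The return of %q1 is only a COPY into a fixed physical register, which the scoring ignores, while the store of %q2 needs the tagged address of %b in an ordinary register, so %b scores higher: its tagp is given tag offset 0, the tagged base pointer is pinned to the %b slot (the "ADDG Rt, Rn, 0, 0" disappears in favor of the IRG def itself), and only %a still needs an ADDG, now at address offset #16 and tag offset #1.

declare i8* @llvm.aarch64.irg.sp(i64)
declare i8* @llvm.aarch64.tagp.p0i8(i8*, i8*, i64)

@sink = global i8* null, align 8

define i8* @small_allocas() {
entry:
  ; Two tagged slots sharing a single IRG base pointer.
  %a = alloca i8, align 16
  %b = alloca i8, align 16
  %q = call i8* @llvm.aarch64.irg.sp(i64 0)
  %q1 = call i8* @llvm.aarch64.tagp.p0i8(i8* %a, i8* %q, i64 1)
  %q2 = call i8* @llvm.aarch64.tagp.p0i8(i8* %b, i8* %q, i64 2)
  ; The escaping store makes %b the highest-scoring slot.
  store i8* %q2, i8** @sink, align 8
  ret i8* %q1
}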