diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1669,6 +1669,11 @@ /// \return The maximum number of function arguments the target supports. unsigned getMaxNumArgs() const; + InstructionCost getCompactCost() const; + bool isTargetSupportedCompactStore() const; + unsigned getTargetSupportedCompact() const; + unsigned getTargetSupportedCNTP() const; + /// @} private: @@ -2035,6 +2040,10 @@ getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; + virtual bool isTargetSupportedCompactStore() const = 0; + virtual unsigned getTargetSupportedCompact() const = 0; + virtual unsigned getTargetSupportedCNTP() const = 0; + virtual InstructionCost getCompactCost() const = 0; }; template @@ -2745,6 +2754,22 @@ unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } + + bool isTargetSupportedCompactStore() const override { + return Impl.isTargetSupportedCompactStore(); + } + + unsigned getTargetSupportedCompact() const override { + return Impl.getTargetSupportedCompact(); + } + + unsigned getTargetSupportedCNTP() const override { + return Impl.getTargetSupportedCNTP(); + } + + InstructionCost getCompactCost() const override { + return Impl.getCompactCost(); + } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -891,6 +891,13 @@ unsigned getMaxNumArgs() const { return UINT_MAX; } + bool isTargetSupportedCompactStore() const { return false; } + unsigned getTargetSupportedCompact() const { return 0; } + unsigned getTargetSupportedCNTP() const { return 0; } + InstructionCost getCompactCost() const { + return InstructionCost::getInvalid(); + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -700,6 +700,10 @@ return getST()->getMaxPrefetchIterationsAhead(); } + virtual InstructionCost getCompactCost() const { + return InstructionCost::getInvalid(); + } + virtual bool enableWritePrefetching() const { return getST()->enableWritePrefetching(); } diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -412,6 +412,14 @@ const RecurrenceDescriptor &Desc, Value *Src, PHINode *OrigPhi = nullptr); +Value *createTargetCompact(IRBuilderBase &B, Module *M, + const TargetTransformInfo *TTI, Value *Mask, + Value *Val); + +Value *createTargetCNTP(IRBuilderBase &B, Module *M, + const TargetTransformInfo *TTI, Value *Mask, + Value *Val); + /// Create an ordered reduction intrinsic using the given recurrence /// descriptor \p Desc. 
Value *createOrderedReduction(IRBuilderBase &B,
diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -224,6 +224,26 @@
   Instruction *ExactFPMathInst = nullptr;
 };
 
+class CompactDescriptor {
+  PHINode *LiveOutPhi;
+  bool IsCompactSign;
+  SmallPtrSet<Value *, 8> Chain;
+
+public:
+  CompactDescriptor() = default;
+  CompactDescriptor(SmallPtrSetImpl<Value *> &CompactChain, PHINode *LiveOut,
+                    bool IsSign)
+      : LiveOutPhi(LiveOut), IsCompactSign(IsSign) {
+    Chain.insert(CompactChain.begin(), CompactChain.end());
+  }
+
+  bool isInCompactChain(Value *V) const { return Chain.find(V) != Chain.end(); }
+
+  PHINode *getLiveOutPhi() const { return LiveOutPhi; }
+
+  bool isSign() const { return IsCompactSign; }
+};
+
 /// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
 /// to what vectorization factor.
 /// This class does not look at the profitability of vectorization, only the
@@ -261,6 +281,8 @@
   /// inductions and reductions.
   using RecurrenceSet = SmallPtrSet<const PHINode *, 8>;
 
+  using CompactList = MapVector<PHINode *, CompactDescriptor>;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -397,6 +419,14 @@
   DominatorTree *getDominatorTree() const { return DT; }
 
+  const CompactList &getCompactList() const { return CpList; }
+
+  bool hasCompactChain() const { return CpList.size() > 0; }
+
+  PHINode *getCompactChainStart(Instruction *I) const;
+
+  bool isSign(PHINode *Phi) { return CpList[Phi].isSign(); }
+
 private:
   /// Return true if the pre-header, exiting and latch blocks of \p Lp and all
   /// its nested loops are considered legal for vectorization. These legal
@@ -425,6 +455,8 @@
   /// and we only need to check individual instructions.
   bool canVectorizeInstrs();
 
+  bool isMatchCompact(PHINode *Phi, Loop *TheLoop, CompactDescriptor &CpDesc);
+
   /// When we vectorize loops we may change the order in which
   /// we read and write from memory. This method checks if it is
   /// legal to vectorize the code, considering only memory constrains.
@@ -538,6 +570,9 @@
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
+
+  // Record the compact chains found in the loop.
+ CompactList CpList; }; } // namespace llvm diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1237,6 +1237,22 @@ return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } +bool TargetTransformInfo::isTargetSupportedCompactStore() const { + return TTIImpl->isTargetSupportedCompactStore(); +} + +unsigned TargetTransformInfo::getTargetSupportedCompact() const { + return TTIImpl->getTargetSupportedCompact(); +} + +unsigned TargetTransformInfo::getTargetSupportedCNTP() const { + return TTIImpl->getTargetSupportedCNTP(); +} + +InstructionCost TargetTransformInfo::getCompactCost() const { + return TTIImpl->getCompactCost(); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" @@ -301,6 +302,11 @@ case ISD::FFREXP: Res = PromoteIntRes_FFREXP(N); break; + case ISD::INTRINSIC_WO_CHAIN: + if (N->getConstantOperandVal(0) == Intrinsic::aarch64_sve_compact) { + Res = PromoteIntRes_COMPACT(N); + break; + } } // If the result is null then the sub-method took care of registering it. @@ -5957,6 +5963,12 @@ return DAG.getBuildVector(N->getValueType(0), dl, NewOps); } +SDValue DAGTypeLegalizer::PromoteIntRes_COMPACT(SDNode *N) { + SDValue OpExt = SExtOrZExtPromotedInteger(N->getOperand(2)); + return DAG.getNode(N->getOpcode(), SDLoc(N), OpExt.getValueType(), + N->getOperand(0), N->getOperand(1), OpExt); +} + SDValue DAGTypeLegalizer::ExpandIntOp_STACKMAP(SDNode *N, unsigned OpNo) { assert(OpNo > 1); SDValue Op = N->getOperand(OpNo); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -364,6 +364,7 @@ SDValue PromoteIntRes_FunnelShift(SDNode *N); SDValue PromoteIntRes_VPFunnelShift(SDNode *N); SDValue PromoteIntRes_IS_FPCLASS(SDNode *N); + SDValue PromoteIntRes_COMPACT(SDNode *N); // Integer Operand Promotion. 
bool PromoteIntegerOperand(SDNode *N, unsigned OpNo); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -24,8 +24,8 @@ #include "llvm/CodeGen/BasicTTIImpl.h" #include "llvm/IR/Function.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include -#include namespace llvm { @@ -410,6 +410,15 @@ return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy); } + + bool isTargetSupportedCompactStore() const { return ST->hasSVE(); } + unsigned getTargetSupportedCompact() const { + return Intrinsic::aarch64_sve_compact; + } + unsigned getTargetSupportedCNTP() const { + return Intrinsic::aarch64_sve_cntp; + } + InstructionCost getCompactCost() const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -3816,3 +3816,5 @@ return AM.Scale != 0 && AM.Scale != 1; return -1; } + +InstructionCost AArch64TTIImpl::getCompactCost() const { return 6; } diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -34,6 +34,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/PatternMatch.h" @@ -1123,6 +1124,34 @@ return createSimpleTargetReduction(B, TTI, Src, RK); } +Value *llvm::createTargetCompact(IRBuilderBase &B, Module *M, + const TargetTransformInfo *TTI, Value *Mask, + Value *Val) { + Intrinsic::ID IID = TTI->getTargetSupportedCompact(); + switch (IID) { + default: + return nullptr; + case Intrinsic::aarch64_sve_compact: + Function *CompactMaskDecl = Intrinsic::getDeclaration( + M, Intrinsic::aarch64_sve_compact, Val->getType()); + return B.CreateCall(CompactMaskDecl, {Mask, Val}); + } +} + +Value *llvm::createTargetCNTP(IRBuilderBase &B, Module *M, + const TargetTransformInfo *TTI, Value *Mask, + Value *Val) { + Intrinsic::ID IID = TTI->getTargetSupportedCNTP(); + switch (IID) { + default: + return nullptr; + case Intrinsic::aarch64_sve_cntp: + Function *CNTPDecl = Intrinsic::getDeclaration( + M, Intrinsic::aarch64_sve_cntp, Val->getType()); + return B.CreateCall(CNTPDecl, {Mask, Val}); + } +} + Value *llvm::createOrderedReduction(IRBuilderBase &B, const RecurrenceDescriptor &Desc, Value *Src, Value *Start) { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Utils/SizeOpts.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" @@ -78,6 +79,11 @@ "Scalable vectorization is available and favored when the " "cost is inconclusive."))); +static cl::opt + EnableCompactVectorization("enable-compact-vectorization", cl::init(true), + cl::Hidden, + cl::desc("Enable vectorizing 
compact pattern.")); + /// Maximum vectorization interleave count. static const unsigned MaxInterleaveFactor = 16; @@ -785,6 +791,143 @@ return Scalarize; } +static bool isUserOfCompactPHI(BasicBlock *BB, PHINode *Phi, Instruction *I) { + if (I->getParent() != BB) + return false; + + // Operations on PHI should be affine. + if (I->getOpcode() != Instruction::Add && + I->getOpcode() != Instruction::Sub && + I->getOpcode() != Instruction::SExt && + I->getOpcode() != Instruction::ZExt) + return false; + + if (I == Phi) + return true; + + for (unsigned i = 0; i < I->getNumOperands(); i++) { + if (auto *Instr = dyn_cast(I->getOperand(i))) + if (isUserOfCompactPHI(BB, Phi, Instr)) + return true; + } + return false; +} + +// Match the basic compact pattern: +// for.body: +// %src.phi = phi i64 [ 0, %preheader ], [ %target.phi, %for.inc ] +// ... +// if.then: +// ... +// %data = load i32, ptr %In +// (there may be additional sext/zext if %src.phi types i32) +// %addr = getelementptr i32, ptr %Out, i64 %src.phi +// store i32 %data, ptr %addr +// %inc = add i64 %src.phi, 1 +// for.inc +// %target.phi = phi i64 [ %inc, if.then ], [ %src.phi, %for.body ] +bool LoopVectorizationLegality::isMatchCompact(PHINode *Phi, Loop *TheLoop, + CompactDescriptor &CpDesc) { + if (Phi->getNumIncomingValues() > 2) + return false; + + // Don't support phis who is used as mask. + for (User *U : Phi->users()) { + if (isa(U)) + return false; + } + + SmallPtrSet CompactChain; + CompactChain.insert(Phi); + + BasicBlock *LoopPreHeader = TheLoop->getLoopPreheader(); + int ExitIndex = Phi->getIncomingBlock(0) == LoopPreHeader ? 1 : 0; + BasicBlock *ExitBlock = Phi->getIncomingBlock(ExitIndex); + PHINode *CompactLiveOut = nullptr; + Value *IncValue = nullptr; + BasicBlock *IncBlock = nullptr; + bool IsCycle = false; + for (auto &CandPhi : ExitBlock->phis()) { + if (llvm::is_contained(CandPhi.incoming_values(), Phi) && + CandPhi.getNumIncomingValues() == 2) { + IsCycle = true; + CompactLiveOut = &CandPhi; + int IncIndex = CandPhi.getIncomingBlock(0) == Phi->getParent() ? 1 : 0; + IncBlock = CandPhi.getIncomingBlock(IncIndex); + IncValue = CandPhi.getIncomingValueForBlock(IncBlock); + break; + } + } + // Similar with reduction PHI. + if (!IsCycle) + return false; + CompactChain.insert(CompactLiveOut); + + // Match the pattern %inc = add i32 %src.phi, 1. + Value *Index = nullptr, *Step = nullptr; + if (!match(IncValue, m_Add(m_Value(Index), m_Value(Step)))) + return false; + if (Index != Phi) { + std::swap(Index, Step); + } + if (Step != ConstantInt::get(Step->getType(), 1)) + return false; + CompactChain.insert(IncValue); + + const DataLayout &DL = Phi->getModule()->getDataLayout(); + int CntCandStores = 0; + GetElementPtrInst *GEP = nullptr; + for (auto &Inst : *IncBlock) { + if (auto *SI = dyn_cast(&Inst)) { + // TODO: Support llvm.sve.compact.nxv8i16, llvm.sve.compact.nxv16i18 in + // the future. + unsigned TySize = DL.getTypeSizeInBits(SI->getValueOperand()->getType()); + if (TySize < 32) + return false; + + GEP = dyn_cast(SI->getPointerOperand()); + if (GEP == nullptr) + continue; + + // Only handle single pointer. + if (GEP->getNumOperands() != 2) + continue; + + // Get the index of GEP, index could be phi or sext/zext (if phi types + // i32). 
+ Value *Op1 = GEP->getOperand(1); + Value *X = nullptr; + SmallSet CandiInstrs; + if (match(Op1, m_SExt(m_Value(X))) || match(Op1, m_ZExt(m_Value(X)))) { + Op1 = X; + } + Instruction *Op1Instr = dyn_cast(Op1); + if (!Op1Instr || isUserOfCompactPHI(IncBlock, Phi, Op1Instr)) + continue; + CompactChain.insert(GEP); + CompactChain.insert(SI); + CntCandStores++; + } + } + if (!CntCandStores) + return false; + + KnownBits Bits = computeKnownBits(Phi, DL); + bool IsSign = !Bits.isNonNegative(); + CompactDescriptor CompactDesc(CompactChain, CompactLiveOut, IsSign); + CpDesc = CompactDesc; + LLVM_DEBUG(dbgs() << "LV: Found a compact chain.\n"); + return true; +} + +PHINode *LoopVectorizationLegality::getCompactChainStart(Instruction *I) const { + for (auto &CpDesc : CpList) { + if (CpDesc.second.isInCompactChain(I)) + return CpDesc.first; + } + return nullptr; +} + bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *Header = TheLoop->getHeader(); @@ -881,6 +1024,14 @@ continue; } + CompactDescriptor CpDesc; + if (EnableCompactVectorization && + TTI->isTargetSupportedCompactStore() && + isMatchCompact(Phi, TheLoop, CpDesc)) { + CpList[Phi] = CpDesc; + continue; + } + reportVectorizationFailure("Found an unidentified PHI", "value that could not be identified as " "reduction is used outside the loop", @@ -1525,16 +1676,22 @@ LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n"); SmallPtrSet ReductionLiveOuts; + SmallPtrSet CompactLiveOuts; for (const auto &Reduction : getReductionVars()) ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr()); + for (const auto &Compact : getCompactList()) + CompactLiveOuts.insert(Compact.second.getLiveOutPhi()); + // TODO: handle non-reduction outside users when tail is folded by masking. for (auto *AE : AllowedExit) { // Check that all users of allowed exit values are inside the loop or // are the live-out of a reduction. if (ReductionLiveOuts.count(AE)) continue; + if (CompactLiveOuts.count(AE)) + continue; for (User *U : AE->users()) { Instruction *UI = cast(U); if (TheLoop->contains(UI)) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -619,6 +619,8 @@ /// Create code for the loop exit value of the reduction. void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State); + void fixCompactPHI(VPCompactPHIRecipe *CompactPHIR, VPTransformState &State); + /// Iteratively sink the scalarized operands of a predicated instruction into /// the block that was created for it. void sinkScalarOperands(Instruction *PredInst); @@ -1913,7 +1915,8 @@ /// there is no vector code generation, the check blocks are removed /// completely. void Create(Loop *L, const LoopAccessInfo &LAI, - const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) { + const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC, + LoopVectorizationLegality *LVL = nullptr) { // Hard cutoff to limit compile-time increase in case a very large number of // runtime checks needs to be generated. 
@@ -1946,7 +1949,7 @@ "vector.memcheck"); auto DiffChecks = RtPtrChecking.getDiffChecks(); - if (DiffChecks) { + if (DiffChecks && !(LVL && LVL->hasCompactChain())) { Value *RuntimeVF = nullptr; MemRuntimeCheckCond = addDiffRuntimeChecks( MemCheckBlock->getTerminator(), *DiffChecks, MemCheckExp, @@ -3604,6 +3607,47 @@ fixReduction(ReductionPhi, State); else if (auto *FOR = dyn_cast(&R)) fixFixedOrderRecurrence(FOR, State); + else if (auto *CompactR = dyn_cast(&R)) + fixCompactPHI(CompactR, State); + } +} + +void InnerLoopVectorizer::fixCompactPHI(VPCompactPHIRecipe *CompactPHIR, + VPTransformState &State) { + Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstNonPHI()); + VPValue *VPBackEdgeValue = CompactPHIR->getBackedgeValue(); + Value *BackEdgeValue = State.get(VPBackEdgeValue, State.UF - 1); + Value *StartValue = CompactPHIR->getStartValue()->getUnderlyingValue(); + Value *TruncBackEdgeValue = BackEdgeValue; + if (StartValue->getType() != BackEdgeValue->getType()) + TruncBackEdgeValue = + Builder.CreateTruncOrBitCast(BackEdgeValue, StartValue->getType()); + + // Generate phi in scalar preheader to pass LiveIns outside the loop. + PHINode *ScalarPreheaderPN = + PHINode::Create(StartValue->getType(), 2, "compact.rdx", + LoopScalarPreHeader->getFirstNonPHI()); + + for (auto *Incoming : predecessors(LoopScalarPreHeader)) { + if (Incoming == LoopMiddleBlock) + ScalarPreheaderPN->addIncoming(TruncBackEdgeValue, Incoming); + else + ScalarPreheaderPN->addIncoming(StartValue, Incoming); + } + + Value *ScalarBackEdgeValue = + CompactPHIR->getBackedgeValue()->getUnderlyingValue(); + for (PHINode &Phi : LoopScalarBody->phis()) { + if (llvm::is_contained(Phi.incoming_values(), ScalarBackEdgeValue)) { + Phi.setIncomingValueForBlock(LoopScalarPreHeader, ScalarPreheaderPN); + } + } + + for (PHINode &LCSSAPhi : LoopExitBlock->phis()) { + if (llvm::is_contained(LCSSAPhi.incoming_values(), ScalarBackEdgeValue)) { + LCSSAPhi.addIncoming(TruncBackEdgeValue, LoopMiddleBlock); + State.Plan->removeLiveOut(&LCSSAPhi); + } } } @@ -4223,6 +4267,8 @@ return !VFDatabase::hasMaskedVariant(*(cast(I)), VF); case Instruction::Load: case Instruction::Store: { + if (Legal->getCompactChainStart(I) != nullptr) + return false; auto *Ptr = getLoadStorePointerOperand(I); auto *Ty = getLoadStoreType(I); Type *VTy = Ty; @@ -4584,6 +4630,12 @@ continue; } + // GEPs in compact chain should be uniform after vectorization. + if (isa(&I) && Legal->getCompactChainStart(&I)) { + addToWorklistIfAllowed(&I); + continue; + } + // If there's no pointer operand, there's nothing to do. auto *Ptr = getLoadStorePointerOperand(&I); if (!Ptr) @@ -6801,6 +6853,24 @@ continue; } + if (isa(I) && Legal->hasCompactChain()) { + InstructionCost Cost = 0; + if (!VF.isScalable() || VF.isScalar()) { + setWideningDecision(&I, VF, CM_Widen, InstructionCost::getInvalid()); + continue; + } + Type *EleTy = getLoadStoreType(&I); + VectorType *VectorTy = cast(ToVectorTy(EleTy, VF)); + const Align Alignment = getLoadStoreAlignment(&I); + unsigned AS = getLoadStoreAddressSpace(&I); + enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + Cost += TTI.getMaskedMemoryOpCost(I.getOpcode(), VectorTy, Alignment, + AS, CostKind); + Cost += TTI.getCompactCost(); + setWideningDecision(&I, VF, CM_Widen, Cost); + continue; + } + // Choose between Interleaving, Gather/Scatter or Scalarization. 
InstructionCost InterleaveCost = InstructionCost::getInvalid(); unsigned NumAccesses = 1; @@ -8451,6 +8521,30 @@ return toVPRecipeResult(Recipe); } +VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenCompactRecipe( + Instruction *Instr, ArrayRef Operands, VPlanPtr &Plan, + bool IsSign, const TargetTransformInfo *TTI) { + if (auto Phi = dyn_cast(Instr)) { + if (Instr->getParent() != OrigLoop->getHeader()) + return toVPRecipeResult(new VPWidenCompactInstructionRecipe( + Instr, Instr->getOpcode(), Operands)); + + VPValue *StartV = Operands[0]; + VPHeaderPHIRecipe *PhiRecipe = new VPCompactPHIRecipe(Phi, StartV, IsSign); + recordRecipeOf(cast( + Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch()))); + PhisToFix.push_back(PhiRecipe); + return toVPRecipeResult(PhiRecipe); + } + + if (isa(Instr)) + return nullptr; + + VPValue *Mask = createBlockInMask(Instr->getParent(), *Plan); + return toVPRecipeResult(new VPWidenCompactInstructionRecipe( + Instr, Instr->getOpcode(), Operands, Mask, TTI)); +} + VPRecipeOrVPValueTy VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr, ArrayRef Operands, @@ -8542,6 +8636,10 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); + // Don't build vplan of fixed width version if there is a compact chain in the + // loop. + if (Legal->hasCompactChain() && !MinVF.isScalable()) + return; auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { @@ -8783,8 +8881,15 @@ Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) continue; - auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe( - Instr, Operands, Range, VPBB, Plan); + VPRecipeOrVPValueTy RecipeOrValue; + if (PHINode *ChainStart = Legal->getCompactChainStart(Instr)) { + RecipeOrValue = RecipeBuilder.tryToCreateWidenCompactRecipe( + Instr, Operands, Plan, Legal->isSign(ChainStart), &TTI); + } else { + RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(Instr, Operands, + Range, VPBB, Plan); + } + if (!RecipeOrValue) RecipeOrValue = RecipeBuilder.handleReplication(Instr, Range, *Plan); // If Instr can be simplified to an existing VPValue, use it. @@ -9924,7 +10029,8 @@ // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. if (VF.Width.isVector() || SelectedIC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC, + &LVL); // Check if it is profitable to vectorize with runtime checks. bool ForceVectorization = diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -123,6 +123,11 @@ VFRange &Range, VPBasicBlock *VPBB, VPlanPtr &Plan); + VPRecipeOrVPValueTy + tryToCreateWidenCompactRecipe(Instruction *Instr, + ArrayRef Operands, VPlanPtr &Plan, + bool IsSign, const TargetTransformInfo *TTI); + /// Set the recipe created for given ingredient. This operation is a no-op for /// ingredients that were not marked using a nullptr entry in the map. 
void setRecipe(Instruction *I, VPRecipeBase *R) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1246,6 +1246,40 @@ } }; +class VPWidenCompactInstructionRecipe : public VPRecipeBase, public VPValue { +private: + Instruction &Ingredient; + unsigned Opcode; + VPValue *Mask; + const TargetTransformInfo *TTI; + + void genCompactInc(VPTransformState &State); + void genCompactStore(VPTransformState &State); + void genCompactLiveOut(VPTransformState &State); + +public: + VPWidenCompactInstructionRecipe(Instruction *I, unsigned Opcode, + ArrayRef Operands, + VPValue *Mask = nullptr, + const TargetTransformInfo *TTI = nullptr) + : VPRecipeBase(VPDef::VPCompactInstructionSC, Operands), VPValue(this, I), + Ingredient(*I), Opcode(Opcode), Mask(Mask), TTI(TTI) {} + ~VPWidenCompactInstructionRecipe() override = default; + + VP_CLASSOF_IMPL(VPDef::VPCompactInstructionSC) + + unsigned getOpcode() const { return Opcode; } + + VPValue *getMask() { return Mask; } + + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for handling GEP instructions. class VPWidenGEPRecipe : public VPRecipeWithIRFlags, public VPValue { bool isPointerLoopInvariant() const { @@ -1588,6 +1622,34 @@ bool isInLoop() const { return IsInLoop; } }; +class VPCompactPHIRecipe : public VPHeaderPHIRecipe { + PHINode *CompactPHI; + bool IsCompactSign; + +public: + VPCompactPHIRecipe(PHINode *Phi, VPValue *Start, bool IsSign) + : VPHeaderPHIRecipe(VPDef::VPCompactPHISC, Phi, Start), CompactPHI(Phi), + IsCompactSign(IsSign) {} + + ~VPCompactPHIRecipe() override = default; + + VP_CLASSOF_IMPL(VPDef::VPCompactPHISC) + + static inline bool classof(const VPHeaderPHIRecipe *R) { + return R->getVPDefID() == VPDef::VPCompactPHISC; + } + + bool isSign() { return IsCompactSign; } + + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. class VPBlendRecipe : public VPRecipeBase, public VPValue { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -830,6 +830,7 @@ // generated. bool SinglePartNeeded = isa(PhiR) || isa(PhiR) || + isa(PhiR) || (isa(PhiR) && cast(PhiR)->isOrdered()); unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF; @@ -840,6 +841,22 @@ SinglePartNeeded ? State->UF - 1 : Part); cast(Phi)->addIncoming(Val, VectorLatchBB); } + + // Fix Compact phis if UF > 1. + if (isa(PhiR)) { + for (unsigned Part = 1; Part < State->UF; ++Part) { + Value *Val = State->get(PhiR->getBackedgeValue(), Part - 1); + // BOSCC vectorization will transform liveouts into phis, and we should + // get the underlying value here. + if (auto *PN = dyn_cast(Val)) { + int ValIdx = isa(PN->getOperand(0)) ? 1 : 0; + Val = PN->getOperand(ValIdx); + } + PHINode *Phi = cast(State->get(PhiR, Part)); + Phi->replaceAllUsesWith(Val); + Phi->eraseFromParent(); + } + } } // We do not attempt to preserve DT for outer loop vectorization currently. 
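A rough sketch (not part of the patch) of the per-part code the two new recipes are expected to emit, assuming VF = vscale x 4; %mask, %data, %idx, %out.gep, %alltrue and %ones are hypothetical names for the block-in mask, the masked-loaded values, the running compact index, the store address built from the pre-update index, an all-true predicate and a splat of i32 1. It mirrors VPWidenCompactInstructionRecipe::genCompactInc/genCompactStore below and the CHECK lines in compact.ll:

    %cnt       = call i64 @llvm.aarch64.sve.cntp.nxv4i1(<vscale x 4 x i1> %alltrue, <vscale x 4 x i1> %mask)
    %cnt.trunc = trunc i64 %cnt to i32
    %idx.next  = add i32 %idx, %cnt.trunc
    %cmask     = call <vscale x 4 x i32> @llvm.aarch64.sve.compact.nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %ones)
    %store.pg  = icmp eq <vscale x 4 x i32> %cmask, %ones
    %cdata     = call <vscale x 4 x i32> @llvm.aarch64.sve.compact.nxv4i32(<vscale x 4 x i1> %mask, <vscale x 4 x i32> %data)
    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %cdata, ptr %out.gep, i32 4, <vscale x 4 x i1> %store.pg)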
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include
@@ -933,6 +934,140 @@
   VecInd->addIncoming(LastInduction, VectorPH);
 }
 
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenCompactInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
+                                            VPSlotTracker &SlotTracker) const {
+  O << Indent << "COMPACT ";
+  if (getOpcode() != Instruction::Store) {
+    printAsOperand(O, SlotTracker);
+    O << " = ";
+  }
+  O << Instruction::getOpcodeName(getOpcode()) << " ";
+  printOperands(O, SlotTracker);
+}
+#endif
+
+void VPWidenCompactInstructionRecipe::execute(VPTransformState &State) {
+  switch (getOpcode()) {
+  case Instruction::Add:
+    genCompactInc(State);
+    break;
+  case Instruction::PHI:
+    genCompactLiveOut(State);
+    break;
+  case Instruction::Store:
+    genCompactStore(State);
+    break;
+  default:
+    llvm_unreachable("Unsupported opcode for compact.");
+  }
+}
+
+void VPWidenCompactInstructionRecipe::genCompactStore(VPTransformState &State) {
+  assert(State.VF.isScalable() && "Compact store requires scalable vectors");
+  auto &Builder = State.Builder;
+  VPValue *VPStoredValue = getOperand(0);
+  VPValue *VPAddr = getOperand(1);
+  StoreInst *SI = cast<StoreInst>(&Ingredient);
+  Type *ScalarTy = getLoadStoreType(&Ingredient);
+  Module *M = SI->getModule();
+  VectorType *MaskVTy = cast<VectorType>(State.get(getMask(), 0)->getType());
+  Constant *One = nullptr;
+  unsigned VL = MaskVTy->getElementCount().getKnownMinValue();
+  switch (VL) {
+  case 2:
+    One = ConstantInt::get(Type::getInt64Ty(M->getContext()), 1);
+    break;
+  case 4:
+    One = ConstantInt::get(Type::getInt32Ty(M->getContext()), 1);
+    break;
+  default:
+    // TODO: Try to support compact.nxv8i16 / compact.nxv16i8 in the future.
+    llvm_unreachable("Unsupported type");
+  }
+  Constant *VOne = ConstantVector::getSplat(MaskVTy->getElementCount(), One);
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    // Generate the compact mask.
+    Value *Mask = State.get(getMask(), Part);
+    Value *CompactMaskII = createTargetCompact(Builder, M, TTI, Mask, VOne);
+    assert(CompactMaskII && "Compact is not supported by the current target.");
+    Value *CompactCmpII =
+        Builder.CreateCmp(ICmpInst::ICMP_EQ, CompactMaskII, VOne);
+
+    // Transform the stored value into compact form.
+    VectorType *StoreVTy = VectorType::get(ScalarTy, State.VF);
+    const Align Alignment = getLoadStoreAlignment(&Ingredient);
+    Value *Addr = State.get(VPAddr, VPIteration(Part, 0));
+    Value *StoredValue = State.get(VPStoredValue, Part);
+    Value *SCompact = createTargetCompact(Builder, M, TTI, Mask, StoredValue);
+    assert(SCompact && "Compact is not supported by the current target.");
+    Instruction *CompactSI =
+        Builder.CreateMaskedStore(SCompact, Addr, Alignment, CompactCmpII);
+    State.addMetadata(CompactSI, SI);
+  }
+}
+
+void VPWidenCompactInstructionRecipe::genCompactInc(VPTransformState &State) {
+  auto &Builder = State.Builder;
+  Module *M = getUnderlyingInstr()->getModule();
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Value *Mask = State.get(getMask(), Part);
+    Constant *PTrue = ConstantInt::getTrue(cast<VectorType>(Mask->getType()));
+    Value *CNTPCall = createTargetCNTP(Builder, M, TTI, PTrue, Mask);
+    Value *Idx = nullptr;
+    if (Part == 0)
+      Idx = State.get(getOperand(0), Part);
+    else
+      Idx = State.get(this, Part - 1);
+    Value *TruncCall = CNTPCall;
+    if (Idx->getType() != CNTPCall->getType()) {
+      TruncCall = Builder.CreateTrunc(CNTPCall, Idx->getType());
+    }
+    Value *NewInc = Builder.CreateAdd(cast(Idx), TruncCall);
+    State.set(this, NewInc, Part);
+  }
+}
+
+void VPWidenCompactInstructionRecipe::genCompactLiveOut(
+    VPTransformState &State) {
+  // Get the exit value of the phi.
+  VPValue *VPExitValue = nullptr;
+  PHINode *Phi = cast<PHINode>(&Ingredient);
+  for (unsigned Idx = 0; Idx < Phi->getNumIncomingValues(); Idx++) {
+    PHINode *PhiOp =
+        dyn_cast_or_null<PHINode>(getOperand(Idx)->getUnderlyingValue());
+    if (!PhiOp) {
+      VPExitValue = getOperand(Idx);
+      break;
+    }
+  }
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    Value *ExitVal = State.get(VPExitValue, Part);
+    State.set(this, ExitVal, Part);
+  }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPCompactPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+                               VPSlotTracker &SlotTracker) const {
+  O << Indent << "COMPACT-PHI ";
+  printAsOperand(O, SlotTracker);
+  O << " = phi ";
+  printOperands(O, SlotTracker);
+}
+#endif
+
+void VPCompactPHIRecipe::execute(VPTransformState &State) {
+  BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
+  BasicBlock *VectorHeader = State.CFG.PrevBB;
+  VPValue *StartVPV = getStartValue();
+  Value *Start = StartVPV->getLiveInIRValue();
+  for (unsigned Part = 0; Part < State.UF; ++Part) {
+    PHINode *Entry = PHINode::Create(Start->getType(), 2, "compact.iv",
+                                     &*VectorHeader->getFirstInsertionPt());
+    Entry->addIncoming(Start, VectorPH);
+    State.set(this, Entry, Part);
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
                                           VPSlotTracker &SlotTracker) const {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -348,6 +348,7 @@
     VPWidenCastSC,
     VPWidenGEPSC,
     VPWidenMemoryInstructionSC,
+    VPCompactInstructionSC,
     VPWidenSC,
     VPWidenSelectSC,
     // START: Phi-like recipes. Need to be kept together.
@@ -361,6 +362,7 @@ VPWidenPHISC, VPWidenIntOrFpInductionSC, VPWidenPointerInductionSC, + VPCompactPHISC, VPReductionPHISC, // END: SubclassID for recipes that inherit VPHeaderPHIRecipe // END: Phi-like recipes diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll b/llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/compact-vplan.ll @@ -0,0 +1,78 @@ +; REQUIRES: asserts +; RUN: opt -passes=loop-vectorize -debug -disable-output %s 2>&1 < %s | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +; CHECK-LABEL: 'kernel_reference' +; CHECK: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = vector-trip-count +; CHECK-NEXT: vp<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ph: +; CHECK-NEXT: EMIT vp<%1> = EXPAND SCEV (zext i32 %N to i64) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION +; CHECK-NEXT: COMPACT-PHI ir<%n.013> = phi ir<0>, ir<%n.1> +; CHECK-NEXT: vp<%4> = SCALAR-STEPS vp<%2>, ir<1> +; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%comp>, vp<%4> +; CHECK-NEXT: WIDEN ir<%0> = load ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%cmp1> = icmp slt ir<%0>, ir<%a> +; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%B>, vp<%4> +; CHECK-NEXT: WIDEN ir<%1> = load ir<%arrayidx3>, ir<%cmp1> +; CHECK-NEXT: COMPACT ir<%inc> = add ir<%n.013>, ir<1> +; CHECK-NEXT: CLONE ir<%idxprom4> = sext ir<%n.013> +; CHECK-NEXT: CLONE ir<%arrayidx5> = getelementptr inbounds ir<%Out_ref>, ir<%idxprom4> +; CHECK-NEXT: COMPACT store ir<%1>, ir<%arrayidx5> +; CHECK-NEXT: COMPACT ir<%n.1> = phi ir<%inc>, ir<%n.013> +; CHECK-NEXT: EMIT vp<%15> = VF * UF + nuw vp<%2> +; CHECK-NEXT: EMIT branch-on-count vp<%15>, vp<%0> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; +; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16) +define dso_local i32 @kernel_reference(i32 noundef %N, i32 noundef %a, ptr noalias nocapture noundef readonly %comp, ptr noalias nocapture noundef writeonly %Out_ref, ptr nocapture noundef readonly %B, ptr noalias nocapture noundef readnone %Out1) #0 { +entry: + %cmp11 = icmp sgt i32 %N, 0 + br i1 %cmp11, label %for.body.preheader, label %for.end + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.body: ; preds = %for.body.preheader, %for.inc + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %n.013 = phi i32 [ 0, %for.body.preheader ], [ %n.1, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %comp, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %0, %a + br i1 %cmp1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %arrayidx3 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %1 = load i32, ptr %arrayidx3, align 4 + %inc = add nsw i32 %n.013, 1 + %idxprom4 = sext i32 %n.013 to i64 + %arrayidx5 = getelementptr inbounds i32, ptr %Out_ref, i64 %idxprom4 + store i32 %1, ptr %arrayidx5, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %n.1 = phi i32 [ %inc, %if.then ], [ %n.013, %for.body ] + 
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.inc, %entry + %n.0.lcssa = phi i32 [ 0, %entry ], [ %n.1, %for.inc ] + ret i32 %n.0.lcssa +} + +attributes #0 = { argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16) "target-cpu"="generic" "target-features"="+neon,+sve,+v8.2a"} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll b/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/compact.ll @@ -4,6 +4,12 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" +; for (i = 0; i < N; i++){ +; x = comp[i]; +; if(x < a) Out[n++] = B[i]; +; } +; return n; + ; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable vscale_range(1,16) define dso_local i32 @kernel_reference(i32 noundef %N, i32 noundef %a, ptr noalias nocapture noundef readonly %comp, ptr noalias nocapture noundef writeonly %Out_ref, ptr nocapture noundef readonly %B, ptr noalias nocapture noundef readnone %Out1) #0 { ; CHECK-LABEL: @kernel_reference( @@ -12,29 +18,97 @@ ; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.preheader: ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[A:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[COMPACT_IV:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[COMP:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[COMP]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp slt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP17:%.*]] = icmp slt [[WIDE_LOAD2]], 
[[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[B]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP20]], i32 4, [[TMP16]], poison) +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP22]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP23]], i32 4, [[TMP17]], poison) +; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.aarch64.sve.cntp.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP16]]) +; CHECK-NEXT: [[TMP25:%.*]] = trunc i64 [[TMP24]] to i32 +; CHECK-NEXT: [[TMP26:%.*]] = add i32 [[COMPACT_IV]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.aarch64.sve.cntp.nxv4i1( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), [[TMP17]]) +; CHECK-NEXT: [[TMP28:%.*]] = trunc i64 [[TMP27]] to i32 +; CHECK-NEXT: [[TMP29]] = add i32 [[TMP26]], [[TMP28]] +; CHECK-NEXT: [[TMP30:%.*]] = sext i32 [[COMPACT_IV]] to i64 +; CHECK-NEXT: [[TMP31:%.*]] = sext i32 [[TMP26]] to i64 +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF:%.*]], i64 [[TMP30]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF]], i64 [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = call @llvm.aarch64.sve.compact.nxv4i32( [[TMP16]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq [[TMP34]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP36:%.*]] = call @llvm.aarch64.sve.compact.nxv4i32( [[TMP16]], [[WIDE_MASKED_LOAD]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP36]], ptr [[TMP32]], i32 4, [[TMP35]]) +; CHECK-NEXT: [[TMP37:%.*]] = call @llvm.aarch64.sve.compact.nxv4i32( [[TMP17]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP38:%.*]] = icmp eq [[TMP37]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP39:%.*]] = call @llvm.aarch64.sve.compact.nxv4i32( [[TMP17]], [[WIDE_MASKED_LOAD3]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP39]], ptr [[TMP33]], i32 4, [[TMP38]]) +; CHECK-NEXT: [[TMP40:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP41:%.*]] = mul i64 [[TMP40]], 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP41]] +; CHECK-NEXT: [[TMP42:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP42]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[COMPACT_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: [[N_013:%.*]] = phi i32 [ 0, 
[[FOR_BODY_PREHEADER]] ], [ [[N_1:%.*]], [[FOR_INC]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COMP:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP0]], [[A:%.*]] +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[N_013:%.*]] = phi i32 [ [[COMPACT_RDX]], [[SCALAR_PH]] ], [ [[N_1:%.*]], [[FOR_INC]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[COMP]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP43:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[TMP43]], [[A]] ; CHECK-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP44:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 ; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[N_013]], 1 ; CHECK-NEXT: [[IDXPROM4:%.*]] = sext i32 [[N_013]] to i64 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF:%.*]], i64 [[IDXPROM4]] -; CHECK-NEXT: store i32 [[TMP1]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[OUT_REF]], i64 [[IDXPROM4]] +; CHECK-NEXT: store i32 [[TMP44]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: ; CHECK-NEXT: [[N_1]] = phi i32 [ [[INC]], [[IF_THEN]] ], [ [[N_013]], [[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.end.loopexit: -; CHECK-NEXT: [[N_1_LCSSA:%.*]] = phi i32 [ [[N_1]], [[FOR_INC]] ] +; CHECK-NEXT: [[N_1_LCSSA:%.*]] = phi i32 [ [[N_1]], [[FOR_INC]] ], [ [[TMP29]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[N_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[N_1_LCSSA]], [[FOR_END_LOOPEXIT]] ]