diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -772,6 +772,8 @@
     /// Once this function is called, no other functions on this object are
     /// valid; the loop has been removed.
     virtual void disposed() = 0;
+
+    virtual unsigned getTripCountReg() { return 0; }
   };
 
   /// Analyze loop L, which must be a single-basic-block loop, and if the
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -820,6 +820,16 @@
       }
       MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
       MachineMemOperand *MMO2 = *MI.memoperands_begin();
+
+      // Workaround for loop distribution. Verification is still required.
+      if (MMO1->getValue() && MMO2->getPseudoValue()) {
+        if (!MMO2->getPseudoValue()->mayAlias(&MFI))
+          continue;
+      } else if (MMO2->getValue() && MMO1->getPseudoValue()) {
+        if (!MMO1->getPseudoValue()->mayAlias(&MFI))
+          continue;
+      }
+
       if (!MMO1->getValue() || !MMO2->getValue()) {
         SDep Dep(Load, SDep::Barrier);
         Dep.setLatency(1);
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -44,6 +44,7 @@
 FunctionPass *createAArch64SpeculationHardeningPass();
 FunctionPass *createAArch64KCFIPass();
 FunctionPass *createAArch64LoadStoreOptimizationPass();
+FunctionPass *createAArch64LoopDistribute();
 ModulePass *createAArch64LowerHomogeneousPrologEpilogPass();
 FunctionPass *createAArch64SIMDInstrOptPass();
 ModulePass *createAArch64PromoteConstantPass();
@@ -88,6 +89,7 @@
 void initializeAArch64GlobalsTaggingPass(PassRegistry &);
 void initializeAArch64KCFIPass(PassRegistry &);
 void initializeAArch64LoadStoreOptPass(PassRegistry&);
+void initializeAArch64LoopDistributePass(PassRegistry &);
 void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
 void initializeAArch64MIPeepholeOptPass(PassRegistry &);
 void initializeAArch64O0PreLegalizerCombinerPass(PassRegistry &);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -223,6 +223,10 @@
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+
+  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+  analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const override;
+
   bool
   reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
   bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -14,6 +14,7 @@
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
@@ -24,6 +25,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
@@ -8327,6 +8329,68 @@
   return AArch64::BLR;
 }
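+// Pipeliner support: model the loop's compare and conditional branch so that
+// MachinePipeliner can ignore them while scheduling and recreate the
+// loop-control check itself.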
+namespace {
+class AArch64PipelinerLoopInfo : public TargetInstrInfo::PipelinerLoopInfo {
+  MachineInstr *Comp, *CondBranch;
+  MachineBasicBlock *TBB;
+  SmallVector<MachineOperand, 4> BrCond;
+  const TargetInstrInfo *TII;
+
+public:
+  AArch64PipelinerLoopInfo(MachineInstr *Comp, MachineInstr *CondBranch,
+                           MachineBasicBlock *TBB,
+                           const SmallVectorImpl<MachineOperand> &BrCond)
+      : Comp(Comp), CondBranch(CondBranch), TBB(TBB),
+        BrCond(BrCond.begin(), BrCond.end()),
+        TII(CondBranch->getParent()->getParent()->getSubtarget()
+                .getInstrInfo()) {}
+
+  bool shouldIgnoreForPipelining(const MachineInstr *MI) const override {
+    return MI == Comp || MI == CondBranch;
+  }
+
+  std::optional<bool> createTripCountGreaterCondition(
+      int TC, MachineBasicBlock &MBB,
+      SmallVectorImpl<MachineOperand> &Cond) override {
+    Cond = BrCond;
+    if (TBB == CondBranch->getParent())
+      TII->reverseBranchCondition(Cond);
+    return {};
+  }
+
+  void setPreheader(MachineBasicBlock *NewPreheader) override {}
+
+  void adjustTripCount(int TripCountAdjust) override {}
+
+  void disposed() override {}
+};
+} // namespace
+
+std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo>
+AArch64InstrInfo::analyzeLoopForPipelining(MachineBasicBlock *LoopBB) const {
+  MachineBasicBlock *TBB, *FBB;
+  SmallVector<MachineOperand, 4> BrCond;
+  if (analyzeBranch(*LoopBB, TBB, FBB, BrCond))
+    return nullptr;
+
+  MachineInstr *CondBranch = &*LoopBB->getFirstTerminator();
+  if (CondBranch->getOpcode() != AArch64::Bcc)
+    return nullptr;
+
+  // The compare paired with the branch is the last NZCV def before the
+  // terminators.
+  MachineInstr *Comp = nullptr;
+  for (auto I = LoopBB->getFirstTerminator(); I != LoopBB->begin(); --I) {
+    if (I->findRegisterDefOperandIdx(AArch64::NZCV) != -1) {
+      Comp = &*I;
+      break;
+    }
+  }
+  if (!Comp)
+    return nullptr;
+
+  return std::make_unique<AArch64PipelinerLoopInfo>(Comp, CondBranch, TBB,
+                                                    BrCond);
+}
+
 #define GET_INSTRINFO_HELPERS
 #define GET_INSTRMAP_INFO
 #include "AArch64GenInstrInfo.inc"
diff --git a/llvm/lib/Target/AArch64/AArch64LoopDistribute.cpp b/llvm/lib/Target/AArch64/AArch64LoopDistribute.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64LoopDistribute.cpp
@@ -0,0 +1,1183 @@
+//===- AArch64LoopDistribute.cpp - AArch64 Loop Distribute Pass -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A prototype of loop distribution for IPC improvement.
+//
+// This pass runs before pipelining and splits loops so that they can be
+// pipelined more efficiently. So far only a simple test program has been
+// confirmed to pipeline successfully, and further work is needed before the
+// transformation is correct in general. It is implemented for AArch64 only to
+// keep the prototype simple; the eventual goal is to make it target
+// independent.
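+// The loop is split into two single-block loops, and values that cross the
+// split are passed between them through temporary arrays allocated on the
+// stack.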
+// +//===----------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64RegisterInfo.h" +#include "MCTargetDesc/AArch64AddressingModes.h" +#include "MCTargetDesc/AArch64MCTargetDesc.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/CodeGen/LiveIntervals.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/Register.h" +#include "llvm/CodeGen/RegisterClassInfo.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGInstrs.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Alignment.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Endian.h" +#include "llvm/Support/MathExtras.h" +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +#define DEBUG_TYPE "aarch64-loop-distribute" + +static cl::opt NumRegs("loop-dist-num-regs", + cl::desc("Number of registers."), cl::Hidden, + cl::init(32)); + +namespace { +class AArch64LoopDistribute : public MachineFunctionPass { +public: + static char ID; + + MachineFunction *MF = nullptr; + const MachineLoopInfo *MLI = nullptr; + const TargetInstrInfo *TII = nullptr; + AliasAnalysis *AA; + + AArch64LoopDistribute() : MachineFunctionPass(ID) { + initializeAArch64LoopDistributePass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override; + void distributeLoop(MachineLoop &L); +}; +char AArch64LoopDistribute::ID = 0; +} // namespace + +INITIALIZE_PASS_BEGIN(AArch64LoopDistribute, DEBUG_TYPE, + "AArch64 Loop Distribute", false, false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) +INITIALIZE_PASS_END(AArch64LoopDistribute, DEBUG_TYPE, + "AArch64 Loop Distribute", false, false) + +namespace { +class Distribute : public ScheduleDAGInstrs { + const MCSubtargetInfo &STI; + const MCSchedModel &SM; + MachineRegisterInfo &MRI; + MachineFrameInfo &MFI; + std::unique_ptr PLI; + MachineBasicBlock *OrgPreheader; + + struct EstIIDetail { + int EstimatedII; + int ResMII; + double RequiredParallel; + double PossibleParallel; + int LenCritPath; + int NumRegPerIter; + int NumRegInvariant; + int NumRegOutput; + int NumTmpArray; + void print(raw_ostream &OS) { + OS << "Estimated II: " << EstimatedII << "\n" + << "Resource Min II: " << ResMII << "\n" + << "Required Num Parallel: " << RequiredParallel << "\n" + << "Possible Num Parallel: " << PossibleParallel << "\n" + << "Latency Critical Path Length: " << LenCritPath << "\n" + << "Required Num Registers Per Iter: " << NumRegPerIter << "\n" + << "Required Num Registers For Invariant: " << 
NumRegInvariant << "\n"
+         << "Required Num Registers For Output: " << NumRegOutput << "\n"
+         << "Num Temporary Arrays: " << NumTmpArray << "\n";
+    }
+  };
+
+  void regPressListSched(std::vector<SUnit *> &RPSched);
+
+  /// Handle only a specific kind of register (currently FPR64).
+  bool isTargetRegKind(MachineInstr *MI) {
+    for (MachineOperand &MO : MI->defs()) {
+      if (isTargetRegKind(MO.getReg()))
+        return true;
+    }
+    return false;
+  }
+  bool isTargetRegKind(const SDep &Dep) {
+    return Dep.isAssignedRegDep() && isTargetRegKind(Dep.getReg());
+  }
+  bool isTargetRegKind(unsigned Reg) {
+    return MRI.getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass);
+  }
+  int estII(std::vector<SUnit *>::iterator B, std::vector<SUnit *>::iterator E,
+            EstIIDetail &Detail);
+  int maxNumRegPerIter(std::vector<SUnit *>::iterator B,
+                       std::vector<SUnit *>::iterator E);
+  int lenCritPath(std::vector<SUnit *>::iterator B,
+                  std::vector<SUnit *>::iterator E);
+  int resMII(std::vector<SUnit *>::iterator B,
+             std::vector<SUnit *>::iterator E);
+  int numInvariant(std::vector<SUnit *>::iterator B,
+                   std::vector<SUnit *>::iterator E);
+  int numOutRegs(std::vector<SUnit *>::iterator B,
+                 std::vector<SUnit *>::iterator E);
+  int numTmpArrays(std::vector<SUnit *>::iterator B,
+                   std::vector<SUnit *>::iterator E);
+  void gatherPredSUs(const std::set<SUnit *> &Inside, SUnit *Root,
+                     std::set<SUnit *> &Result);
+  unsigned createCounter(MachineBasicBlock *MBB, MachineBasicBlock *Preheader);
+  void replacePhiBB(MachineInstr *MI, MachineBasicBlock *To,
+                    MachineBasicBlock *From);
+  MachineInstr *cloneInstr(MachineInstr *MI, MachineBasicBlock *To,
+                           MachineBasicBlock::iterator ToIter,
+                           std::map<Register, Register> &VRMap);
+  unsigned getLoadOpc(unsigned Reg);
+  unsigned getStoreOpc(unsigned Reg);
+  std::vector<SUnit *>::iterator findDistPoint(std::vector<SUnit *> &RPSched);
+  int countNumTmpLoads(std::vector<SUnit *>::iterator B,
+                       std::vector<SUnit *>::iterator E);
+  int countNumTmpStores(std::vector<SUnit *>::iterator B,
+                        std::vector<SUnit *>::iterator E);
+
+  bool findTripCountReg(Register *LCReg, SUnit **Comp);
+  void constructBlocks(MachineBasicBlock *&OrgExitBB, MachineBasicBlock *&BB1,
+                       MachineBasicBlock *&BB2,
+                       MachineBasicBlock *&PreheaderBB2,
+                       MachineBasicBlock *&UniqExitBB);
+  void cloneInstrs(std::vector<SUnit *> &RPSched,
+                   std::vector<SUnit *>::iterator DistPoint, SUnit *Comp,
+                   MachineBasicBlock *BB1, MachineBasicBlock *BB2,
+                   MachineBasicBlock *PreheaderBB2,
+                   std::map<Register, Register> &VRMap1,
+                   std::map<Register, Register> &VRMap2);
+  void gatherRegsStored(std::map<Register, Register> &Reg2Offset,
+                        std::vector<SUnit *> &RPSched,
+                        std::vector<SUnit *>::iterator DistPoint);
+  void calcTmpArrayOffset(Register TripCount,
+                          std::map<Register, Register> &Reg2Offset,
+                          Register &LastAddr);
+  void allocStackArea(Register ArraySize, MachineBasicBlock *UniqExitBB,
+                      Register &Addr);
+  void insertTmpLoadStore(MachineBasicBlock *BB1, MachineBasicBlock *BB2,
+                          MachineBasicBlock *PreheaderBB2,
+                          std::map<Register, Register> &Reg2Offset,
+                          Register TmpAreaTop,
+                          std::map<Register, Register> &VRMap1);
+
+public:
+  void schedule() override;
+  Distribute(MachineFunction &MF, MachineLoop &L, const MachineLoopInfo *MLI)
+      : ScheduleDAGInstrs(MF, MLI), STI(MF.getSubtarget()),
+        SM(STI.getSchedModel()), MRI(MF.getRegInfo()), MFI(MF.getFrameInfo()),
+        OrgPreheader(L.getLoopPreheader()) {
+    assert(L.getBlocks().size() == 1);
+    assert(L.getLoopPreheader());
+  }
+};
+} // namespace
+
+void AArch64LoopDistribute::distributeLoop(MachineLoop &L) {
+  if (L.getBlocks().size() != 1 || !L.getLoopPreheader())
+    return;
+
+  std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> PLI =
+      TII->analyzeLoopForPipelining(L.getTopBlock());
+  if (!PLI)
+    return;
+
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+  SmallVector<MachineOperand, 4> Cond;
+  if (TII->analyzeBranch(*L.getLoopPreheader(), TBB, FBB, Cond))
+    return;
+  if (TII->analyzeBranch(*L.getTopBlock(), TBB, FBB, Cond))
+    return;
+
+  MachineBasicBlock *MBB = L.getTopBlock();
+  LLVM_DEBUG(MBB->dump());
+  // Size of the scheduling region: everything before the first terminator.
+  unsigned Size = MBB->size();
+  for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
+                                   E = MBB->instr_end();
+       I != E; ++I, --Size)
+    ;
+  Distribute Dist(*MF, L, MLI);
+  Dist.startBlock(L.getTopBlock());
+  auto Iter = MBB->getFirstTerminator();
+  Dist.enterRegion(L.getTopBlock(), MBB->getFirstNonDebugInstr(), Iter, Size);
+  Dist.buildSchedGraph(AA);
+  Dist.dump();
+  Dist.schedule();
+}
+
+static Register getPhiLoopReg(MachineInstr &Phi) {
+  assert(Phi.getNumOperands() == 5);
+  int I;
+  if (Phi.getOperand(2).getMBB() == Phi.getParent())
+    I = 1;
+  else
+    I = 3;
+  return Phi.getOperand(I).getReg();
+}
+
+static Register getPhiInitReg(MachineInstr &Phi) {
+  assert(Phi.getNumOperands() == 5);
+  int I;
+  if (Phi.getOperand(2).getMBB() != Phi.getParent())
+    I = 1;
+  else
+    I = 3;
+  return Phi.getOperand(I).getReg();
+}
+
+unsigned Distribute::getLoadOpc(unsigned Reg) {
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  switch (TRI->getSpillSize(*RC)) {
+  case 1:
+    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+      return AArch64::LDRBui;
+    break;
+  case 2:
+    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+      return AArch64::LDRHui;
+    break;
+  case 4:
+    if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+      return AArch64::LDRSui;
+    break;
+  case 8:
+    if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+      return AArch64::LDRDui;
+    break;
+  case 16:
+    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+      return AArch64::LDRQui;
+    break;
+  }
+  std::abort();
+}
+
+unsigned Distribute::getStoreOpc(unsigned Reg) {
+  const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+  switch (TRI->getSpillSize(*RC)) {
+  case 1:
+    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+      return AArch64::STRBui;
+    break;
+  case 2:
+    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+      return AArch64::STRHui;
+    break;
+  case 4:
+    if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+      return AArch64::STRSui;
+    break;
+  case 8:
+    if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+      return AArch64::STRDui;
+    break;
+  case 16:
+    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+      return AArch64::STRQui;
+    break;
+  }
+  std::abort();
+}
+
+/// Repeatedly gather preceding nodes from Root, stopping at nodes contained in
+/// StopNodes.
+void Distribute::gatherPredSUs(const std::set<SUnit *> &StopNodes, SUnit *Root,
+                               std::set<SUnit *> &Result) {
+  std::set<SUnit *> Processing;
+  Processing.insert(Root);
+  Result.insert(Root);
+  while (Processing.size()) {
+    SUnit *SU = *Processing.begin();
+    Processing.erase(SU);
+    if (SU->getInstr()->isPHI()) {
+      SUnit *LoopPhi = getSUnit(MRI.getVRegDef(getPhiLoopReg(*SU->getInstr())));
+      assert(LoopPhi);
+      if (StopNodes.count(LoopPhi) || Result.count(LoopPhi))
+        continue;
+      Result.insert(LoopPhi);
+      Processing.insert(LoopPhi);
+      continue;
+    }
+    for (SDep &Pred : SU->Preds) {
+      SUnit *PredSU = Pred.getSUnit();
+      if (PredSU->isBoundaryNode() || StopNodes.count(PredSU) ||
+          Result.count(PredSU))
+        continue;
+      Result.insert(PredSU);
+      Processing.insert(PredSU);
+    }
+  }
+}
+
+/// Create a loop counter starting from 0 and increasing by 1.
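+/// The counters created here are used to index the temporary arrays that
+/// carry values from the first loop to the second.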
+unsigned Distribute::createCounter(MachineBasicBlock *MBB, + MachineBasicBlock *Preheader) { + Register Zero = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*Preheader, Preheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::MOVi64imm), Zero) + .addImm(0); + Register PhiReg = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + Register LoopReg = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*MBB, MBB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), + PhiReg) + .addReg(Zero) + .addMBB(Preheader) + .addReg(LoopReg) + .addMBB(MBB); + BuildMI(*MBB, MBB->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ADDXri), LoopReg) + .addReg(PhiReg) + .addImm(1) + .addImm(0); + return PhiReg; +} + +void Distribute::replacePhiBB(MachineInstr *MI, MachineBasicBlock *To, + MachineBasicBlock *From) { + for (MachineOperand &MO : MI->operands()) + if (MO.isMBB() && MO.getMBB() == From) + MO.setMBB(To); +} + +MachineInstr *Distribute::cloneInstr(MachineInstr *MI, MachineBasicBlock *To, + MachineBasicBlock::iterator ToIter, + std::map &VRMap) { + MachineInstr *NewMI = MF.CloneMachineInstr(MI); + for (MachineOperand &MO : NewMI->defs()) { + if (!MO.isReg() || !MO.getReg().isVirtual()) + continue; + Register Reg = MO.getReg(); + const TargetRegisterClass *RC = MRI.getRegClass(Reg); + Register NewReg = MRI.createVirtualRegister(RC); + VRMap[Reg] = NewReg; + MO.setReg(NewReg); + } + if (NewMI->isPHI()) + for (MachineOperand &MO : NewMI->operands()) + if (MO.isMBB() && MO.getMBB() == MI->getParent()) + MO.setMBB(To); + To->insert(ToIter, NewMI); + return NewMI; +} + +/// Return the division point that minimizes II +std::vector::iterator +Distribute::findDistPoint(std::vector &RPSched) { + EstIIDetail Detail1, Detail2; + int MinII = estII(RPSched.begin(), RPSched.end(), Detail1); + LLVM_DEBUG({ + dbgs() << "Original: \n"; + Detail1.print(dbgs()); + dbgs() << "\n"; + }); + + int MinDiffII = MinII; + auto DistPoint = RPSched.end(); + for (auto I = RPSched.begin(); I != RPSched.end(); ++I) { + EstIIDetail DetailTmp1, DetailTmp2; + int II1 = estII(RPSched.begin(), I, DetailTmp1); + if (II1 == 0) + continue; + int II2 = estII(I, RPSched.end(), DetailTmp2); + if (II2 == 0) + continue; + int II = II1 + II2; + + if (II < MinII || + (II == MinII && DetailTmp1.NumTmpArray < Detail1.NumTmpArray) || + (II == MinII && DetailTmp2.NumTmpArray == Detail1.NumTmpArray && + std::abs(II1 - II2) < MinDiffII)) { + MinII = II; + DistPoint = I; + MinDiffII = std::abs(II1 - II2); + Detail1 = DetailTmp1; + Detail2 = DetailTmp2; + } + } + + LLVM_DEBUG({ + dbgs() << "First Block: \n"; + Detail1.print(dbgs()); + for (auto I = RPSched.begin(); I != DistPoint; ++I) + (*I)->getInstr()->dump(); + dbgs() << "\n"; + + dbgs() << "Second Block: \n"; + Detail2.print(dbgs()); + for (auto I = DistPoint; I != RPSched.end(); ++I) + (*I)->getInstr()->dump(); + dbgs() << "\n"; + }); + + return DistPoint; +} + +/// Returns a register representing the trip count. +/// A simple implementation that may return an incorrect result. 
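+/// It takes the last compare in the block and assumes that its first source
+/// operand is defined by a PHI whose incoming value from the preheader is the
+/// trip count.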
+bool Distribute::findTripCountReg(Register *TripCount, SUnit **Comp) { + for (auto I = BB->rbegin(); I != BB->rend(); ++I) { + if (I->isCompare()) { + *Comp = getSUnit(&*I); + MachineInstr *CompMI = (*Comp)->getInstr(); + MachineOperand &TC = CompMI->getOperand(1); + if (!TC.isReg()) + return false; + MachineInstr *Phi = MRI.getVRegDef(TC.getReg()); + if (!Phi->isPHI()) + return false; + *TripCount = getPhiInitReg(*Phi); + return true; + } + } + return false; +} + +/// Insert new blocks and branches and modify the CFG. +void Distribute::constructBlocks(MachineBasicBlock *&OrgExitBB, + MachineBasicBlock *&BB1, + MachineBasicBlock *&BB2, + MachineBasicBlock *&PreheaderBB2, + MachineBasicBlock *&UniqExitBB) { + // OrgPreheader: + // b BB1 + // + // BB1: + // b.cc BB1 + // b PreheaderBB2 + // + // PreheaderBB2: + // b BB2 + // + // BB2: + // b.cc BB2 + // b UniqExitBB + // + // UniqExitBB: + // b OrgExitBB + + BB1 = MF.CreateMachineBasicBlock(BB->getBasicBlock()); + BB2 = MF.CreateMachineBasicBlock(BB->getBasicBlock()); + PreheaderBB2 = MF.CreateMachineBasicBlock(BB->getBasicBlock()); + UniqExitBB = MF.CreateMachineBasicBlock(BB->getBasicBlock()); + MF.insert(BB->getIterator(), BB1); + MF.insert(BB->getIterator(), PreheaderBB2); + MF.insert(BB->getIterator(), BB2); + + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + SmallVector Cond; + TII->analyzeBranch(*OrgPreheader, TBB, FBB, Cond); + assert(!FBB); + if (TBB) { + TII->removeBranch(*OrgPreheader); + TII->insertUnconditionalBranch(*OrgPreheader, BB1, DebugLoc()); + } + TII->analyzeBranch(*BB, TBB, FBB, Cond); + TII->insertBranch(*BB1, TBB == BB ? BB1 : PreheaderBB2, + TBB == BB ? PreheaderBB2 : BB1, Cond, DebugLoc()); + TII->insertBranch(*BB2, TBB == BB ? BB2 : UniqExitBB, + TBB == BB ? UniqExitBB : BB2, Cond, DebugLoc()); + + if (!FBB) + FBB = BB->getNextNode(); + + if (TBB == BB) + OrgExitBB = FBB; + else + OrgExitBB = TBB; + + MF.insert(OrgExitBB->getIterator(), UniqExitBB); + TII->insertUnconditionalBranch(*UniqExitBB, OrgExitBB, DebugLoc()); + TII->insertUnconditionalBranch(*PreheaderBB2, BB2, DebugLoc()); + + OrgPreheader->replaceSuccessor(BB, BB1); + BB1->addSuccessor(BB1); + BB1->addSuccessor(PreheaderBB2); + PreheaderBB2->addSuccessor(BB2); + BB2->addSuccessor(BB2); + BB2->addSuccessor(UniqExitBB); + BB->removeSuccessor(OrgExitBB); + UniqExitBB->addSuccessor(OrgExitBB); +} + +/// Clone instructions to new blocks. +void Distribute::cloneInstrs(std::vector &RPSched, + std::vector::iterator DistPoint, + SUnit *Comp, MachineBasicBlock *BB1, + MachineBasicBlock *BB2, + MachineBasicBlock *PreheaderBB2, + std::map &VRMap1, + std::map &VRMap2) { + + // Clone instructions for each division + for (auto I = RPSched.begin(); I != DistPoint; ++I) + cloneInstr((*I)->getInstr(), BB1, BB1->getFirstTerminator(), VRMap1); + for (auto I = DistPoint; I != RPSched.end(); ++I) + cloneInstr((*I)->getInstr(), BB2, BB2->getFirstTerminator(), VRMap2); + + std::set BB1SUs(RPSched.begin(), DistPoint), + BB2SUs(DistPoint, RPSched.end()); + std::set CloneSUs1, CloneSUs2; + + // Gather instructions for loop control + if (!BB1SUs.count(Comp)) + gatherPredSUs(BB1SUs, Comp, CloneSUs1); + if (!BB2SUs.count(Comp)) + gatherPredSUs(BB2SUs, Comp, CloneSUs2); + + // Gather instructions contained in a cycle that crosses a division. + // Assume they are all trivial instructions. 
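+  // Such cycles are rooted at PHIs; the chains feeding them are duplicated
+  // into each block instead of being passed through the temporary arrays.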
+ for (auto I = RPSched.begin(); I != DistPoint; ++I) { + for (auto Dep : (*I)->Preds) { + if (Dep.getSUnit()->isBoundaryNode() || + !Dep.getSUnit()->getInstr()->isPHI()) + continue; + gatherPredSUs(BB1SUs, Dep.getSUnit(), CloneSUs1); + } + } + for (auto I = DistPoint; I != RPSched.end(); ++I) { + for (auto Dep : (*I)->Preds) { + if (Dep.getSUnit()->isBoundaryNode()) + continue; + if (Dep.getSUnit()->getInstr()->isPHI()) + gatherPredSUs(BB2SUs, Dep.getSUnit(), CloneSUs2); + else if (!BB2SUs.count(Dep.getSUnit()) && !isTargetRegKind(Dep)) + gatherPredSUs(BB2SUs, Dep.getSUnit(), CloneSUs2); + } + } + + // Clone gathered instructions. + for (SUnit *I : RPSched) + if (CloneSUs1.count(I)) + cloneInstr(I->getInstr(), BB1, BB1->getFirstTerminator(), VRMap1); + for (SUnit *I : CloneSUs1) + if (I->getInstr()->isPHI()) + cloneInstr(I->getInstr(), BB1, BB1->begin(), VRMap1); + for (SUnit *I : RPSched) + if (CloneSUs2.count(I)) + cloneInstr(I->getInstr(), BB2, BB2->getFirstTerminator(), VRMap2); + for (SUnit *I : CloneSUs2) + if (I->getInstr()->isPHI()) { + MachineInstr *NewMI = + cloneInstr(I->getInstr(), BB2, BB2->begin(), VRMap2); + replacePhiBB(NewMI, PreheaderBB2, OrgPreheader); + } + + // Replace references to new VRs. + for (auto &I : VRMap2) + // References outside the loop use the result of the division on the lower + // side. + for (auto &MO : llvm::make_early_inc_range(MRI.use_operands(I.first))) + if (MO.getParent()->getParent() != BB1) + MO.setReg(I.second); + for (auto &I : VRMap1) + for (auto &MO : llvm::make_early_inc_range(MRI.use_operands(I.first))) + if (MO.getParent()->getParent() != BB2) + MO.setReg(I.second); +} + +/// Gather registers across the division of the target register kind. +void Distribute::gatherRegsStored(std::map &Reg2Offset, + std::vector &RPSched, + std::vector::iterator DistPoint) { + std::set BB1SUs(RPSched.begin(), DistPoint); + for (auto I = RPSched.begin(); I != DistPoint; ++I) { + for (auto Dep : (*I)->Succs) + if (!Dep.getSUnit()->isBoundaryNode() && !BB1SUs.count(Dep.getSUnit()) && + isTargetRegKind(Dep)) { + Register Reg = Dep.getReg(); + for (auto Def : (*I)->getInstr()->defs()) + if (Def.isReg() && Def.getReg() == Reg) + Reg2Offset.insert({Def.getReg(), 0}); + } + } +} + +/// Calculate offsets of each temporary arrays. +void Distribute::calcTmpArrayOffset(Register TripCount, + std::map &Reg2Offset, + Register &ArraySize) { + Register TripCountCommon = + MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::COPY), TripCountCommon) + .addReg(TripCount); + + unsigned AlignSize = 8; + Register Zero = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::MOVi64imm), Zero) + .addImm(0); + Register Cur = Zero; + for (auto &I : Reg2Offset) { + // offset0 = 0 + // offset1 = sizeof(type0)*trip_count + // offset2 = (offset1+7)&-8 + sizeof(type1)*trip_count + // ... 
+ I.second = Cur; + const TargetRegisterClass *RC = MRI.getRegClass(I.first); + unsigned Size = TRI->getRegSizeInBits(*RC) / 8; + Register SizeReg = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::MOVi64imm), SizeReg) + .addImm(Size); + + Register MultReg = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::MADDXrrr), MultReg) + .addReg(TripCountCommon) + .addReg(SizeReg) + .addReg(Cur); + + Register AddReg2 = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ADDXri), AddReg2) + .addReg(MultReg) + .addImm(AlignSize - 1) + .addImm(0); + + Register AndReg = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ANDXri), AndReg) + .addReg(AddReg2) + .addImm(AArch64_AM::encodeLogicalImmediate(UINT64_MAX - (AlignSize - 1), + 64)); + Cur = AndReg; + } + ArraySize = Cur; +} + +/// Allocate stack area for temporary arrays. +void Distribute::allocStackArea(Register ArraySize, + MachineBasicBlock *UniqExitBB, Register &Addr) { + // Need to confirm the correct method. + MFI.CreateVariableSizedObject(Align(1), nullptr); + MFI.setAdjustsStack(true); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ADJCALLSTACKDOWN)) + .addImm(0) + .addImm(0); + Register CopySP = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::COPY), CopySP) + .addReg(AArch64::SP); + Register SubSP = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::SUBXrr), SubSP) + .addReg(CopySP) + .addReg(ArraySize); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::COPY), AArch64::SP) + .addReg(SubSP); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ADJCALLSTACKUP)) + .addImm(0) + .addImm(0); + + BuildMI(*UniqExitBB, UniqExitBB->getFirstNonPHI(), DebugLoc(), + TII->get(AArch64::COPY), AArch64::SP) + .addReg(CopySP); + + Addr = SubSP; +} + +/// Insert load and store instructions for temporary arrays. +void Distribute::insertTmpLoadStore(MachineBasicBlock *BB1, + MachineBasicBlock *BB2, + MachineBasicBlock *PreheaderBB2, + std::map &Reg2Offset, + Register TmpAreaTop, + std::map &VRMap1) { + Register FirstCounter = createCounter(BB1, OrgPreheader); + Register SecondCounter = createCounter(BB2, PreheaderBB2); + std::map LoadVRMap; + unsigned Offset = 0; + for (auto &I : Reg2Offset) { + // Array0 = TmpArrayTop + Offset0 + // FirstCounter = 0 + // BB1: + // Reg0_1 = ... + // store(Reg0_1, Array0+FirstCounter*Size0) + // FirstCounter += 1 + // ... + // SecondCounter = 0 + // BB2: + // Reg0_2 = load(Array0+SecondCounter*Size0) + // ... = ... Reg0_2 ... + // SecondCounter += 1 + // ... 
+ + Register Array = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + BuildMI(*OrgPreheader, OrgPreheader->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ADDXrr), Array) + .addReg(I.second) + .addReg(TmpAreaTop); + + Register Addr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + unsigned Shift = + llvm::Log2_64(TRI->getRegSizeInBits(*MRI.getRegClass(I.first)) / 8); + BuildMI(*BB1, BB1->getFirstTerminator(), DebugLoc(), + TII->get(AArch64::ADDXrs), Addr) + .addReg(Array) + .addReg(FirstCounter) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); + MachineInstr *Store = BuildMI(*BB1, BB1->getFirstTerminator(), DebugLoc(), + TII->get(getStoreOpc(I.first))) + .addReg(VRMap1[I.first]) + .addReg(Addr) + .addImm(0); + + Addr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass); + Register Loaded = MRI.createVirtualRegister(MRI.getRegClass(I.first)); + MachineInstr *Load = BuildMI(*BB2, BB2->getFirstNonPHI(), DebugLoc(), + TII->get(getLoadOpc(Loaded)), Loaded) + .addReg(Addr) + .addImm(0); + BuildMI(*BB2, BB2->getFirstNonPHI(), DebugLoc(), TII->get(AArch64::ADDXrs), + Addr) + .addReg(Array) + .addReg(SecondCounter) + .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift)); + + // Workaround to cause later analysis to determine that there are no + // aliases. Requires modification to correct pointer information. + Load->addMemOperand( + MF, + MF.getMachineMemOperand( + MachinePointerInfo(MF.getPSVManager().getConstantPool(), Offset), + MachineMemOperand::MOLoad, 8, Align(8))); + Store->addMemOperand( + MF, + MF.getMachineMemOperand( + MachinePointerInfo(MF.getPSVManager().getConstantPool(), Offset), + MachineMemOperand::MOStore, 8, Align(8))); + + LoadVRMap[I.first] = Loaded; + Offset += 8; + } + + for (auto &I : LoadVRMap) + for (auto &MO : llvm::make_early_inc_range(MRI.use_operands(I.first))) + if (MO.getParent()->getParent() == BB2) + MO.setReg(I.second); +} + +/// Performe loop distribution. +void Distribute::schedule() { + SUnit *Comp; + Register TripCount; + if (!findTripCountReg(&TripCount, &Comp)) + return; + + // Tentatively, schedule the instruction to minimize register pressure and + // divide it into two parts. Need some improvements: + // * Divide more flexibly based on DFG. + // * Preserve memory dependencies. + // * Ensure that cycles containing costly instructions do not cross the + // division. + // * Repeat division. 
+ + std::vector RPSched; + regPressListSched(RPSched); + + std::vector::iterator DistPoint = findDistPoint(RPSched); + if (DistPoint == RPSched.begin()) + return; + + MachineBasicBlock *OrgExitBB, *BB1, *BB2, *PreheaderBB2, *UniqExitBB; + constructBlocks(OrgExitBB, BB1, BB2, PreheaderBB2, UniqExitBB); + + std::map VRMap1, VRMap2; + cloneInstrs(RPSched, DistPoint, Comp, BB1, BB2, PreheaderBB2, VRMap1, VRMap2); + + std::map Reg2Offset; + gatherRegsStored(Reg2Offset, RPSched, DistPoint); + + Register ArraySize; + calcTmpArrayOffset(TripCount, Reg2Offset, ArraySize); + + Register TmpAreaTop; + allocStackArea(ArraySize, UniqExitBB, TmpAreaTop); + + insertTmpLoadStore(BB1, BB2, PreheaderBB2, Reg2Offset, TmpAreaTop, VRMap1); + + MF.erase(BB); +} + +/// List scheduling to minimize register pressure +void Distribute::regPressListSched(std::vector &RPSched) { + std::set Ready, Scheduled; + for (SUnit &SU : SUnits) { + if (SU.getInstr()->isPHI() || SU.getInstr()->isBranch()) { + Scheduled.insert(&SU); + continue; + } + bool IsReady = true; + for (SDep &Pred : SU.Preds) { + if (Scheduled.count(Pred.getSUnit()) == 0) { + IsReady = false; + break; + } + } + if (IsReady) + Ready.insert(&SU); + } + + while (Scheduled.size() < SUnits.size()) { + int MaxNumDead; + SUnit *Candidate = nullptr; + for (SUnit *SU : Ready) { + int NumDead = 0; + for (SDep &Pred : SU->Preds) { + bool Dead = true; + for (SDep &User : Pred.getSUnit()->Succs) { + if (User.getSUnit() != SU && Scheduled.count(User.getSUnit()) == 0) { + Dead = false; + break; + } + if (Dead) + NumDead++; + } + if (!Candidate || MaxNumDead < NumDead) { + MaxNumDead = NumDead; + Candidate = SU; + } + } + } + RPSched.push_back(Candidate); + Scheduled.insert(Candidate); + Ready.erase(Candidate); + for (SDep &Succ : Candidate->Succs) { + if (Succ.getSUnit()->isBoundaryNode() || Scheduled.count(Succ.getSUnit())) + continue; + bool IsReady = true; + for (SDep &Pred : Succ.getSUnit()->Preds) { + if (!Pred.getSUnit()->isBoundaryNode() && + Scheduled.count(Pred.getSUnit()) == 0) { + IsReady = false; + break; + } + } + if (IsReady) + Ready.insert(Succ.getSUnit()); + } + } +} + +int Distribute::numInvariant(std::vector::iterator B, + std::vector::iterator E) { + std::set Invariant; + for (auto I = B; I != E; ++I) { + for (MachineOperand &MO : (*I)->getInstr()->uses()) { + if (!MO.isReg() || !MO.getReg().isVirtual()) + continue; + if (MRI.getVRegDef(MO.getReg())->getParent() != BB) + Invariant.insert(MO.getReg()); + } + } + return Invariant.size(); +} + +int Distribute::numOutRegs(std::vector::iterator B, + std::vector::iterator E) { + int Res = 0; + for (auto I = B; I != E; ++I) { + for (MachineOperand &MO : (*I)->getInstr()->defs()) { + if (!MO.isReg() || !MO.getReg().isVirtual()) + continue; + if (MRI.getVRegDef(MO.getReg())->getParent() != BB) + Res++; + } + } + return Res; +} + +int Distribute::lenCritPath(std::vector::iterator B, + std::vector::iterator E) { + std::set Ready, Ranged(B, E); + std::map Deapth; + for (auto I = B; I != E; ++I) { + bool IsReady = true; + for (SDep &Pred : (*I)->Preds) { + if (Ranged.count(Pred.getSUnit())) { + IsReady = false; + break; + } + if (IsReady) + Ready.insert(*I); + } + } + + unsigned MaxDeapth = 0; + while (Ready.size()) { + SUnit *SU = *Ready.begin(); + Ready.erase(SU); + unsigned Max = 0; + for (SDep &Pred : SU->Preds) { + auto I = Deapth.find(Pred.getSUnit()); + if (I == Deapth.end()) + continue; + Max = std::max(Max, I->second + Pred.getLatency()); + } + Deapth[SU] = Max; + MaxDeapth = std::max(MaxDeapth, Max); + for 
(SDep &Succ : SU->Succs) { + if (Ranged.count(Succ.getSUnit()) == 0) + continue; + bool IsReady = true; + for (SDep &SPred : Succ.getSUnit()->Preds) { + if (Ranged.count(SPred.getSUnit()) && !Deapth.count(SPred.getSUnit())) { + IsReady = false; + break; + } + } + if (IsReady) + Ready.insert(Succ.getSUnit()); + } + } + return MaxDeapth; +} + +/// Calculate maximum number of registers consumed per iteration. +/// The number of registers consumed across iterations is not counted. +int Distribute::maxNumRegPerIter(std::vector::iterator B, + std::vector::iterator E) { + std::set Past, Ranged(B, E); + int MaxReg, CurReg; + MaxReg = CurReg = 0; + bool SpillAndDeadPrev = false; + for (auto I = B; I != E; ++I) { + Past.insert(*I); + if (SpillAndDeadPrev) { + CurReg--; + SpillAndDeadPrev = false; + } + if (isTargetRegKind((*I)->getInstr())) { + CurReg++; + bool HasNoUser = true; + for (SDep &D : (*I)->Succs) { + if (Ranged.count(D.getSUnit())) { + HasNoUser = false; + break; + } + } + if (HasNoUser) + // A case where it is only used in the other division. + SpillAndDeadPrev = true; + } + for (SDep &Pred : (*I)->Preds) { + if (!isTargetRegKind(Pred)) + continue; + if (Ranged.count(Pred.getSUnit()) == 0) { + bool IsLoaded = false; + for (SDep &PSucc : Pred.getSUnit()->Succs) { + if (Past.count(PSucc.getSUnit()) && PSucc.getSUnit() != *I) { + IsLoaded = true; + break; + } + } + if (!IsLoaded) + // A case where the definition in the other division is first + // referenced. Assume other references use the same loaded register. + CurReg++; + } + bool IsDead = true; + for (SDep &PSucc : Pred.getSUnit()->Succs) { + if (Past.count(PSucc.getSUnit()) == 0 && + Ranged.count(PSucc.getSUnit())) { + IsDead = false; + break; + } + } + if (IsDead) + CurReg--; + } + MaxReg = std::max(MaxReg, CurReg); + } + return MaxReg; +} + +static void consumeResources(SmallVectorImpl &ResourceCount, + int &NumMops, const MCSchedClassDesc &SCDesc, + const MCSubtargetInfo &STI, int Times = 1) { + NumMops += SCDesc.NumMicroOps * Times; + for (const MCWriteProcResEntry &PRE : + make_range(STI.getWriteProcResBegin(&SCDesc), + STI.getWriteProcResEnd(&SCDesc))) { + ResourceCount[PRE.ProcResourceIdx] += PRE.Cycles * Times; + } +} + +int Distribute::countNumTmpLoads(std::vector::iterator B, + std::vector::iterator E) { + std::set SetSUs(B, E); + std::set CrossRegs; + for (auto I = B; I != E; ++I) + for (SDep &SD : (*I)->Preds) + if (isTargetRegKind(SD) && !SetSUs.count(SD.getSUnit())) + CrossRegs.insert(SD.getReg()); + return CrossRegs.size(); +} + +int Distribute::countNumTmpStores(std::vector::iterator B, + std::vector::iterator E) { + std::set SetSUs(B, E); + std::set CrossRegs; + for (auto I = B; I != E; ++I) { + for (SDep &SD : (*I)->Succs) + if (isTargetRegKind(SD) && !SetSUs.count(SD.getSUnit())) + CrossRegs.insert(SD.getReg()); + } + return CrossRegs.size(); +} + +/// Calculate resource Min II. +int Distribute::resMII(std::vector::iterator B, + std::vector::iterator E) { + // Same method as MachinePipeliner. + // Instructions duplicated from the other division and address calculations + // for temporary arrays are assumed to be trivial and ignored. 
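+  // The loads and stores introduced for the temporary arrays themselves are
+  // accounted for separately below.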
+ + int NumMops = 0; + SmallVector ResourceCount(SM.getNumProcResourceKinds()); + for (auto I = B; I != E; ++I) { + SUnit &SU = **I; + if (TII->isZeroCost(SU.getInstr()->getOpcode())) + continue; + + const MCSchedClassDesc *SCDesc = getSchedClass(&SU); + if (!SCDesc->isValid()) + continue; + + consumeResources(ResourceCount, NumMops, *SCDesc, STI); + } + + // Consider load/store of temporary arrays. + consumeResources(ResourceCount, NumMops, + *STI.getSchedModel().getSchedClassDesc( + TII->get(AArch64::LDRDui).getSchedClass()), + STI, countNumTmpLoads(B, E)); + consumeResources(ResourceCount, NumMops, + *STI.getSchedModel().getSchedClassDesc( + TII->get(AArch64::STRDui).getSchedClass()), + STI, countNumTmpStores(B, E)); + + int Result = (NumMops + SM.IssueWidth - 1) / SM.IssueWidth; + for (unsigned I = 1, E = SM.getNumProcResourceKinds(); I < E; ++I) { + const MCProcResourceDesc *Desc = SM.getProcResource(I); + int Cycles = (ResourceCount[I] + Desc->NumUnits - 1) / Desc->NumUnits; + if (Cycles > Result) + Result = Cycles; + } + return Result; +} + +/// Estimate the smallest II satisfying register constraints in the division. +int Distribute::estII(std::vector::iterator B, + std::vector::iterator E, EstIIDetail &Detail) { + // Consider II as the total latency per iteration divided by the number of + // overlapping iterations. The number of iterations that can overlap is + // calculated as the number of registers in the architecture divided by the + // maximum register consumption per iteration. This calculation may be + // underestimated because the areas with maximum register pressure do not + // always overlap. + + // TODO: RecMII is not considered. + + if (B == E) + return 0; + Detail.NumRegPerIter = maxNumRegPerIter(B, E); + if (Detail.NumRegPerIter == 0) + return 0; + Detail.NumRegInvariant = numInvariant(B, E); + Detail.NumRegOutput = numOutRegs(B, E); + Detail.PossibleParallel = + (double)(NumRegs - Detail.NumRegInvariant - Detail.NumRegOutput) / + Detail.NumRegPerIter; + Detail.ResMII = resMII(B, E); + if (Detail.ResMII == 0) + return 0; + Detail.LenCritPath = lenCritPath(B, E); + Detail.RequiredParallel = (double)Detail.LenCritPath / Detail.ResMII; + Detail.NumTmpArray = countNumTmpLoads(B, E) + countNumTmpStores(B, E); + if (Detail.RequiredParallel <= 1) + // No pipelining required. + Detail.EstimatedII = Detail.ResMII; + else if (Detail.PossibleParallel > Detail.RequiredParallel) + // Can achieve ResMII. + Detail.EstimatedII = + std::ceil(Detail.LenCritPath / Detail.RequiredParallel); + else + // Need to pipeline loosely due to lack of registers. 
+    Detail.EstimatedII =
+        std::ceil(Detail.LenCritPath / Detail.PossibleParallel);
+  return Detail.EstimatedII;
+}
+
+bool AArch64LoopDistribute::runOnMachineFunction(MachineFunction &Mf) {
+  MF = &Mf;
+  MLI = &getAnalysis<MachineLoopInfo>();
+  TII = MF->getSubtarget().getInstrInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  for (const auto &L : *MLI)
+    distributeLoop(*L);
+
+  return false;
+}
+
+void AArch64LoopDistribute::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineLoopInfo>();
+  AU.addRequired<AAResultsWrapperPass>();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+FunctionPass *llvm::createAArch64LoopDistribute() {
+  return new AArch64LoopDistribute();
+}
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -191,6 +191,9 @@
   bool enableMachineScheduler() const override { return true; }
   bool enablePostRAScheduler() const override { return usePostRAScheduler(); }
+  bool enableMachinePipeliner() const override;
+  bool useDFAforSMS() const override { return false; }
+
   /// Returns ARM processor family.
   /// Avoid this function! CPU specifics should be kept local to this class
   /// and preferably modeled with SubtargetFeatures or properties in
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -68,6 +68,11 @@
     ForceStreamingCompatibleSVE("force-streaming-compatible-sve",
                                 cl::init(false), cl::Hidden);
 
+static cl::opt<bool>
+    EnableMachinePipeliner("aarch64-enable-pipeliner",
+                           cl::desc("Enable Machine Pipeliner for AArch64"),
+                           cl::init(false), cl::Hidden);
+
 unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
   if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
     return OverrideVectorInsertExtractBaseCost;
@@ -479,3 +484,7 @@
   }
   return false;
 }
+
+bool AArch64Subtarget::enableMachinePipeliner() const {
+  return getSchedModel().hasInstrSchedModel() && EnableMachinePipeliner;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -172,6 +172,10 @@
                          cl::desc("Enable the AArch64 branch target pass"),
                          cl::init(true));
 
+static cl::opt<bool> EnableLoopDist("aarch64-enable-loop-dist",
+                                    cl::desc("Enable loop distribution"),
+                                    cl::init(false), cl::Hidden);
+
 static cl::opt<unsigned> SVEVectorBitsMaxOpt(
     "aarch64-sve-vector-bits-max",
     cl::desc("Assume SVE vector registers are at most this big, "
@@ -239,6 +243,7 @@
   initializeAArch64LowerHomogeneousPrologEpilogPass(*PR);
   initializeAArch64DAGToDAGISelPass(*PR);
   initializeAArch64GlobalsTaggingPass(*PR);
+  initializeAArch64LoopDistributePass(*PR);
 }
 
 //===----------------------------------------------------------------------===//
@@ -751,6 +756,11 @@
     // be register coalescer friendly.
addPass(&PeepholeOptimizerID); } + if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDist) { + addPass(createAArch64LoopDistribute()); + } + if (TM->getOptLevel() >= CodeGenOpt::Default) + addPass(&MachinePipelinerID); } void AArch64PassConfig::addPostRegAlloc() { diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -65,6 +65,7 @@ AArch64InstrInfo.cpp AArch64KCFI.cpp AArch64LoadStoreOptimizer.cpp + AArch64LoopDistribute.cpp AArch64LowerHomogeneousPrologEpilog.cpp AArch64MachineFunctionInfo.cpp AArch64MachineScheduler.cpp