diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -42,6 +42,7 @@
 FunctionPass *createAArch64IndirectThunks();
 FunctionPass *createAArch64SpeculationHardeningPass();
 FunctionPass *createAArch64LoadStoreOptimizationPass();
+ModulePass *createAArch64LowerHomogeneousPrologEpilogPass();
 FunctionPass *createAArch64SIMDInstrOptPass();
 ModulePass *createAArch64PromoteConstantPass();
 FunctionPass *createAArch64ConditionOptimizerPass();
@@ -77,6 +78,7 @@
 void initializeAArch64SLSHardeningPass(PassRegistry&);
 void initializeAArch64SpeculationHardeningPass(PassRegistry&);
 void initializeAArch64LoadStoreOptPass(PassRegistry&);
+void initializeAArch64LowerHomogeneousPrologEpilogPass(PassRegistry &);
 void initializeAArch64SIMDInstrOptPass(PassRegistry&);
 void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);
 void initializeAArch64PostLegalizerCombinerPass(PassRegistry &);
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -105,6 +105,16 @@
   }
 
 private:
+  /// Returns true if a homogeneous prolog or epilog code can be emitted
+  /// for the size optimization. If so, HOM_Prolog/HOM_Epilog pseudo
+  /// instructions are emitted in place. When Exit block is given, this check is
+  /// for epilog.
+  bool homogeneousPrologEpilog(MachineFunction &MF,
+                               MachineBasicBlock *Exit = nullptr) const;
+
+  /// Returns true if CSRs should be paired.
+  bool producePairRegisters(MachineFunction &MF) const;
+
   bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
                                       uint64_t StackBumpBytes) const;
 
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -175,6 +175,73 @@
     cl::desc("merge settag instruction in function epilog"), cl::init(true),
     cl::Hidden);
 
+cl::opt<bool> EnableHomogeneousPrologEpilog(
+    "homogeneous-prolog-epilog", cl::Hidden, cl::init(false),
+    cl::desc("Emit homogeneous prologue and epilogue for the size "
+             "optimization (default = off)"));
+
+static bool produceCompactUnwindFrame(MachineFunction &MF);
+static bool needsWinCFI(const MachineFunction &MF);
+
+/// Returns the number of bytes of argument stack that the epilogue emitted in
+/// \p MBB has to pop: the adjustment recorded on a tail-call return, or
+/// otherwise the whole argument area recorded for the function.
+static uint64_t getArgumentPopSize(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) {
+  MachineBasicBlock::iterator LastMI = MBB.getLastNonDebugInstr();
+
+  // A block that ends in a tail-call return carries its own adjustment in
+  // operand 1 of that instruction.
+  if (LastMI != MBB.end()) {
+    switch (LastMI->getOpcode()) {
+    case AArch64::TCRETURNdi:
+    case AArch64::TCRETURNri:
+    case AArch64::TCRETURNriBTI:
+      // For a tail-call in a callee-pops-arguments environment, some or all
+      // of the stack may actually be in use for the call's arguments, this is
+      // calculated during LowerCall and consumed here...
+      return LastMI->getOperand(1).getImm();
+    default:
+      break;
+    }
+  }
+
+  // ... otherwise the amount to pop is *all* of the argument space,
+  // conveniently stored in the MachineFunctionInfo by
+  // LowerFormalArguments. This will, of course, be zero for the C calling
+  // convention.
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  return FuncInfo->getArgumentStackToRestore();
+}
+
+/// Decides whether the size-optimized homogeneous prolog/epilog form may be
+/// used for \p MF. This is only a check; nothing is emitted here. When
+/// \p Exit is given, the check is for the epilog in that exit block.
+bool AArch64FrameLowering::homogeneousPrologEpilog(
+    MachineFunction &MF, MachineBasicBlock *Exit) const {
+  // Only for functions optimized for size, and only when requested.
+  if (!MF.getFunction().hasOptSize() || !EnableHomogeneousPrologEpilog)
+    return false;
+  // Not combined with the reversed CSR restore order or the red zone.
+  if (ReverseCSRRestoreSeq || EnableRedZone)
+    return false;
+  if (needsWinCFI(MF))
+    return false;
+  // Variable-sized objects and stack realignment are not handled.
+  if (MF.getFrameInfo().hasVarSizedObjects() ||
+      MF.getSubtarget().getRegisterInfo()->needsStackRealignment(MF))
+    return false;
+  // An epilog that still has to pop argument stack is not handled either.
+  const bool PopsArguments = Exit && getArgumentPopSize(MF, *Exit);
+  return !PopsArguments;
+}
+
+/// Returns true if callee-saved registers should be saved in pairs, which is
+/// the case for compact unwind frames and for homogeneous prolog/epilog.
+bool AArch64FrameLowering::producePairRegisters(MachineFunction &MF) const {
+  return produceCompactUnwindFrame(MF) || homogeneousPrologEpilog(MF);
+}
+
 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
 
 /// This is the biggest offset to the stack pointer we can encode in aarch64
@@ -472,6 +539,8 @@
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+  if (homogeneousPrologEpilog(MF))
+    return false;
 
   if (AFI->getLocalStackSize() == 0)
     return false;
@@ -1025,6 +1094,9 @@
                     {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false,
                     NeedsWinCFI, &HasWinCFI);
     NumBytes = 0;
+  } else if (homogeneousPrologEpilog(MF)) {
+    // Stack has been already adjusted.
+    NumBytes -= PrologueSaveSize;
   } else if (PrologueSaveSize != 0) {
     MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(
         MBB, MBBI, DL, TII, -PrologueSaveSize, NeedsWinCFI, &HasWinCFI);
@@ -1416,7 +1488,6 @@
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
   const TargetInstrInfo *TII = Subtarget.getInstrInfo();
   DebugLoc DL;
-  bool IsTailCallReturn = false;
   bool NeedsWinCFI = needsWinCFI(MF);
   bool HasWinCFI = false;
   bool IsFunclet = false;
@@ -1427,10 +1498,6 @@
 
   if (MBB.end() != MBBI) {
     DL = MBBI->getDebugLoc();
-    unsigned RetOpcode = MBBI->getOpcode();
-    IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
-                       RetOpcode == AArch64::TCRETURNri ||
-                       RetOpcode == AArch64::TCRETURNriBTI;
     IsFunclet = isFuncletReturnInstr(*MBBI);
   }
 
@@ -1445,21 +1512,7 @@
 
   // Initial and residual are named for consistency with the prologue. Note that
   // in the epilogue, the residual adjustment is executed first.
-  uint64_t ArgumentPopSize = 0;
-  if (IsTailCallReturn) {
-    MachineOperand &StackAdjust = MBBI->getOperand(1);
-
-    // For a tail-call in a callee-pops-arguments environment, some or all of
-    // the stack may actually be in use for the call's arguments, this is
-    // calculated during LowerCall and consumed here...
-    ArgumentPopSize = StackAdjust.getImm();
-  } else {
-    // ... otherwise the amount to pop is *all* of the argument space,
-    // conveniently stored in the MachineFunctionInfo by
-    // LowerFormalArguments. This will, of course, be zero for the C calling
-    // convention.
-    ArgumentPopSize = AFI->getArgumentStackToRestore();
-  }
+  uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB);
 
   // The stack frame should be like below,
   //
@@ -1502,6 +1555,26 @@
   // function.
   if (MF.hasEHFunclets())
     AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
+  if (homogeneousPrologEpilog(MF, &MBB)) {
+    assert(!NeedsWinCFI);
+    auto LastPopI = MBB.getFirstTerminator();
+    if (LastPopI != MBB.begin()) {
+      auto HomogeneousEpilog = std::prev(LastPopI);
+      if (HomogeneousEpilog->getOpcode() == AArch64::HOM_Epilog)
+        LastPopI = HomogeneousEpilog;
+    }
+
+    // Adjust local stack
+    uint64_t LocalStackSize = AFI->getLocalStackSize();
+    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+                    {(int64_t)LocalStackSize, MVT::i8}, TII,
+                    MachineInstr::FrameDestroy, false, NeedsWinCFI);
+
+    // SP has been already adjusted while restoring callee save regs.
+    // We've bailed-out the case with adjusting SP for arguments.
+    assert(AfterCSRPopSize == 0);
+    return;
+  }
   bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
   // Assume we can't combine the last pop with the sp restore.
 
@@ -2165,6 +2238,22 @@
     MBB.addLiveIn(AArch64::X18);
   }
 
+  if (homogeneousPrologEpilog(MF)) {
+    auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Prolog))
+                   .setMIFlag(MachineInstr::FrameSetup);
+
+    for (auto &RPI : RegPairs) {
+      MIB.addReg(RPI.Reg1, RegState::Implicit);
+      MIB.addReg(RPI.Reg2, RegState::Implicit);
+
+      // Update register live in.
+      if (!MRI.isReserved(RPI.Reg1))
+        MBB.addLiveIn(RPI.Reg1);
+      if (!MRI.isReserved(RPI.Reg2))
+        MBB.addLiveIn(RPI.Reg2);
+    }
+    return true;
+  }
   for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE;
        ++RPII) {
     RegPairInfo RPI = *RPII;
@@ -2360,6 +2449,14 @@
     for (const RegPairInfo &RPI : reverse(RegPairs))
       if (!RPI.isScalable())
         EmitMI(RPI);
+  } else if (homogeneousPrologEpilog(MF, &MBB)) {
+    auto MIB = BuildMI(MBB, MI, DL, TII.get(AArch64::HOM_Epilog))
+                   .setMIFlag(MachineInstr::FrameDestroy);
+    for (auto &RPI : RegPairs) {
+      MIB.addReg(RPI.Reg1, RegState::Implicit | RegState::Define);
+      MIB.addReg(RPI.Reg2, RegState::Implicit | RegState::Define);
+    }
+    return true;
   } else
     for (const RegPairInfo &RPI : RegPairs)
       if (!RPI.isScalable())
@@ -2429,7 +2526,7 @@
     // MachO's compact unwind format relies on all registers being stored in
     // pairs.
     // FIXME: the usual format is actually better if unwinding isn't needed.
-    if (produceCompactUnwindFrame(MF) && PairedReg != AArch64::NoRegister &&
+    if (producePairRegisters(MF) && PairedReg != AArch64::NoRegister &&
         !SavedRegs.test(PairedReg)) {
       SavedRegs.set(PairedReg);
       if (AArch64::GPR64RegClass.contains(PairedReg) &&
@@ -2508,7 +2605,7 @@
       // MachO's compact unwind format relies on all registers being stored in
       // pairs, so if we need to spill one extra for BigStack, then we need to
       // store the pair.
-      if (produceCompactUnwindFrame(MF))
+      if (producePairRegisters(MF))
         SavedRegs.set(UnspilledCSGPRPaired);
       ExtraCSSpill = UnspilledCSGPR;
     }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -3779,6 +3779,11 @@
                     Sched<[]>;
 }
 
+// Homogeneous prolog/epilog pseudos; CSRs are attached as implicit operands.
+let isPseudo = 1 in {
+  def HOM_Prolog : Pseudo<(outs), (ins), []>, Sched<[]>;
+  def HOM_Epilog : Pseudo<(outs), (ins), []>, Sched<[]>;
+}
 //===----------------------------------------------------------------------===//
 // Floating point immediate move.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64LowerHomogeneousPrologEpilog.cpp
@@ -0,0 +1,658 @@
+//===- AArch64LowerHomogeneousPrologEpilog.cpp ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that lowers homogeneous prolog/epilog instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64InstPrinter.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+
+using namespace llvm;
+
+#define AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME                           \
+  "AArch64 homogeneous prolog/epilog lowering pass"
+
+cl::opt<int> FrameHelperSizeThreshold(
+    "frame-helper-size-threshold", cl::Hidden, cl::init(2),
+    cl::desc("The minimum number of instructions that are outlined in a frame "
+             "helper (default = 2)"));
+
+namespace {
+
+class AArch64LowerHomogeneousPE {
+public:
+  const AArch64InstrInfo *TII;
+
+  AArch64LowerHomogeneousPE(Module *M, MachineModuleInfo *MMI)
+      : M(M), MMI(MMI) {}
+
+  bool run();
+  bool runOnMachineFunction(MachineFunction &Fn);
+
+private:
+  Module *M;
+  MachineModuleInfo *MMI;
+
+  bool runOnMBB(MachineBasicBlock &MBB);
+  bool runOnMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+               MachineBasicBlock::iterator &NextMBBI);
+
+  /// Lower a HOM_Prolog pseudo instruction into a helper call
+  /// or a sequence of homogeneous stores.
+  /// When an fp setup follows, it can be folded into the helper call.
+  bool lowerHOM_Prolog(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                       MachineBasicBlock::iterator &NextMBBI);
+  /// Lower a HOM_Epilog pseudo instruction into a helper call
+  /// or a sequence of homogeneous loads.
+  /// When a return follows, it can be folded into a tail call to the helper.
+  bool lowerHOM_Epilog(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                       MachineBasicBlock::iterator &NextMBBI);
+};
+
+/// ModulePass wrapper that runs AArch64LowerHomogeneousPE over a module.
+class AArch64LowerHomogeneousPrologEpilog : public ModulePass {
+public:
+  static char ID;
+
+  AArch64LowerHomogeneousPrologEpilog() : ModulePass(ID) {
+    initializeAArch64LowerHomogeneousPrologEpilogPass(
+        *PassRegistry::getPassRegistry());
+  }
+  StringRef getPassName() const override {
+    return AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME;
+  }
+  bool runOnModule(Module &M) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineModuleInfoWrapperPass>();
+    AU.addPreserved<MachineModuleInfoWrapperPass>();
+    AU.setPreservesAll();
+    ModulePass::getAnalysisUsage(AU);
+  }
+};
+
+} // end anonymous namespace
+
+char AArch64LowerHomogeneousPrologEpilog::ID = 0;
+
+INITIALIZE_PASS(AArch64LowerHomogeneousPrologEpilog,
+                "aarch64-lower-homogeneous-prolog-epilog",
+                AARCH64_LOWER_HOMOGENEOUS_PROLOG_EPILOG_NAME, false, false)
+
+bool AArch64LowerHomogeneousPrologEpilog::runOnModule(Module &M) {
+  // Nothing to do when the pass is asked to skip this module.
+  if (skipModule(M))
+    return false;
+
+  auto &MMIWP = getAnalysis<MachineModuleInfoWrapperPass>();
+  return AArch64LowerHomogeneousPE(&M, &MMIWP.getMMI()).run();
+}
+
+/// Lower the pseudos in every function of the module that has a body and an
+/// associated MachineFunction. Returns true if any of them was modified.
+bool AArch64LowerHomogeneousPE::run() {
+  bool AnyChange = false;
+  for (Function &Fn : *M) {
+    // Functions without a body have nothing to lower.
+    if (Fn.empty())
+      continue;
+
+    if (MachineFunction *MachFn = MMI->getMachineFunction(Fn))
+      AnyChange |= runOnMachineFunction(*MachFn);
+  }
+  return AnyChange;
+}
+enum FrameHelperType { Prolog, PrologFrame, Epilog, EpilogTail };
+
+/// Build a frame helper's symbol name from its kind and the CSRs it handles,
+/// e.g. OUTLINED_FUNCTION_PROLOG_x19x20 for a prolog helper saving x19 and
+/// x20. \p FpOffset is only encoded into PrologFrame helper names.
+static std::string getFrameHelperName(SmallVectorImpl<unsigned> &Regs,
+                                      FrameHelperType Type, unsigned FpOffset) {
+  std::string Name;
+  switch (Type) {
+  case FrameHelperType::Prolog:
+    Name = "OUTLINED_FUNCTION_PROLOG_";
+    break;
+  case FrameHelperType::PrologFrame:
+    Name = "OUTLINED_FUNCTION_PROLOG_FRAME" + std::to_string(FpOffset) + "_";
+    break;
+  case FrameHelperType::Epilog:
+    Name = "OUTLINED_FUNCTION_EPILOG_";
+    break;
+  case FrameHelperType::EpilogTail:
+    Name = "OUTLINED_FUNCTION_EPILOG_TAIL_";
+    break;
+  }
+
+  // Append the register names in the order given.
+  for (unsigned Reg : Regs)
+    Name += AArch64InstPrinter::getRegisterName(Reg);
+  return Name;
+}
+
+/// Create the Function and MachineFunction for a frame helper named \p Name.
+/// Return the new MachineFunction, which holds one empty MachineBasicBlock.
+static MachineFunction &createFrameHelperMachineFunction(Module *M,
+                                                         MachineModuleInfo *MMI,
+                                                         StringRef Name) {
+  LLVMContext &C = M->getContext();
+  Function *F = M->getFunction(Name);
+  assert(F == nullptr && "Function has been created before");
+  F = Function::Create(FunctionType::get(Type::getVoidTy(C), false),
+                       Function::ExternalLinkage, Name, M);
+  assert(F && "Function was null!");
+
+  // Use ODR linkage to avoid duplication.
+  F->setLinkage(GlobalValue::LinkOnceODRLinkage);
+  F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+  // Set no-opt/minsize, so we don't insert padding between outlined
+  // functions.
+  F->addFnAttr(Attribute::OptimizeNone);
+  F->addFnAttr(Attribute::NoInline);
+  F->addFnAttr(Attribute::MinSize);
+  F->addFnAttr(Attribute::Naked);
+
+  MachineFunction &MF = MMI->getOrCreateMachineFunction(*F);
+  // Remove unnecessary register liveness and set NoVRegs.
+  MF.getProperties().reset(MachineFunctionProperties::Property::TracksLiveness);
+  MF.getProperties().reset(MachineFunctionProperties::Property::IsSSA);
+  MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+  MF.getRegInfo().freezeReservedRegs(MF);
+
+  // Give the IR function a trivial body: an entry block that returns void.
+  BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
+  IRBuilder<> Builder(EntryBB);
+  Builder.CreateRetVoid();
+
+  // Insert the new, still empty, machine block into the machine function.
+  MachineBasicBlock *MBB = MF.CreateMachineBasicBlock();
+  MF.insert(MF.begin(), MBB);
+
+  return MF;
+}
+
+/// Emit a pre-indexed store-pair that pushes \p Reg2 and \p Reg1 (frame-setup).
+static void emitHomogeneousStore(MachineFunction &MF, MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator Pos,
+                                 const TargetInstrInfo &TII, unsigned Reg1,
+                                 unsigned Reg2) {
+  const bool IsFPPair = AArch64::FPR64RegClass.contains(Reg1);
+  assert(IsFPPair == AArch64::FPR64RegClass.contains(Reg2));
+  auto NewStoreMMO = [&MF] {
+    return MF.getMachineMemOperand(MachinePointerInfo::getUnknownStack(MF),
+                                   MachineMemOperand::MOStore, 8, Align(8));
+  };
+  BuildMI(MBB, Pos, DebugLoc(),
+          TII.get(IsFPPair ? AArch64::STPDpre : AArch64::STPXpre))
+      .addDef(AArch64::SP)
+      .addReg(Reg2)
+      .addReg(Reg1)
+      .addReg(AArch64::SP)
+      .addImm(-2)
+      .addMemOperand(NewStoreMMO())
+      .addMemOperand(NewStoreMMO())
+      .setMIFlag(MachineInstr::FrameSetup);
+}
+
+/// Emit a post-indexed load-pair (frame-destroy) defining \p Reg2 and \p Reg1.
+static void emitHomogeneousLoad(MachineFunction &MF, MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator Pos,
+                                const TargetInstrInfo &TII, unsigned Reg1,
+                                unsigned Reg2) {
+  bool IsFloat = AArch64::FPR64RegClass.contains(Reg1);
+  assert(!(IsFloat ^ AArch64::FPR64RegClass.contains(Reg2)));
+  int Opc = IsFloat ? AArch64::LDPDpost : AArch64::LDPXpost;
+  MachineInstrBuilder MIB = BuildMI(MBB, Pos, DebugLoc(), TII.get(Opc));
+  MIB.addDef(AArch64::SP)
+      .addReg(Reg2, RegState::Define)
+      .addReg(Reg1, RegState::Define)
+      .addReg(AArch64::SP)
+      .addImm(2)
+      .addMemOperand(
+          MF.getMachineMemOperand(MachinePointerInfo::getUnknownStack(MF),
+                                  MachineMemOperand::MOLoad, 8, Align(8)))
+      .addMemOperand(
+          MF.getMachineMemOperand(MachinePointerInfo::getUnknownStack(MF),
+                                  MachineMemOperand::MOLoad, 8, Align(8)))
+      .setMIFlag(MachineInstr::FrameDestroy);
+}
+
+/// Return the frame helper for the given Regs and frame type, creating it
+/// (and its machine code) on first use.
+/// 1) _OUTLINED_FUNCTION_PROLOG_x19x20x21x22:
+///    stp x20, x19, [sp, #-16]!
+///    stp x22, x21, [sp, #-16]!
+///    ret
+///
+/// 2) _OUTLINED_FUNCTION_PROLOG_x19x20x30x29x21x22:
+///    mov x16, x30
+///    ldp x29, x30, [sp], #16      ; Restore x29/x30 stored at the caller
+///    stp x20, x19, [sp, #-16]!
+///    stp x29, x30, [sp, #-16]!    ; Save x29/30 (NeedSaveLR = true)
+///    stp x22, x21, [sp, #-16]!
+///    br x16
+///
+/// 3) _OUTLINED_FUNCTION_PROLOG_FRAME32_x19x20x21x22:
+///    stp x20, x19, [sp, #-16]!
+///    stp x22, x21, [sp, #-16]!
+///    add fp, sp, #32
+///    ret
+///
+/// 4) _OUTLINED_FUNCTION_PROLOG_FRAME0_x19x20x30x29x21x22:
+///    mov x16, x30
+///    ldp x29, x30, [sp], #16      ; Restore x29/x30 stored at the caller
+///    stp x20, x19, [sp, #-16]!
+///    stp x29, x30, [sp, #-16]!    ; Save x29/30 (NeedSaveLR = true)
+///    stp x22, x21, [sp, #-16]!
+///    mov fp, sp
+///    br x16
+///
+/// 5) _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22:
+///    mov x16, x30
+///    ldp x22, x21, [sp], #16
+///    ldp x20, x19, [sp], #16
+///    ldp x29, x30, [sp], #16
+///    br x16
+///
+/// 6) _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20x21x22:
+///    ldp x22, x21, [sp], #16
+///    ldp x20, x19, [sp], #16
+///    ldp x29, x30, [sp], #16
+///    ret
+/// @param M,MMI module and machine module info the helper is created in
+/// @param Regs CSRs to handle; prolog kinds erase a leading LR/FP pair from it
+/// @param Type frame helper type
+/// @param FpOffset fp adjustment from sp; used by PrologFrame helpers only
+/// @return the helper function with the derived name
+static Function *getOrCreateFrameHelper(Module *M, MachineModuleInfo *MMI,
+                                        SmallVectorImpl<unsigned> &Regs,
+                                        FrameHelperType Type,
+                                        unsigned FpOffset = 0) {
+  assert(Regs.size() >= 2);
+  bool NeedSaveLR = false;
+  if (Type == FrameHelperType::Prolog || Type == FrameHelperType::PrologFrame) {
+    // When FP/LR is the first pair, it has been already saved in the caller.
+    NeedSaveLR = Regs[0] != AArch64::LR;
+    if (!NeedSaveLR) {
+      // Prolog helpers do not need to store FP/LR
+      Regs.erase(Regs.begin());
+      Regs.erase(Regs.begin());
+    }
+  }
+
+  auto Name = getFrameHelperName(Regs, Type, FpOffset);
+  auto F = M->getFunction(Name);
+  if (F)
+    return F;
+
+  auto &MF = createFrameHelperMachineFunction(M, MMI, Name);
+  MachineBasicBlock &MBB = *MF.begin();
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+  if (NeedSaveLR) {
+    // Stash LR to X16
+    BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::ORRXrs))
+        .addDef(AArch64::X16)
+        .addReg(AArch64::XZR)
+        .addUse(AArch64::LR)
+        .addImm(0);
+    // Restore FP/LR from the stack
+    emitHomogeneousLoad(MF, MBB, MBB.end(), TII, AArch64::LR, AArch64::FP);
+  }
+
+  int Size = (int)Regs.size();
+  switch (Type) {
+  case FrameHelperType::Prolog:
+    for (int I = 0; I < Size; I += 2)
+      emitHomogeneousStore(MF, MBB, MBB.end(), TII, Regs[I], Regs[I + 1]);
+    if (NeedSaveLR)
+      BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::BR))
+          .addUse(AArch64::X16);
+    else
+      BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::RET))
+          .addReg(AArch64::LR, RegState::Undef);
+    break;
+
+  case FrameHelperType::PrologFrame:
+    for (int I = 0; I < Size; I += 2)
+      emitHomogeneousStore(MF, MBB, MBB.end(), TII, Regs[I], Regs[I + 1]);
+    BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::ADDXri))
+        .addDef(AArch64::FP)
+        .addUse(AArch64::SP)
+        .addImm(FpOffset)
+        .addImm(0)
+        .setMIFlag(MachineInstr::FrameSetup);
+    if (NeedSaveLR)
+      BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::BR))
+          .addUse(AArch64::X16);
+    else
+      BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::RET))
+          .addReg(AArch64::LR, RegState::Undef);
+    break;
+
+  case FrameHelperType::Epilog:
+    // Stash LR to X16
+    BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::ORRXrs))
+        .addDef(AArch64::X16)
+        .addReg(AArch64::XZR)
+        .addUse(AArch64::LR)
+        .addImm(0);
+    // Restore CSRs in the reverse order
+    for (int I = Size - 1; I >= 0; I -= 2)
+      emitHomogeneousLoad(MF, MBB, MBB.end(), TII, Regs[I - 1], Regs[I]);
+    // Branch on X16 not to trash LR.
+    BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::BR))
+        .addUse(AArch64::X16);
+    break;
+
+  case FrameHelperType::EpilogTail:
+    // Restore CSRs in the reverse order
+    for (int I = Size - 1; I >= 0; I -= 2)
+      emitHomogeneousLoad(MF, MBB, MBB.end(), TII, Regs[I - 1], Regs[I]);
+    BuildMI(MBB, MBB.end(), DebugLoc(), TII.get(AArch64::RET))
+        .addReg(AArch64::LR, RegState::Undef);
+    break;
+  }
+
+  return M->getFunction(Name);
+}
+
+/// Recognize a frame-setup instruction that sets fp to sp plus a non-negative
+/// adjustment.
+/// @param MBBI instruction setting fp from sp.
+/// @return a valid non-negative adjustment. Or -1 for any other case.
+int getFpAdjustmentFromSp(MachineBasicBlock::iterator &MBBI) {
+  const MachineInstr &Inst = *MBBI;
+  if (!Inst.getFlag(MachineInstr::FrameSetup))
+    return -1;
+  const unsigned Opc = Inst.getOpcode();
+  const bool IsAdd = Opc == AArch64::ADDXri;
+  if (!IsAdd && Opc != AArch64::SUBXri)
+    return -1;
+
+  // The destination has to be fp and the source has to be sp.
+  const MachineOperand &Dst = Inst.getOperand(0);
+  if (!Dst.isReg() || Dst.getReg() != AArch64::FP)
+    return -1;
+  const MachineOperand &Src = Inst.getOperand(1);
+  if (!Src.isReg() || Src.getReg() != AArch64::SP)
+    return -1;
+
+  // An add yields its immediate; a sub yields the negated immediate.
+  const int Imm = Inst.getOperand(2).getImm();
+  if (IsAdd)
+    return Imm >= 0 ? Imm : -1;
+  return Imm <= 0 ? -Imm : -1;
+}
+
+/// Decide whether a HOM_Prolog/HOM_Epilog pseudo instruction should be
+/// expanded through a frame helper of the given type: LR has to be among the
+/// registers and enough instructions have to end up in the helper.
+/// @param MBB machine basic block
+/// @param NextMBBI  next instruction following HOM_Prolog/HOM_Epilog
+/// @param Regs callee save registers that are saved or restored.
+/// @param Type frame helper type
+/// @return True if a use of helper is qualified.
+static bool shouldUseFrameHelper(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator &NextMBBI,
+                                 SmallVectorImpl<unsigned> &Regs,
+                                 FrameHelperType Type) {
+  const int NumRegs = (int)Regs.size();
+  assert(NumRegs > 0 && (NumRegs % 2 == 0));
+
+  // Do not use a helper call when not saving LR.
+  if (std::find(Regs.begin(), Regs.end(), AArch64::LR) == Regs.end())
+    return false;
+
+  // # of instructions that will be outlined: one per register pair, then
+  // corrected below for what the particular helper type adds or leaves out.
+  int NumOutlined = NumRegs >> 1;
+  switch (Type) {
+  case FrameHelperType::Prolog:
+    // Prolog helper cannot save FP/LR.
+    --NumOutlined;
+    break;
+  case FrameHelperType::PrologFrame:
+    // Prolog helper cannot save FP/LR, but the fp setup that has to follow
+    // is included instead, so effectively there is no change in the count.
+    // Without a recognizable fp setup this helper type cannot be used.
+    if (NextMBBI == MBB.end())
+      return false;
+    if (getFpAdjustmentFromSp(NextMBBI) == -1)
+      return false;
+    break;
+  case FrameHelperType::Epilog:
+    // No change in the count for the regular epilog case.
+    break;
+  case FrameHelperType::EpilogTail:
+    // EpilogTail helper includes the caller's return, which therefore has
+    // to be the instruction that follows.
+    if (NextMBBI == MBB.end())
+      return false;
+    if (NextMBBI->getOpcode() != AArch64::RET_ReallyLR)
+      return false;
+    ++NumOutlined;
+    break;
+  }
+
+  return NumOutlined >= FrameHelperSizeThreshold;
+}
+
+/// Lower a HOM_Epilog pseudo instruction into a helper call, creating the
+/// helper on demand, or into a sequence of homogeneous loads emitted in place.
+/// The pseudo (and a return folded into the helper) is erased afterwards.
+///
+/// 1. With a helper including ret
+///    HOM_Epilog x30, x29, x19, x20, x21, x22              ; MBBI
+///    ret                                                  ; NextMBBI
+///    =>
+///    b _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20x21x22
+///    ...                                                  ; NextMBBI
+///
+/// 2. With a helper
+///    HOM_Epilog x30, x29, x19, x20, x21, x22
+///    =>
+///    bl _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22
+///
+/// 3. Without a helper
+///    HOM_Epilog x30, x29, x19, x20, x21, x22
+///    =>
+///    ldp x22, x21, [sp], #16
+///    ldp x20, x19, [sp], #16
+///    ldp x29, x30, [sp], #16
+bool AArch64LowerHomogeneousPE::lowerHOM_Epilog(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  auto &MF = *MBB.getParent();
+  MachineInstr &MI = *MBBI;
+
+  DebugLoc DL = MI.getDebugLoc();
+  SmallVector<unsigned, 8> Regs;
+  for (auto &MO : MI.implicit_operands())
+    if (MO.isReg())
+      Regs.push_back(MO.getReg());
+  int Size = (int)Regs.size();
+  if (Size == 0)
+    return false;
+  // Registers are in pair.
+  assert(Size % 2 == 0);
+  assert(MI.getOpcode() == AArch64::HOM_Epilog);
+
+  auto Return = NextMBBI;
+  if (shouldUseFrameHelper(MBB, NextMBBI, Regs, FrameHelperType::EpilogTail)) {
+    // When MBB ends with a return, emit a tail-call to the epilog helper
+    auto EpilogTailHelper =
+        getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::EpilogTail);
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::TCRETURNdi))
+        .addGlobalAddress(EpilogTailHelper)
+        .addImm(0)
+        .setMIFlag(MachineInstr::FrameDestroy)
+        .copyImplicitOps(MI)
+        .copyImplicitOps(*Return);
+    NextMBBI = std::next(Return);
+    Return->eraseFromParent();
+  } else if (shouldUseFrameHelper(MBB, NextMBBI, Regs,
+                                  FrameHelperType::Epilog)) {
+    // The default epilog helper case.
+    auto EpilogHelper =
+        getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::Epilog);
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+        .addGlobalAddress(EpilogHelper)
+        .setMIFlag(MachineInstr::FrameDestroy)
+        .copyImplicitOps(MI);
+  } else {
+    // Fall back to no-helper.
+    for (int I = Size - 1; I >= 0; I -= 2)
+      emitHomogeneousLoad(MF, MBB, MBBI, *TII, Regs[I - 1], Regs[I]);
+  }
+
+  MBBI->eraseFromParent();
+  return true;
+}
+
+/// Lower a HOM_Prolog pseudo instruction into a helper call, creating the
+/// helper on demand, or into a sequence of homogeneous stores emitted in place.
+/// The pseudo (and an fp setup folded into the helper) is erased afterwards.
+///
+/// 1. With a helper including frame-setup
+///    HOM_Prolog x30, x29, x19, x20, x21, x22      ; MBBI
+///    add x29, sp, #32                             ; NextMBBI
+///    =>
+///    stp x29, x30, [sp, #-16]!
+///    bl _OUTLINED_FUNCTION_PROLOG_FRAME32_x19x20x21x22
+///    ...                                          ; NextMBBI
+///
+/// 2. With a helper
+///    HOM_Prolog x30, x29, x19, x20, x21, x22
+///    =>
+///    stp x29, x30, [sp, #-16]!
+///    bl _OUTLINED_FUNCTION_PROLOG_x19x20x21x22
+///
+/// 3. Without a helper
+///    HOM_Prolog x30, x29, x19, x20, x21, x22
+///    =>
+///    stp x29, x30, [sp, #-16]!
+///    stp x20, x19, [sp, #-16]!
+///    stp x22, x21, [sp, #-16]!
+bool AArch64LowerHomogeneousPE::lowerHOM_Prolog(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    MachineBasicBlock::iterator &NextMBBI) {
+  auto &MF = *MBB.getParent();
+  MachineInstr &MI = *MBBI;
+
+  DebugLoc DL = MI.getDebugLoc();
+  SmallVector<unsigned, 8> Regs;
+  for (auto &MO : MI.implicit_operands())
+    if (MO.isReg())
+      Regs.push_back(MO.getReg());
+  int Size = (int)Regs.size();
+  if (Size == 0)
+    return false;
+  // Registers are in pair.
+  assert(Size % 2 == 0);
+  assert(MI.getOpcode() == AArch64::HOM_Prolog);
+
+  auto FpAdjustment = NextMBBI;
+  if (shouldUseFrameHelper(MBB, NextMBBI, Regs, FrameHelperType::PrologFrame)) {
+    // FP/LR is stored at the top of stack before the prolog helper call.
+    emitHomogeneousStore(MF, MBB, MBBI, *TII, AArch64::LR, AArch64::FP);
+    auto FpOffset = getFpAdjustmentFromSp(NextMBBI);
+    auto PrologFrameHelper = getOrCreateFrameHelper(
+        M, MMI, Regs, FrameHelperType::PrologFrame, FpOffset);
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+        .addGlobalAddress(PrologFrameHelper)
+        .setMIFlag(MachineInstr::FrameSetup)
+        .copyImplicitOps(MI)
+        .copyImplicitOps(*FpAdjustment)
+        .addReg(AArch64::FP, RegState::Implicit | RegState::Define)
+        .addReg(AArch64::SP, RegState::Implicit);
+    NextMBBI = std::next(FpAdjustment);
+    FpAdjustment->eraseFromParent();
+  } else if (shouldUseFrameHelper(MBB, NextMBBI, Regs,
+                                  FrameHelperType::Prolog)) {
+    // FP/LR is stored at the top of stack before the prolog helper call.
+    emitHomogeneousStore(MF, MBB, MBBI, *TII, AArch64::LR, AArch64::FP);
+    auto PrologHelper =
+        getOrCreateFrameHelper(M, MMI, Regs, FrameHelperType::Prolog);
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::BL))
+        .addGlobalAddress(PrologHelper)
+        .setMIFlag(MachineInstr::FrameSetup)
+        .copyImplicitOps(MI);
+  } else {
+    // Fall back to no-helper.
+    for (int I = 0; I < Size; I += 2)
+      emitHomogeneousStore(MF, MBB, MBBI, *TII, Regs[I], Regs[I + 1]);
+  }
+
+  MBBI->eraseFromParent();
+  return true;
+}
+
+/// Lower \p MBBI if it is a HOM_Prolog or HOM_Epilog pseudo instruction; any
+/// other instruction is left untouched.
+/// @param MBB machine basic block
+/// @param MBBI current instruction iterator
+/// @param NextMBBI next instruction iterator which can be updated
+/// @return True when IR is changed.
+bool AArch64LowerHomogeneousPE::runOnMI(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MBBI,
+                                        MachineBasicBlock::iterator &NextMBBI) {
+  const unsigned Opc = MBBI->getOpcode();
+
+  if (Opc == AArch64::HOM_Prolog)
+    return lowerHOM_Prolog(MBB, MBBI, NextMBBI);
+
+  if (Opc == AArch64::HOM_Epilog)
+    return lowerHOM_Epilog(MBB, MBBI, NextMBBI);
+
+  // Not a homogeneous prolog/epilog pseudo instruction.
+  return false;
+}
+
+bool AArch64LowerHomogeneousPE::runOnMBB(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  // The successor is captured before lowering since the current instruction
+  // may be removed by it; runOnMI may also move the successor further on.
+  for (MachineBasicBlock::iterator I = MBB.begin(), End = MBB.end();
+       I != End;) {
+    MachineBasicBlock::iterator Next = std::next(I);
+    Changed |= runOnMI(MBB, I, Next);
+    I = Next;
+  }
+  return Changed;
+}
+
+bool AArch64LowerHomogeneousPE::runOnMachineFunction(MachineFunction &MF) {
+  // The lowering routines build their instructions through TII.
+  TII = static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
+  bool Changed = false;
+  for (MachineBasicBlock &Block : MF)
+    Changed |= runOnMBB(Block);
+  return Changed;
+}
+
+ModulePass *llvm::createAArch64LowerHomogeneousPrologEpilogPass() {
+  return new AArch64LowerHomogeneousPrologEpilog();
+}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -161,6 +161,8 @@
                         cl::desc("Enable the AAcrh64 branch target pass"),
                         cl::init(true));
 
+extern cl::opt<bool> EnableHomogeneousPrologEpilog;
+
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@@ -195,6 +197,7 @@
   initializeAArch64SLSHardeningPass(*PR);
   initializeAArch64StackTaggingPass(*PR);
   initializeAArch64StackTaggingPreRAPass(*PR);
+  initializeAArch64LowerHomogeneousPrologEpilogPass(*PR);
 }
 
 //===----------------------------------------------------------------------===//
@@ -621,6 +624,9 @@
 }
 
 void AArch64PassConfig::addPreSched2() {
+  // Lower the HOM_Prolog/HOM_Epilog frame pseudo instructions.
+  if (EnableHomogeneousPrologEpilog)
+    addPass(createAArch64LowerHomogeneousPrologEpilogPass());
   // Expand some pseudo instructions to allow proper scheduling.
   addPass(createAArch64ExpandPseudoPass());
   // Use load/store pair instructions when possible.
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -53,6 +53,7 @@
   AArch64ISelLowering.cpp
   AArch64InstrInfo.cpp
   AArch64LoadStoreOptimizer.cpp
+  AArch64LowerHomogeneousPrologEpilog.cpp
   AArch64MachineFunctionInfo.cpp
   AArch64MacroFusion.cpp
   AArch64MCInstLower.cpp
diff --git a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-frame-tail.ll b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-frame-tail.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-frame-tail.ll
@@ -0,0 +1,88 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0  -homogeneous-prolog-epilog | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu  -homogeneous-prolog-epilog | FileCheck %s --check-prefixes=CHECK-SAVELR
+
+; CHECK-LABEL: __Z3foofffi:
+; CHECK:      stp     x29, x30, [sp, #-16]!
+; CHECK-NEXT: bl      _OUTLINED_FUNCTION_PROLOG_FRAME48_x19x20d8d9d10d11
+; CHECK:      bl      __Z3goof
+; CHECK:      bl      __Z3goof
+; CHECK:      b       _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20d8d9d10d11
+
+; CHECK-SAVELR-LABEL: _Z3foofffi:
+; CHECK-SAVELR:      stp     x29, x30, [sp, #-16]!
+; CHECK-SAVELR-NEXT: bl      OUTLINED_FUNCTION_PROLOG_FRAME0_x19x20x30x29d8d9d10d11
+; CHECK-SAVELR:      bl      _Z3goof
+; CHECK-SAVELR:      bl      _Z3goof
+; CHECK-SAVELR:      b       OUTLINED_FUNCTION_EPILOG_TAIL_x19x20x30x29d8d9d10d11
+
+define float @_Z3foofffi(float %b, float %x, float %y, i32 %z) ssp optsize "frame-pointer"="non-leaf" {
+entry:
+  %inc = fadd float %b, 1.000000e+00
+  %add = fadd float %inc, %x
+  %add1 = fadd float %add, %y
+  %conv = sitofp i32 %z to float
+  %sub = fsub float %add1, %conv
+  %dec = add nsw i32 %z, -1
+  %call = tail call float @_Z3goof(float %inc) #2
+  %call2 = tail call float @_Z3goof(float %sub) #2
+  %add3 = fadd float %call, %call2
+  %mul = fmul float %inc, %add3
+  %add4 = fadd float %sub, %mul
+  %conv5 = sitofp i32 %dec to float
+  %sub6 = fsub float %add4, %conv5
+  ret float %sub6
+}
+
+; CHECK-LABEL: __Z3zoov:
+; CHECK:      stp     x29, x30, [sp, #-16]!
+; CHECK:      bl      __Z3hoo
+; CHECK:      b       _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29
+
+define i32 @_Z3zoov() nounwind ssp optsize {
+  %1 = tail call i32 @_Z3hoov() #2
+  %2 = add nsw i32 %1, 1
+  ret i32 %2
+}
+
+
+declare float @_Z3goof(float) nounwind ssp optsize
+declare i32 @_Z3hoov() nounwind ssp optsize
+
+; CHECK-LABEL:  _OUTLINED_FUNCTION_PROLOG_FRAME48_x19x20d8d9d10d11:
+; CHECK:      stp     x20, x19, [sp, #-16]!
+; CHECK-NEXT: stp     d9, d8, [sp, #-16]!
+; CHECK-NEXT: stp     d11, d10, [sp, #-16]!
+; CHECK-NEXT: add     x29, sp, #48
+; CHECK-NEXT: ret
+
+; CHECK-LABEL: _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29x19x20d8d9d10d11:
+; CHECK:      ldp     d11, d10, [sp], #16
+; CHECK-NEXT: ldp     d9, d8, [sp], #16
+; CHECK-NEXT: ldp     x20, x19, [sp], #16
+; CHECK-NEXT: ldp     x29, x30, [sp], #16
+; CHECK-NEXT: ret
+
+; CHECK-LABEL: _OUTLINED_FUNCTION_EPILOG_TAIL_x30x29:
+; CHECK:      ldp     x29, x30, [sp], #16
+; CHECK-NEXT: ret
+
+; CHECK-SAVELR-LABEL:  OUTLINED_FUNCTION_PROLOG_FRAME0_x19x20x30x29d8d9d10d11:
+; CHECK-SAVELR:      mov     x16, x30
+; CHECK-SAVELR-NEXT: ldp     x29, x30, [sp], #16
+; CHECK-SAVELR-NEXT: stp     x20, x19, [sp, #-16]!
+; CHECK-SAVELR-NEXT: stp     x29, x30, [sp, #-16]!
+; CHECK-SAVELR-NEXT: stp     d9, d8, [sp, #-16]!
+; CHECK-SAVELR-NEXT: stp     d11, d10, [sp, #-16]!
+; CHECK-SAVELR-NEXT: mov     x29, sp
+; CHECK-SAVELR-NEXT: br      x16
+
+; CHECK-SAVELR-LABEL: OUTLINED_FUNCTION_EPILOG_TAIL_x19x20x30x29d8d9d10d11:
+; CHECK-SAVELR:      ldp     d11, d10, [sp], #16
+; CHECK-SAVELR-NEXT: ldp     d9, d8, [sp], #16
+; CHECK-SAVELR-NEXT: ldp     x29, x30, [sp], #16
+; CHECK-SAVELR-NEXT: ldp     x20, x19, [sp], #16
+; CHECK-SAVELR-NEXT: ret
+
+; CHECK-SAVELR-LABEL: OUTLINED_FUNCTION_EPILOG_TAIL_x30x29:
+; CHECK-SAVELR:      ldp     x29, x30, [sp], #16
+; CHECK-SAVELR-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog-no-helper.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0  -homogeneous-prolog-epilog -frame-helper-size-threshold=6 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu  -homogeneous-prolog-epilog -frame-helper-size-threshold=6 | FileCheck %s --check-prefixes=CHECK-SAVELR
+
+; CHECK-LABEL: __Z3foofffi:
+; CHECK:      stp     x29, x30, [sp, #-16]!
+; CHECK-NEXT: stp     x20, x19, [sp, #-16]!
+; CHECK-NEXT: stp     d9, d8, [sp, #-16]!
+; CHECK-NEXT: stp     d11, d10, [sp, #-16]!
+; CHECK-NEXT: add     x29, sp, #48
+; CHECK:      bl      __Z3goof
+; CHECK:      bl      __Z3goof
+; CHECK:      ldp     d11, d10, [sp], #16
+; CHECK-NEXT: ldp     d9, d8, [sp], #16
+; CHECK-NEXT: ldp     x20, x19, [sp], #16
+; CHECK-NEXT: ldp     x29, x30, [sp], #16
+; CHECK-NEXT: ret
+
+; CHECK-SAVELR-LABEL: _Z3foofffi:
+; CHECK-SAVELR:      stp     x20, x19, [sp, #-16]!
+; CHECK-SAVELR-NEXT: stp     x29, x30, [sp, #-16]!
+; CHECK-SAVELR-NEXT: stp     d9, d8, [sp, #-16]!
+; CHECK-SAVELR-NEXT: stp     d11, d10, [sp, #-16]!
+; CHECK-SAVELR-NEXT: mov     x29, sp
+; CHECK-SAVELR:      bl      _Z3goof
+; CHECK-SAVELR:      bl      _Z3goof
+; CHECK-SAVELR:      ldp     d11, d10, [sp], #16
+; CHECK-SAVELR-NEXT: ldp     d9, d8, [sp], #16
+; CHECK-SAVELR-NEXT: ldp     x29, x30, [sp], #16
+; CHECK-SAVELR-NEXT: ldp     x20, x19, [sp], #16
+; CHECK-SAVELR-NEXT: ret
+
+define float @_Z3foofffi(float %b, float %x, float %y, i32 %z) uwtable ssp optsize "frame-pointer"="non-leaf" {
+entry:
+  %inc = fadd float %b, 1.000000e+00
+  %add = fadd float %inc, %x
+  %add1 = fadd float %add, %y
+  %conv = sitofp i32 %z to float
+  %sub = fsub float %add1, %conv
+  %dec = add nsw i32 %z, -1
+  %call = tail call float @_Z3goof(float %inc) #2
+  %call2 = tail call float @_Z3goof(float %sub) #2
+  %add3 = fadd float %call, %call2
+  %mul = fmul float %inc, %add3
+  %add4 = fadd float %sub, %mul
+  %conv5 = sitofp i32 %dec to float
+  %sub6 = fsub float %add4, %conv5
+  ret float %sub6
+}
+
+; CHECK-LABEL: __Z3zoov:
+; CHECK:      stp     x29, x30, [sp, #-16]!
+; CHECK:      bl      __Z3hoo
+; CHECK:      ldp     x29, x30, [sp], #16
+; CHECK-NEXT: ret
+
+; CHECK-SAVELR-LABEL: _Z3zoov:
+; CHECK-SAVELR:      stp     x29, x30, [sp, #-16]!
+; CHECK-SAVELR:      bl      _Z3hoo
+; CHECK-SAVELR:      ldp     x29, x30, [sp], #16
+; CHECK-SAVELR-NEXT: ret
+
+define i32 @_Z3zoov() nounwind ssp optsize {
+  %1 = tail call i32 @_Z3hoov() #2
+  %2 = add nsw i32 %1, 1
+  ret i32 %2
+}
+
+
+declare float @_Z3goof(float) nounwind ssp optsize
+declare i32 @_Z3hoov() nounwind ssp optsize
diff --git a/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog.ll b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-homogeneous-prolog-epilog.ll
@@ -0,0 +1,58 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -homogeneous-prolog-epilog | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu  -homogeneous-prolog-epilog | FileCheck %s --check-prefixes=CHECK-SAVELR
+
+; CHECK-LABEL: __Z3hooii:
+; CHECK:      stp     x29, x30, [sp, #-16]!
+; CHECK-NEXT: bl      _OUTLINED_FUNCTION_PROLOG_x19x20x21x22
+; CHECK:      bl      __Z3gooi
+; CHECK:      bl      __Z3gooi
+; CHECK:      bl      _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22
+; CHECK-NEXT: b __Z3gooi
+
+; CHECK-SAVELR-LABEL: _Z3hooii:
+; CHECK-SAVELR:      stp     x29, x30, [sp, #-16]!
+; CHECK-SAVELR-NEXT: bl      OUTLINED_FUNCTION_PROLOG_x19x20x21x22x30x29
+; CHECK-SAVELR:      bl      _Z3gooi
+; CHECK-SAVELR:      bl      _Z3gooi
+; CHECK-SAVELR:      bl      OUTLINED_FUNCTION_EPILOG_x19x20x21x22x30x29
+; CHECK-SAVELR-NEXT: b _Z3gooi
+
+define i32 @_Z3hooii(i32 %b, i32 %a) nounwind ssp optsize {
+  %1 = tail call i32 @_Z3gooi(i32 %b)
+  %2 = tail call i32 @_Z3gooi(i32 %a)
+  %3 = add i32 %a, %b
+  %4 = add i32 %3, %1
+  %5 = add i32 %4, %2
+  %6 = tail call i32 @_Z3gooi(i32 %5)
+  ret i32 %6
+}
+
+declare i32 @_Z3gooi(i32);
+
+
+; CHECK-LABEL: _OUTLINED_FUNCTION_PROLOG_x19x20x21x22:
+; CHECK:      stp     x20, x19, [sp, #-16]!
+; CHECK-NEXT: stp     x22, x21, [sp, #-16]!
+; CHECK-NEXT: ret
+
+; CHECK-LABEL: _OUTLINED_FUNCTION_EPILOG_x30x29x19x20x21x22:
+; CHECK:      mov     x16, x30
+; CHECK-NEXT: ldp     x22, x21, [sp], #16
+; CHECK-NEXT: ldp     x20, x19, [sp], #16
+; CHECK-NEXT: ldp     x29, x30, [sp], #16
+; CHECK-NEXT: br      x16
+
+; CHECK-SAVELR-LABEL: OUTLINED_FUNCTION_PROLOG_x19x20x21x22x30x29:
+; CHECK-SAVELR:      mov     x16, x30
+; CHECK-SAVELR-NEXT: ldp     x29, x30, [sp], #16
+; CHECK-SAVELR-NEXT: stp     x20, x19, [sp, #-16]!
+; CHECK-SAVELR-NEXT: stp     x22, x21, [sp, #-16]!
+; CHECK-SAVELR-NEXT: stp     x29, x30, [sp, #-16]!
+; CHECK-SAVELR-NEXT: br      x16
+
+; CHECK-SAVELR-LABEL: OUTLINED_FUNCTION_EPILOG_x19x20x21x22x30x29:
+; CHECK-SAVELR:      mov     x16, x30
+; CHECK-SAVELR-NEXT: ldp     x29, x30, [sp], #16
+; CHECK-SAVELR-NEXT: ldp     x22, x21, [sp], #16
+; CHECK-SAVELR-NEXT: ldp     x20, x19, [sp], #16
+; CHECK-SAVELR-NEXT: br      x16