Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -196,10 +196,19 @@
   /// \brief Estimate the cost of a GEP operation when lowered.
   ///
   /// This user-based overload adds the ability to check if the GEP can be
-  /// folded into its users.
+  /// folded into all of its users.
   int getGEPCost(const GEPOperator *GEP,
                  ArrayRef<const Value *> Operands) const;
 
+  /// \brief Estimate the cost of a GEP operation when lowered.
+  ///
+  /// Unlike the overload above, which considers all users of the GEP, this
+  /// overload only checks whether the GEP can be folded into the users listed
+  /// in \p Users (e.g., only the users located inside a given loop).
+  int getGEPCost(const GEPOperator *GEP,
+                 ArrayRef<const Value *> Operands,
+                 ArrayRef<const User *> Users) const;
+
   /// \brief Estimate the cost of a EXT operation when lowered.
   ///
   /// The contract for this function is the same as \c getOperationCost except
@@ -941,6 +950,9 @@
                          ArrayRef<const Value *> Operands) = 0;
   virtual int getGEPCost(const GEPOperator *GEP,
                          ArrayRef<const Value *> Operands) = 0;
+  virtual int getGEPCost(const GEPOperator *GEP,
+                         ArrayRef<const Value *> Operands,
+                         ArrayRef<const User *> Users) = 0;
   virtual int getExtCost(const Instruction *I, const Value *Src) = 0;
   virtual int getCallCost(FunctionType *FTy, int NumArgs) = 0;
   virtual int getCallCost(const Function *F, int NumArgs) = 0;
@@ -1126,6 +1138,11 @@
                  ArrayRef<const Value *> Operands) override {
     return Impl.getGEPCost(GEP, Operands);
   }
+  int getGEPCost(const GEPOperator *GEP,
+                 ArrayRef<const Value *> Operands,
+                 ArrayRef<const User *> Users) override {
+    return Impl.getGEPCost(GEP, Operands, Users);
+  }
   int getExtCost(const Instruction *I, const Value *Src) override {
     return Impl.getExtCost(I, Src);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -727,6 +727,13 @@
   }
 
   int getGEPCost(const GEPOperator *GEP, ArrayRef<const Value *> Operands) {
+    // Without an explicit user list, check foldability against all users.
+    SmallVector<const User *, 8> Users(GEP->user_begin(), GEP->user_end());
+    return getGEPCost(GEP, Operands, Users);
+  }
+
+  int getGEPCost(const GEPOperator *GEP, ArrayRef<const Value *> Operands,
+                 ArrayRef<const User *>Users) {
     if (!isa<Instruction>(GEP))
       return TTI::TCC_Basic;
 
@@ -741,7 +748,7 @@
       // load/store instructions together with other instructions (e.g., other
       // GEPs). Handling all such cases must be expensive to be performed
       // in this function, so we stay conservative for now.
-      for (const User *U : GEP->users()) {
+      for (const User *U : Users) {
         const Operator *UOP = cast<Operator>(U);
         const Value *PointerOperand = nullptr;
         if (auto *LI = dyn_cast<LoadInst>(UOP))
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -194,6 +194,11 @@
     return BaseT::getGEPCost(GEP, Operands);
   }
 
+  int getGEPCost(const GEPOperator *GEP, ArrayRef<const Value *> Operands,
+                 ArrayRef<const User *> Users) {
+    return BaseT::getGEPCost(GEP, Operands, Users);
+  }
+
   int getExtCost(const Instruction *I, const Value *Src) {
     if (getTLI()->isExtFree(I))
       return TargetTransformInfo::TCC_Free;
Index: include/llvm/Transforms/Utils/LoopUtils.h
===================================================================
--- include/llvm/Transforms/Utils/LoopUtils.h
+++ include/llvm/Transforms/Utils/LoopUtils.h
@@ -424,8 +424,9 @@
 /// instructions of the loop and loop safety information as
 /// arguments. Diagnostics is emitted via \p ORE. It returns changed status.
 bool sinkRegion(DomTreeNode *, AliasAnalysis *, LoopInfo *, DominatorTree *,
-                TargetLibraryInfo *, Loop *, AliasSetTracker *,
-                LoopSafetyInfo *, OptimizationRemarkEmitter *ORE);
+                TargetLibraryInfo *, TargetTransformInfo *, Loop *,
+                AliasSetTracker *, LoopSafetyInfo *,
+                OptimizationRemarkEmitter *ORE);
 
 /// \brief Walk the specified region of the CFG (defined by all blocks
 /// dominated by the specified block, and that are in the current loop) in depth
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -93,6 +93,12 @@
   return TTIImpl->getGEPCost(GEP, Operands);
 }
 
+int TargetTransformInfo::getGEPCost(const GEPOperator *GEP,
+                                    ArrayRef<const Value *> Operands,
+                                    ArrayRef<const User *> Users) const {
+  return TTIImpl->getGEPCost(GEP, Operands, Users);
+}
+
 int TargetTransformInfo::getExtCost(const Instruction *I,
                                     const Value *Src) const {
   return TTIImpl->getExtCost(I, Src);
Index: lib/Transforms/Scalar/LICM.cpp
===================================================================
--- lib/Transforms/Scalar/LICM.cpp
+++ lib/Transforms/Scalar/LICM.cpp
@@ -88,15 +88,18 @@
              "invariance in loop using invariant start (default = 8)"));
 
 static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
-                            const LoopSafetyInfo *SafetyInfo);
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+                                  const LoopSafetyInfo *SafetyInfo,
+                                  TargetTransformInfo *TTI,
+                                  bool &ContainFoldableUsersInLoop);
 static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
                   const LoopSafetyInfo *SafetyInfo,
                   OptimizationRemarkEmitter *ORE);
 static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
                  const Loop *CurLoop, AliasSetTracker *CurAST,
                  const LoopSafetyInfo *SafetyInfo,
-                 OptimizationRemarkEmitter *ORE);
+                 OptimizationRemarkEmitter *ORE,
+                 bool ContainFoldableUsersInLoop);
 static bool isSafeToExecuteUnconditionally(Instruction &Inst,
                                            const DominatorTree *DT,
                                            const Loop *CurLoop,
@@ -114,8 +117,9 @@
 namespace {
 struct LoopInvariantCodeMotion {
   bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
-                 TargetLibraryInfo *TLI, ScalarEvolution *SE,
-                 OptimizationRemarkEmitter *ORE, bool DeleteAST);
+                 TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
+                 ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
+                 bool DeleteAST);
 
   DenseMap<Loop *, AliasSetTracker *> &getLoopToAliasSetMap() {
     return LoopToAliasSetMap;
@@ -155,6 +159,8 @@
                           &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
                           &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
                           &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+                          &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+                              *L->getHeader()->getParent()),
                           SE ? &SE->getSE() : nullptr, &ORE, false);
   }
 
@@ -164,6 +170,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
     getLoopAnalysisUsage(AU);
   }
 
@@ -204,7 +211,8 @@
                        "cached at a higher level");
 
   LoopInvariantCodeMotion LICM;
-  if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, ORE, true))
+  if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.TTI, &AR.SE, ORE,
+                      true))
     return PreservedAnalyses::all();
 
   auto PA = getLoopPassPreservedAnalyses();
@@ -217,6 +225,7 @@
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false,
                     false)
 
@@ -228,12 +237,10 @@
 /// We should delete AST for inner loops in the new pass manager to avoid
 /// memory leak.
 ///
-bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AliasAnalysis *AA,
-                                        LoopInfo *LI, DominatorTree *DT,
-                                        TargetLibraryInfo *TLI,
-                                        ScalarEvolution *SE,
-                                        OptimizationRemarkEmitter *ORE,
-                                        bool DeleteAST) {
+bool LoopInvariantCodeMotion::runOnLoop(
+    Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
+    TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE,
+    OptimizationRemarkEmitter *ORE, bool DeleteAST) {
   bool Changed = false;
 
   assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
@@ -258,7 +265,7 @@
   // instructions, we perform another pass to hoist them out of the loop.
   //
   if (L->hasDedicatedExits())
-    Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
+    Changed |= sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, TTI, L,
                           CurAST, &SafetyInfo, ORE);
   if (Preheader)
     Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, TLI, L,
@@ -351,7 +358,8 @@
 /// definitions, allowing us to sink a loop body in one pass without iteration.
 ///
 bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
-                      DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
+                      DominatorTree *DT, TargetLibraryInfo *TLI,
+                      TargetTransformInfo *TTI, Loop *CurLoop,
                       AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo,
                       OptimizationRemarkEmitter *ORE) {
 
@@ -392,10 +400,14 @@
       // outside of the loop.  In this case, it doesn't even matter if the
       // operands of the instruction are loop invariant.
       //
-      if (isNotUsedInLoop(I, CurLoop, SafetyInfo) &&
+      bool ContainFoldableUsersInLoop = false;
+      if (isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI,
+                                ContainFoldableUsersInLoop) &&
           canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, SafetyInfo, ORE)) {
-        ++II;
-        Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE);
+        if (!ContainFoldableUsersInLoop)
+          ++II;
+        Changed |= sink(I, LI, DT, CurLoop, CurAST, SafetyInfo, ORE,
+                        ContainFoldableUsersInLoop);
       }
     }
   }
@@ -688,13 +700,42 @@
   return true;
 }
 
+/// Return true if the target's cost model reports \p I as TCC_Free. For a GEP
+/// the query is restricted to the users of \p I that are inside \p CurLoop.
+static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
+                         const TargetTransformInfo *TTI) {
+  const auto *GEP = dyn_cast<GetElementPtrInst>(&I);
+  if (!GEP)
+    return TTI->getUserCost(&I) == TargetTransformInfo::TCC_Free;
+
+  // The cost of a GEP depends on which users it is folded into, so ask the
+  // cost model about the in-loop users only rather than about every user.
+  SmallVector<const User *, 8> LoopUsers;
+  for (const User *U : GEP->users())
+    if (CurLoop->contains(cast<Instruction>(U)))
+      LoopUsers.push_back(U);
+
+  // The GEP's index operands, which the cost query takes as its operand list.
+  SmallVector<const Value *, 4> Indices(GEP->idx_begin(), GEP->idx_end());
+
+  return TTI->getGEPCost(cast<GEPOperator>(GEP), Indices, LoopUsers) ==
+         TargetTransformInfo::TCC_Free;
+}
+
 /// Return true if the only users of this instruction are outside of
 /// the loop. If this is true, we can sink the instruction to the exit
 /// blocks of the loop.
 ///
-static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
-                            const LoopSafetyInfo *SafetyInfo) {
+/// We also return true if the instruction is foldable in the loop at isel time
+/// (e.g., a GEP can be folded into a load as an addressing mode in the loop).
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+                                  const LoopSafetyInfo *SafetyInfo,
+                                  TargetTransformInfo *TTI,
+                                  bool &ContainFoldableUsersInLoop) {
   const auto &BlockColors = SafetyInfo->BlockColors;
+
+  bool IsFree = isFreeInLoop(I, CurLoop, TTI);
+
   for (const User *U : I.users()) {
     const Instruction *UI = cast<Instruction>(U);
     if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
@@ -731,8 +772,14 @@
       continue;
     }
 
-    if (CurLoop->contains(UI))
+    if (CurLoop->contains(UI)) {
+      // Check if the instruction is foldable with its user in the loop.
+      if (IsFree) {
+        ContainFoldableUsersInLoop = true;
+        continue;
+      }
       return false;
+    }
   }
   return true;
 }
@@ -806,7 +853,8 @@
 static bool sink(Instruction &I, const LoopInfo *LI, const DominatorTree *DT,
                  const Loop *CurLoop, AliasSetTracker *CurAST,
                  const LoopSafetyInfo *SafetyInfo,
-                 OptimizationRemarkEmitter *ORE) {
+                 OptimizationRemarkEmitter *ORE,
+                 bool ContainFoldableUsersInLoop) {
   DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
   ORE->emit(OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
             << "sinking " << ore::NV("Inst", &I));
@@ -827,13 +875,21 @@
 
   // Clones of this instruction. Don't create more than one per exit block!
   SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+  SmallPtrSet<Instruction *, 2> UsersToBeRemoved;
 
   // If this instruction is only used outside of the loop, then all users are
   // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
   // the instruction.
-  while (!I.use_empty()) {
-    Value::user_iterator UI = I.user_begin();
+  for (Value::user_iterator UI = I.user_begin(), UE = I.user_end(); UI != UE;) {
     auto *User = cast<Instruction>(*UI);
+    Use &U = UI.getUse();
+    ++UI;
+
+    if (CurLoop->contains(User) || UsersToBeRemoved.count(User))
+      continue;
+
     if (!DT->isReachableFromEntry(User->getParent())) {
-      User->replaceUsesOfWith(&I, UndefValue::get(I.getType()));
+      // Replace only the use we already stepped past; replaceUsesOfWith would
+      // also unlink User's other uses of I, possibly the one UI now points to.
+      U = UndefValue::get(I.getType());
       continue;
@@ -844,7 +898,6 @@
     // Surprisingly, instructions can be used outside of loops without any
     // exits.  This can only happen in PHI nodes if the incoming block is
     // unreachable.
-    Use &U = UI.getUse();
     BasicBlock *BB = PN->getIncomingBlock(U);
     if (!DT->isReachableFromEntry(BB)) {
       U = UndefValue::get(I.getType());
@@ -863,12 +916,17 @@
       New = SunkCopies[ExitBlock] =
           CloneInstructionInExitBlock(I, *ExitBlock, *PN, LI, SafetyInfo);
 
+    UsersToBeRemoved.insert(PN);
     PN->replaceAllUsesWith(New);
-    PN->eraseFromParent();
   }
 
-  CurAST->deleteValue(&I);
-  I.eraseFromParent();
+  for (auto *User : UsersToBeRemoved)
+    User->eraseFromParent();
+
+  if (!ContainFoldableUsersInLoop) {
+    CurAST->deleteValue(&I);
+    I.eraseFromParent();
+  }
   return Changed;
 }
 
Index: test/Transforms/LICM/sink-foldable.ll
===================================================================
--- /dev/null
+++ test/Transforms/LICM/sink-foldable.ll
@@ -0,0 +1,149 @@
+; REQUIRES: aarch64-registered-target
+; LICM should sink a GEP to the exit block even if in-loop loads still use it.
+; RUN: opt < %s  -licm -S   | FileCheck %s
+target triple = "aarch64--linux-gnueabi"
+
+; CHECK-LABEL: @test1
+; CHECK-LABEL: loopexit1:
+; CHECK: %[[PHI:.+]] = phi i8** [ %arrayidx0, %if.end ]
+; CHECK: getelementptr inbounds i8*, i8** %[[PHI]], i64 1
+define i8** @test1(i32 %j, i8** readonly %P, i8* readnone %Q) {
+entry:
+  %cmp0 = icmp slt i32 0, %j
+  br i1 %cmp0, label %for.body.lr.ph, label %return
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %P.addr = phi i8** [ %P, %for.body.lr.ph ], [ %arrayidx0, %if.end  ]
+  %i0 = phi i32 [ 0, %for.body.lr.ph ], [ %i.add, %if.end]
+
+  %i0.ext = sext i32 %i0 to i64
+  %arrayidx0 = getelementptr inbounds i8*, i8** %P.addr, i64 %i0.ext
+  %l0 = load i8*, i8** %arrayidx0, align 8
+  %cmp1 = icmp ugt i8* %l0, %Q
+  br i1 %cmp1, label %loopexit0, label %if.end
+
+if.end:                                           ; preds = %for.body
+  %arrayidx1 = getelementptr inbounds i8*, i8** %arrayidx0, i64 1
+  %l1 = load i8*, i8** %arrayidx1, align 8
+  %cmp4 = icmp ugt i8* %l1, %Q
+  %i.add = add nsw i32 %i0, 2
+  br i1 %cmp4, label %loopexit1, label %for.body
+
+loopexit0:
+  %p1 = phi i8** [%arrayidx0, %for.body]
+  br label %return
+
+loopexit1:
+  %p2 = phi i8** [%arrayidx1, %if.end]
+  br label  %return
+
+return:
+  %retval.0 = phi i8** [ %p1, %loopexit0 ], [%p2, %loopexit1], [ null, %entry ]
+  ret i8** %retval.0
+}
+
+; CHECK-LABEL: @test2
+; CHECK-LABEL: loopexit2:
+; CHECK: %[[PHI:.*]] = phi i8** [ %add.ptr, %if.end ]
+; CHECK: getelementptr inbounds i8*, i8** %[[PHI]]
+define i8** @test2(i32 %j, i8** readonly %P, i8* readnone %Q) {
+
+entry:
+  br label %for.body
+
+for.cond:
+  %i.addr.0 = phi i32 [ %add, %if.end ]
+  %P.addr.0 = phi i8** [ %add.ptr, %if.end ]
+  %cmp = icmp slt i32 %i.addr.0, %j
+  br i1 %cmp, label %for.body, label %loopexit0
+
+for.body:
+  %P.addr = phi i8** [ %P, %entry ], [ %P.addr.0, %for.cond ]
+  %i.addr = phi i32 [ 0, %entry ], [ %i.addr.0, %for.cond ]
+
+  %idx.ext = sext i32 %i.addr to i64
+  %add.ptr = getelementptr inbounds i8*, i8** %P.addr, i64 %idx.ext
+  %l0 = load i8*, i8** %add.ptr, align 8
+
+  %cmp1 = icmp ugt i8* %l0, %Q
+  br i1 %cmp1, label %loopexit1, label %if.end
+
+if.end:
+  %add.i = add i32 %i.addr, 1
+  %idx2.ext = sext i32 %add.i to i64
+  %arrayidx2 = getelementptr inbounds i8*, i8** %add.ptr, i64 %idx2.ext
+  %l1 = load i8*, i8** %arrayidx2, align 8
+  %cmp2 = icmp ugt i8* %l1, %Q
+  %add = add nsw i32 %add.i, 1
+  br i1 %cmp2, label %loopexit2, label %for.cond
+
+loopexit0:
+  %p0 = phi i8** [ null, %for.cond ]
+  br label %return
+
+loopexit1:
+  %p1 = phi i8** [ %add.ptr, %for.body ]
+  br label %return
+
+loopexit2:
+  %p2 = phi i8** [ %arrayidx2, %if.end ]
+  br label %return
+
+return:
+  %retval.0 = phi i8** [ %p1, %loopexit1 ], [ %p2, %loopexit2 ], [ %p0, %loopexit0 ]
+  ret i8** %retval.0
+}
+
+
+; CHECK-LABEL: @test3
+; CHECK-LABEL: loopexit1:
+; CHECK: %[[ADD:.*]]  = phi i64 [ %add, %if.end ]
+; CHECK: %[[ADDR:.*]] = phi i8** [ %P.addr, %if.end ]
+; CHECK: %[[TRUNC:.*]] = trunc i64 %[[ADD]] to i32
+; CHECK: getelementptr inbounds i8*, i8** %[[ADDR]], i32 %[[TRUNC]]
+; CHECK: call void @dummy(i32 %[[TRUNC]])
+define i8** @test3(i64 %j, i8** readonly %P, i8* readnone %Q) {
+entry:
+  %cmp0 = icmp slt i64 0, %j
+  br i1 %cmp0, label %for.body.lr.ph, label %return
+
+for.body.lr.ph:
+  br label %for.body
+
+for.body:
+  %P.addr = phi i8** [ %P, %for.body.lr.ph ], [ %arrayidx0, %if.end  ]
+  %i0 = phi i32 [ 0, %for.body.lr.ph ], [ %i.add, %if.end]
+
+  %i0.ext = sext i32 %i0 to i64
+  %arrayidx0 = getelementptr inbounds i8*, i8** %P.addr, i64 %i0.ext
+  %l0 = load i8*, i8** %arrayidx0, align 8
+  %cmp1 = icmp ugt i8* %l0, %Q
+  br i1 %cmp1, label %loopexit0, label %if.end
+
+if.end:                                           ; preds = %for.body
+  %add = add i64 %i0.ext, 1
+  %trunc = trunc i64 %add to i32
+  %arrayidx1 = getelementptr inbounds i8*, i8** %P.addr, i32 %trunc
+  %l1 = load i8*, i8** %arrayidx1, align 8
+  %cmp4 = icmp ugt i8* %l1, %Q
+  %i.add = add nsw i32 %i0, 2
+  br i1 %cmp4, label %loopexit1, label %for.body
+
+loopexit0:
+  %p1 = phi i8** [%arrayidx0, %for.body]
+  br label %return
+
+loopexit1:
+  %p2 = phi i8** [%arrayidx1, %if.end]
+  call void @dummy(i32 %trunc)
+  br label  %return
+
+return:
+  %retval.0 = phi i8** [ %p1, %loopexit0 ], [%p2, %loopexit1], [ null, %entry ]
+  ret i8** %retval.0
+}
+
+declare void @dummy(i32)