Index: include/llvm/InitializePasses.h =================================================================== --- include/llvm/InitializePasses.h +++ include/llvm/InitializePasses.h @@ -252,7 +252,7 @@ void initializeRegisterCoalescerPass(PassRegistry&); void initializeSingleLoopExtractorPass(PassRegistry&); void initializeSinkingPass(PassRegistry&); -void initializeSeparateConstOffsetFromGEPPass(PassRegistry &); +void initializeReassociateGEPsPass(PassRegistry &); void initializeSlotIndexesPass(PassRegistry&); void initializeSpillPlacementPass(PassRegistry&); void initializeStackProtectorPass(PassRegistry&); Index: include/llvm/LinkAllPasses.h =================================================================== --- include/llvm/LinkAllPasses.h +++ include/llvm/LinkAllPasses.h @@ -169,7 +169,7 @@ (void) llvm::createBBVectorizePass(); (void) llvm::createPartiallyInlineLibCallsPass(); (void) llvm::createScalarizerPass(); - (void) llvm::createSeparateConstOffsetFromGEPPass(); + (void)llvm::createReassociateGEPsPass(); (void) llvm::createRewriteSymbolsPass(); (void) llvm::createStraightLineStrengthReducePass(); (void) llvm::createMemDerefPrinter(); Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -415,11 +415,10 @@ //===----------------------------------------------------------------------===// // -// SeparateConstOffsetFromGEP - Split GEPs for better CSE +// ReassociateGEPs - Split and reassociate GEPs for better CSE and LICM. // -FunctionPass * -createSeparateConstOffsetFromGEPPass(const TargetMachine *TM = nullptr, - bool LowerGEP = false); +FunctionPass *createReassociateGEPsPass(const TargetMachine *TM = nullptr, + bool LowerGEP = false); //===----------------------------------------------------------------------===// // Index: lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- lib/Target/AArch64/AArch64TargetMachine.cpp +++ lib/Target/AArch64/AArch64TargetMachine.cpp @@ -228,10 +228,10 @@ TargetPassConfig::addIRPasses(); if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { - // Call SeparateConstOffsetFromGEP pass to extract constants within indices + // Call ReassociateGEPs pass to extract constants within indices // and lower a GEP with multiple indices to either arithmetic operations or // multiple GEPs with single index. - addPass(createSeparateConstOffsetFromGEPPass(TM, true)); + addPass(createReassociateGEPsPass(TM, true)); // Call EarlyCSE pass to find and remove subexpressions in the lowered // result. addPass(createEarlyCSEPass()); Index: lib/Target/NVPTX/NVPTXTargetMachine.cpp =================================================================== --- lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -165,22 +165,23 @@ addPass(createGenericToNVVMPass()); addPass(createNVPTXFavorNonGenericAddrSpacesPass()); addPass(createStraightLineStrengthReducePass()); - addPass(createSeparateConstOffsetFromGEPPass()); - // The SeparateConstOffsetFromGEP pass creates variadic bases that can be used - // by multiple GEPs. Run GVN or EarlyCSE to really reuse them. GVN generates - // significantly better code than EarlyCSE for some of our benchmarks. + addPass(createReassociateGEPsPass()); + // The ReassociateGEPs pass creates variadic bases that can be used + // by multiple GEPs. Run GVN or EarlyCSE to really reuse them. 
GVN + // generates significantly better code than EarlyCSE for some of our + // benchmarks. if (getOptLevel() == CodeGenOpt::Aggressive) addPass(createGVNPass()); else addPass(createEarlyCSEPass()); - // Both FavorNonGenericAddrSpaces and SeparateConstOffsetFromGEP may leave + // Both FavorNonGenericAddrSpaces and ReassociateGEPs may leave // some dead code. We could remove dead code in an ad-hoc manner, but that // requires manual work and might be error-prone. // // The FavorNonGenericAddrSpaces pass shortcuts unnecessary addrspacecasts, // and leave them unused. // - // SeparateConstOffsetFromGEP rebuilds a new index from the old index, and the + // ReassociateGEPs rebuilds a new index from the old index, and the // old index and some of its intermediate results may become unused. addPass(createDeadCodeEliminationPass()); } Index: lib/Target/PowerPC/PPCTargetMachine.cpp =================================================================== --- lib/Target/PowerPC/PPCTargetMachine.cpp +++ lib/Target/PowerPC/PPCTargetMachine.cpp @@ -263,10 +263,10 @@ addPass(createPPCLoopDataPrefetchPass()); if (TM->getOptLevel() == CodeGenOpt::Aggressive && EnableGEPOpt) { - // Call SeparateConstOffsetFromGEP pass to extract constants within indices + // Call ReassociateGEPs pass to extract constants within indices // and lower a GEP with multiple indices to either arithmetic operations or // multiple GEPs with single index. - addPass(createSeparateConstOffsetFromGEPPass(TM, true)); + addPass(createReassociateGEPsPass(TM, true)); // Call EarlyCSE pass to find and remove subexpressions in the lowered // result. addPass(createEarlyCSEPass()); Index: lib/Transforms/Scalar/CMakeLists.txt =================================================================== --- lib/Transforms/Scalar/CMakeLists.txt +++ lib/Transforms/Scalar/CMakeLists.txt @@ -33,6 +33,7 @@ PartiallyInlineLibCalls.cpp PlaceSafepoints.cpp Reassociate.cpp + ReassociateGEPs.cpp Reg2Mem.cpp RewriteStatepointsForGC.cpp SCCP.cpp @@ -41,7 +42,6 @@ Scalar.cpp ScalarReplAggregates.cpp Scalarizer.cpp - SeparateConstOffsetFromGEP.cpp SimplifyCFGPass.cpp Sink.cpp StraightLineStrengthReduce.cpp Index: lib/Transforms/Scalar/ReassociateGEPs.cpp =================================================================== --- lib/Transforms/Scalar/ReassociateGEPs.cpp +++ lib/Transforms/Scalar/ReassociateGEPs.cpp @@ -1,4 +1,4 @@ -//===-- SeparateConstOffsetFromGEP.cpp - ------------------------*- C++ -*-===// +//===-- ReassociateGEPs.cpp - ------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -7,6 +7,20 @@ // //===----------------------------------------------------------------------===// // +// This pass reassociates the operands of GEP instructions and then +// splits the GEPs to enable later optimizations such as loop +// invariant code motion, constant folding, and common subexpression +// elimination. Specifically, this pass performs the following +// transformations: +// +// (1) Reassociate and gather constant offsets in GEP sequential index operands. +// (2) Reassociate loop invariant terms in GEP sequential index operands. +// (3) (Optional) Lower GEPs into simpler GEPs or their arithmetic equivalents. +// +// Below are the details of the motivation and mechanics of each transformation: +// +// (1) Reassociate and gather constant offsets +// // Loop unrolling may create many similar GEPs for array accesses. 
// e.g., a 2-level loop // @@ -79,10 +93,36 @@ // ld.global.f32 %f3, [%rl6+128]; // much better // ld.global.f32 %f4, [%rl6+132]; // much better // -// Another improvement enabled by the LowerGEP flag is to lower a GEP with -// multiple indices to either multiple GEPs with a single index or arithmetic -// operations (depending on whether the target uses alias analysis in codegen). -// Such transformation can have following benefits: +// (2) Reassociate loop invariant terms +// +// If the base pointer of a GEP is loop invariant and some values in +// GEP index expressions are also loop invariant it may be possible to +// split the GEP into two GEPs, one of which is loop invariant. +// Example: +// +// loop: +// %sum = add %a, %invariant_b +// %idx = add %sum, %c +// %p = gep %invariant_base, %idx +// +// Where %invariant_b and %invariant_base are loop invariant. This +// GEP can be split into two GEPs: +// +// loop: +// %tmp = gep %invariant_base, %invariant_b +// %idx = add %a, %c +// %p = gep %tmp, %idx +// +// The first GEP (%tmp) is now invariant and can be hoisted out of the +// loop. GEPs with multiple sequential index operands might be split +// more than once. +// +// (3) Lower GEPs +// +// If the LowerGEP option is set GEPs with multiple indices are +// lowered to either multiple GEPs with a single index or arithmetic +// operations (depending on whether the target uses alias analysis in +// codegen). Such transformation can have following benefits: // (1) It can always extract constants in the indices of structure type. // (2) After such Lowering, there are more optimization opportunities such as // CSE, LICM and CGP. @@ -145,17 +185,16 @@ // load %p2 // ... // -// Lowering GEPs can also benefit other passes such as LICM and CGP. -// LICM (Loop Invariant Code Motion) can not hoist/sink a GEP of multiple -// indices if one of the index is variant. If we lower such GEP into invariant -// parts and variant parts, LICM can hoist/sink those invariant parts. -// CGP (CodeGen Prepare) tries to sink address calculations that match the -// target's addressing modes. A GEP with multiple indices may not match and will -// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of -// them. So we end up with a better addressing mode. +// Lowering GEPs can also benefit other passes such as CGP. CGP +// (CodeGen Prepare) tries to sink address calculations that match the +// target's addressing modes. A GEP with multiple indices may not +// match and will not be sunk. If we lower such GEP into smaller +// parts, CGP may sink some of them. So we end up with a better +// addressing mode. // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -173,79 +212,217 @@ using namespace llvm; -static cl::opt DisableSeparateConstOffsetFromGEP( - "disable-separate-const-offset-from-gep", cl::init(false), - cl::desc("Do not separate the constant offset from a GEP instruction"), - cl::Hidden); +static cl::opt + DisableReassociateGEPs("disable-reassociate-geps", cl::init(false), + cl::desc("Do not reassociate GEP instructions"), + cl::Hidden); namespace { -/// \brief A helper class for separating a constant offset from a GEP index. +/// \brief Element in the def-use chain used for identifying and +/// extracting hoistable values from an expression. +struct UserChainElement { + // The value in the def-use chain. 
+ Value *value; + // For values with more than one operand, this is the operand number + // for this link in the chain. + unsigned operandNo; +}; + +/// \brief A helper class for finding values in an expression which +/// may be hoisted to the outermost level of the expression. /// -/// In real programs, a GEP index may be more complicated than a simple addition -/// of something and a constant integer which can be trivially splitted. For -/// example, to split ((a << 3) | 5) + b, we need to search deeper for the -/// constant offset, so that we can separate the index to (a << 3) + b and 5. +/// When reassociating a GEP index expression for LICM or for +/// extracting constant offsets we want to find a value within the +/// expression which may be safely hoisted to the outermost level of +/// the expression as an addition. For example, the constant 5 can be +/// hoisted out of the expression ((a << 3) | 5) + b to form a new +/// equivalent expression ((a << 3) + b) + 5. /// -/// Therefore, this class looks into the expression that computes a given GEP -/// index, and tries to find a constant integer that can be hoisted to the -/// outermost level of the expression as an addition. Not every constant in an -/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a + -/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case, -/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15). -class ConstantOffsetExtractor { - public: - /// Extracts a constant offset from the given GEP index. It returns the - /// new index representing the remainder (equal to the original index minus - /// the constant offset), or nullptr if we cannot extract a constant offset. - /// \p Idx The given GEP index - /// \p GEP The given GEP - static Value *Extract(Value *Idx, GetElementPtrInst *GEP); - /// Looks for a constant offset from the given GEP index without extracting - /// it. It returns the numeric value of the extracted constant offset (0 if - /// failed). The meaning of the arguments are the same as Extract. - static int64_t Find(Value *Idx, GetElementPtrInst *GEP); - - private: - ConstantOffsetExtractor(Instruction *InsertionPt) : IP(InsertionPt) {} - /// Searches the expression that computes V for a non-zero constant C s.t. - /// V can be reassociated into the form V' + C. If the searching is - /// successful, returns C and update UserChain as a def-use chain from C to V; - /// otherwise, UserChain is empty. +/// This abstract class includes logic to selectively visit values +/// within an expression which are safe to hoist. Subclasses implement +/// the visit method to select among these hoistable values based on +/// subclass-specific objectives such as loop invariance or whether +/// the value is a constant. +class HoistableValueFinder { +public: + HoistableValueFinder(const DataLayout &DLayout) : DL(DLayout) {} + virtual ~HoistableValueFinder() {} + /// Walks the expression tree of Expression visiting each hoistable value. + /// Returns true if a hoistable value is found as indicated by the + /// visit method. The resulting UserChain vector may be fed to + /// ValueExtractor::Extract to extract the hoistable value from the + /// expression. + /// + /// \p Expression The given expression + /// \p NonNegative Whether the expression is guaranteed to be nonnegative + /// This may uncover more hoistable values. 
+ /// \p UserChain If a hoistable value is found, this is filled with + /// the def-use chain of the hoistable value from the + /// hoistable value up to the full expression. + bool find(Value *Expression, bool NonNegative, + SmallVectorImpl &UserChain); + +protected: + /// This method is called with each hoistable value found during + /// the walk of the expression tree performed in find(). This method + /// should return true if the hoistable value is what the subclass + /// is looking for (for example, a constant value for + /// ConstantOffsetFinder). ContinueSearch should be set to true or + /// false depending on whether the search should continue beneath + /// this value. + virtual bool visit(Value *V, bool &ContinueSearch) = 0; + +private: + /// Searches the expression that computes V for a value W s.t. V + /// can be reassociated into the form V' + W where V' is V with W + /// removed. If the searching is successful, returns true and + /// updates UserChain as a def-use chain from W to V; otherwise, + /// UserChain is empty. /// /// \p V The given expression /// \p SignExtended Whether V will be sign-extended in the computation of the - /// GEP index + /// top-level expression /// \p ZeroExtended Whether V will be zero-extended in the computation of the - /// GEP index + /// top-level expression /// \p NonNegative Whether V is guaranteed to be non-negative. For example, /// an index of an inbounds GEP is guaranteed to be - /// non-negative. Levaraging this, we can better split + /// non-negative. Leveraging this, we can better split /// inbounds GEPs. - APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative); + bool findInSubexpression(Value *V, bool SignExtended, bool ZeroExtended, + bool NonNegative, + SmallVectorImpl &UserChain); /// A helper function to look into both operands of a binary operator. - APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended, - bool ZeroExtended); - /// After finding the constant offset C from the GEP index I, we build a new - /// index I' s.t. I' + C = I. This function builds and returns the new - /// index I' according to UserChain produced by function "find". + bool findInEitherOperand(BinaryOperator *BO, bool SignExtended, + bool ZeroExtended, unsigned &Operand, + SmallVectorImpl &UserChain); + /// Returns true if LHS and RHS have no bits in common, i.e., LHS | RHS == 0. + bool noCommonBits(Value *LHS, Value *RHS) const; + /// Computes which bits are known to be one or zero. + /// \p KnownOne Mask of all bits that are known to be one. + /// \p KnownZero Mask of all bits that are known to be zero. + void computeKnownBits(Value *V, APInt &KnownOne, APInt &KnownZero) const; + /// A helper function that returns whether we can trace into the operands + /// of binary operator BO for a constant offset. + /// + /// \p SignExtended Whether BO is surrounded by sext + /// \p ZeroExtended Whether BO is surrounded by zext + /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound + /// array index. + bool canTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, + bool NonNegative); + + const DataLayout &DL; +}; + +// \brief A subclass of HoistableValueFinder for finding hoistable +// constants within an expression. +class ConstantOffsetFinder : HoistableValueFinder { +public: + /// Looks for a constant offset within the given GEP index + /// expression. It returns the numeric value of the extracted + /// constant offset (0 if failed). 
If a constant is found, + /// UserChain is set to the def-use chain for this value which can + /// then be fed to ValueExtractor::Extract to extract the constant + /// from the expression. + static int64_t FindConstOffset(GetElementPtrInst *GEP, unsigned OperandIdx, + SmallVectorImpl &UserChain); + +protected: + virtual bool visit(Value *V, bool &ContinueSearch); + +private: + ConstantOffsetFinder(const DataLayout &DLayout) + : HoistableValueFinder(DLayout), ConstantOffset(1, 0) {} + + APInt ConstantOffset; +}; + +/// \brief A helper class which ranks values according to their loop invariance. +/// +/// The rank of a value is the minimal loop depth that the value can be +/// hoisted up to. Constants and function arguments, for example, have +/// rank 0. +class LICMRanker { +public: + LICMRanker(const LoopInfo *Info) : LI(Info) {} + int getRank(Value *V); + +private: + // Returns true if an instruction cannot be moved. An immovable + // instruction's rank is simply its loop depth. + static bool isImmovableInstruction(Instruction *I); + + const LoopInfo *LI; + // Cache of already computed ranks. + DenseMap RankMap; +}; + +/// \brief A subclass of HoistableValueFinder for finding a loop +/// invariant value within an index expression of a GEP. +class LoopInvariantFinder : HoistableValueFinder { +public: + // Looks for a loop invariant value in the expression for the given + // GEP index. Returns true if a value is found, and UserChain is + // set to the def-use chain of the invariant value. UserChain can + // then be fed to ValueExtractor::Extract. + static bool + FindLoopInvariantValue(GetElementPtrInst *GEP, unsigned OperandIdx, + LICMRanker &Ranker, + SmallVectorImpl &UserChain); + +protected: + virtual bool visit(Value *V, bool &ContinueSearch); + +private: + LoopInvariantFinder(int BR, int MR, LICMRanker &R, const DataLayout &DLayout) + : HoistableValueFinder(DLayout), BaseRank(BR), MaximumRank(MR), + Candidate(nullptr), Ranker(R) {} + // The rank of the expression(s) which the loop invariant value will + // be reassociated with. + int BaseRank; + // The maximum rank of any value to be considered as a profitable candidate. + int MaximumRank; + // Best candidate found so far. + Value *Candidate; + LICMRanker &Ranker; +}; + +/// \brief A helper class for extracting a hoistable value (as defined +/// by HoistableValueFinder) from an expression. +class ValueExtractor { +public: + /// Extracts the value indicated by UserChain from the given + /// expression. UserChain is generated by + /// HoistableValueFinder::Find. Returns the extracted + /// value. Remainder is set to the remaining expression without the + /// extracted value. + static Value *Extract(SmallVectorImpl &UserChain, + Instruction *InsertionPoint, Value *&Remainder); + +private: + ValueExtractor(Instruction *InsertionPoint) : IP(InsertionPoint) {} + /// After finding a hoistable value V within an expression E, we build a new + /// expression E' s.t. E' + V = I. This function builds and returns the new + /// expression E'. /// /// The building conceptually takes two steps: /// 1) iteratively distribute s/zext towards the leaves of the expression tree - /// that computes I - /// 2) reassociate the expression tree to the form I' + C. + /// that computes E + /// 2) reassociate the expression tree to the form E' + V. /// /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute /// sext to a, b and 5 so that we have /// sext(a) + (sext(b) + 5). /// Then, we reassociate it to /// (sext(a) + sext(b)) + 5. 
- /// Given this form, we know I' is sext(a) + sext(b). - Value *rebuildWithoutConstOffset(); - /// After the first step of rebuilding the GEP index without the constant - /// offset, distribute s/zext to the operands of all operators in UserChain. - /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) => - /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))). + /// Given this form, we know E' is sext(a) + sext(b). + Value *rebuildWithoutValue(SmallVectorImpl &UserChain, + Value *&Remainder); + /// After the first step of rebuilding the expression without the + /// hoistable value, distribute s/zext to the operands of all + /// operators in UserChain. e.g., zext(sext(a + (b + 5)) (assuming + /// no overflow) => zext(sext(a)) + (zext(sext(b)) + zext(sext(5))). /// /// The function also updates UserChain to point to new subexpressions after /// distributing s/zext. e.g., the old UserChain of the above example is @@ -257,90 +434,77 @@ /// \p ChainIndex The index to UserChain. ChainIndex is initially /// UserChain.size() - 1, and is decremented during /// the recursion. - Value *distributeExtsAndCloneChain(unsigned ChainIndex); - /// Reassociates the GEP index to the form I' + C and returns I'. - Value *removeConstOffset(unsigned ChainIndex); + Value * + distributeExtsAndCloneChain(SmallVectorImpl &UserChain, + unsigned ChainIndex); + /// Reassociates the expression to the form E' + V and returns E'. + Value *removeHoistableValue(SmallVectorImpl &UserChain, + unsigned ChainIndex); /// A helper function to apply ExtInsts, a list of s/zext, to value V. /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function /// returns "sext i32 (zext i16 V to i32) to i64". Value *applyExts(Value *V); - /// Returns true if LHS and RHS have no bits in common, i.e., LHS | RHS == 0. - bool NoCommonBits(Value *LHS, Value *RHS) const; - /// Computes which bits are known to be one or zero. - /// \p KnownOne Mask of all bits that are known to be one. - /// \p KnownZero Mask of all bits that are known to be zero. - void ComputeKnownBits(Value *V, APInt &KnownOne, APInt &KnownZero) const; - /// A helper function that returns whether we can trace into the operands - /// of binary operator BO for a constant offset. - /// - /// \p SignExtended Whether BO is surrounded by sext - /// \p ZeroExtended Whether BO is surrounded by zext - /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound - /// array index. - bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, - bool NonNegative); - - /// The path from the constant offset to the old GEP index. e.g., if the GEP - /// index is "a * b + (c + 5)". After running function find, UserChain[0] will - /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and - /// UserChain[2] will be the entire expression "a * b + (c + 5)". - /// - /// This path helps to rebuild the new GEP index. - SmallVector UserChain; /// A data structure used in rebuildWithoutConstOffset. Contains all /// sext/zext instructions along UserChain. SmallVector ExtInsts; - Instruction *IP; /// Insertion position of cloned instructions. + Instruction *IP; /// Insertion position of cloned instructions. }; -/// \brief A pass that tries to split every GEP in the function into a variadic -/// base and a constant offset. It is a FunctionPass because searching for the -/// constant offset may inspect other basic blocks. 
-class SeparateConstOffsetFromGEP : public FunctionPass { - public: +/// \brief A pass that tries to split GEP instructions to enable later +/// optimization opportunities. It is a FunctionPass because searching +/// for the GEP index subexpressions may inspect other basic blocks. +class ReassociateGEPs : public FunctionPass { +public: static char ID; - SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr, - bool LowerGEP = false) + ReassociateGEPs(const TargetMachine *TM = nullptr, bool LowerGEP = false) : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) { - initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry()); + initializeReassociateGEPsPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.setPreservesCFG(); } bool runOnFunction(Function &F) override; - private: - /// Tries to split the given GEP into a variadic base and a constant offset, - /// and returns true if the splitting succeeds. - bool splitGEP(GetElementPtrInst *GEP); - /// Lower a GEP with multiple indices into multiple GEPs with a single index. - /// Function splitGEP already split the original GEP into a variadic part and - /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the - /// variadic part into a set of GEPs with a single index and applies - /// AccumulativeByteOffset to it. - /// \p Variadic The variadic part of the original GEP. - /// \p AccumulativeByteOffset The constant offset. - void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic, - int64_t AccumulativeByteOffset); - /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form. - /// Function splitGEP already split the original GEP into a variadic part and - /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the - /// variadic part into a set of arithmetic operations and applies - /// AccumulativeByteOffset to it. - /// \p Variadic The variadic part of the original GEP. - /// \p AccumulativeByteOffset The constant offset. - void lowerToArithmetics(GetElementPtrInst *Variadic, - int64_t AccumulativeByteOffset); +private: + /// Tries to split the given GEP in two different ways. First, we + /// try to split the given GEP into a variadic base and a constant + /// offset. Second, we try to split the GEP into a loop invariant + /// and loop variant GEP. Returns true if the IR is changed. + bool splitGEP(GetElementPtrInst *GEP, LICMRanker &Ranker); + /// Performs the same transformation as splitGEP but also lowers a + /// GEP into either multiple single-index GEPs or arithmetic + /// operands. Unlike in splitGEP the initial splitting for constant + /// aggregation also considers indices of structure type. Returns + /// true if the IR is changed. + bool splitAndLowerGEP(GetElementPtrInst *GEP, LICMRanker &Ranker); + /// Removes hoistable constants from the index operand expressions of a GEP. + void stripConstantsFromGEP(GetElementPtrInst *GEP); + /// Tries to split the GEP to enable loop invariant code motion. + /// Ranker is used to rank values based on loop invariance. Upon + /// return, AddedGEPs includes all GEPs created by splitting. The + /// original GEP may be split more than once. Returns true if the IR + /// is changed. + bool splitGEPForLICM(GetElementPtrInst *GEP, LICMRanker &Ranker, + SmallVectorImpl &AddedGEPs); + /// Lower a GEP with multiple indices into multiple GEPs with a + /// single index and adds in Offset (in bytes). 
+ void lowerToSingleIndexGEPs(GetElementPtrInst *GEP, int64_t Offset); + /// Lowers a GEP into ptrtoint+arithmetics+inttoptr form and adds in + /// Offset (in bytes). + void lowerToArithmetics(GetElementPtrInst *GEP, int64_t Offset); /// Finds the constant offset within each index and accumulates them. If /// LowerGEP is true, it finds in indices of both sequential and structure /// types, otherwise it only finds in sequential indices. The output /// NeedsExtraction indicates whether we successfully find a non-zero constant /// offset. - int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction); + int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction, + bool accumulateFieldOffsets); /// Canonicalize array indices to pointer-size integers. This helps to /// simplify the logic of splitting a GEP. For example, if a + b is a /// pointer-size integer, we have @@ -363,29 +527,25 @@ /// multiple GEPs with a single index. bool LowerGEP; }; -} // anonymous namespace +} // anonymous namespace -char SeparateConstOffsetFromGEP::ID = 0; -INITIALIZE_PASS_BEGIN( - SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", - "Split GEPs to a variadic base and a constant offset for better CSE", false, - false) +char ReassociateGEPs::ID = 0; +INITIALIZE_PASS_BEGIN(ReassociateGEPs, "reassociate-geps", + "Reassociate and split GEPs for better LICM and CSE", + false, false) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END( - SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", - "Split GEPs to a variadic base and a constant offset for better CSE", false, - false) - -FunctionPass * -llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM, - bool LowerGEP) { - return new SeparateConstOffsetFromGEP(TM, LowerGEP); +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(ReassociateGEPs, "reassociate-geps", + "Reassociate and split GEPs for better LICM and CSE", false, + false) + +FunctionPass *llvm::createReassociateGEPsPass(const TargetMachine *TM, + bool LowerGEP) { + return new ReassociateGEPs(TM, LowerGEP); } -bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, - bool ZeroExtended, - BinaryOperator *BO, - bool NonNegative) { +bool HoistableValueFinder::canTraceInto(bool SignExtended, bool ZeroExtended, + BinaryOperator *BO, bool NonNegative) { // We only consider ADD, SUB and OR, because a non-zero constant found in // expressions composed of these operations can be easily hoisted as a // constant offset by reassociation. @@ -398,7 +558,7 @@ Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1); // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS). - if (BO->getOpcode() == Instruction::Or && !NoCommonBits(LHS, RHS)) + if (BO->getOpcode() == Instruction::Or && !noCommonBits(LHS, RHS)) return false; // In addition, tracing into BO requires that its surrounding s/zext (if @@ -444,70 +604,238 @@ return true; } -APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO, - bool SignExtended, - bool ZeroExtended) { - // BO being non-negative does not shed light on whether its operands are - // non-negative. Clear the NonNegative flag here. - APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended, - /* NonNegative */ false); - // If we found a constant offset in the left operand, stop and return that. 
- // This shortcut might cause us to miss opportunities of combining the - // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9. - // However, such cases are probably already handled by -instcombine, - // given this pass runs after the standard optimizations. - if (ConstantOffset != 0) return ConstantOffset; - ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended, - /* NonNegative */ false); - // If U is a sub operator, negate the constant offset found in the right - // operand. - if (BO->getOpcode() == Instruction::Sub) - ConstantOffset = -ConstantOffset; - return ConstantOffset; -} - -APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, - bool ZeroExtended, bool NonNegative) { - // TODO(jingyue): We could trace into integer/pointer casts, such as - // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only - // integers because it gives good enough results for our benchmarks. - unsigned BitWidth = cast(V->getType())->getBitWidth(); - - // We cannot do much with Values that are not a User, such as an Argument. - User *U = dyn_cast(V); - if (U == nullptr) return APInt(BitWidth, 0); +bool HoistableValueFinder::findInEitherOperand( + BinaryOperator *BO, bool SignExtended, bool ZeroExtended, unsigned &Operand, + SmallVectorImpl &UserChain) { + bool Found = false; + for (unsigned Op = 0; Op < 2; ++Op) { + // BO being non-negative does not shed light on whether its operands are + // non-negative. Clear the NonNegative flag here. + if (findInSubexpression(BO->getOperand(Op), SignExtended, ZeroExtended, + /* NonNegative */ false, UserChain)) { + Found = true; + Operand = Op; + } + } + return Found; +} + +bool HoistableValueFinder::findInSubexpression( + Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative, + SmallVectorImpl &UserChain) { + bool ContinueSearch; + bool Found = visit(V, ContinueSearch); + if (Found) + UserChain.clear(); + + unsigned Operand = 0; + if (ContinueSearch) { + // We cannot do much with Values that are not a User, such as an Argument. + if (User *U = dyn_cast(V)) { + if (BinaryOperator *BO = dyn_cast(V)) { + // Trace into subexpressions for more hoisting opportunities. + if (canTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) { + Found |= findInEitherOperand(BO, SignExtended, ZeroExtended, Operand, + UserChain); + } + } else if (isa(V)) { + Found |= findInSubexpression(U->getOperand(0), /* SignExtended */ true, + ZeroExtended, NonNegative, UserChain); + } else if (isa(V)) { + // As an optimization, we can clear the SignExtended flag because + // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll. + // + // Clear the NonNegative flag, because zext(a) >= 0 does not imply + // a >= 0. 
+ Found |= findInSubexpression(U->getOperand(0), /* SignExtended */ false, + /* ZeroExtended */ true, + /* NonNegative */ false, UserChain); + } + } + } + + if (Found) + UserChain.push_back({V, Operand}); + + return Found; +} + +bool HoistableValueFinder::find(Value *Expression, bool NonNegative, + SmallVectorImpl &UserChain) { + UserChain.clear(); + return (findInSubexpression(Expression, /* SignExtended */ false, + /* ZeroExtended */ false, NonNegative, + UserChain)); +} + +void HoistableValueFinder::computeKnownBits(Value *V, APInt &KnownOne, + APInt &KnownZero) const { + IntegerType *IT = cast(V->getType()); + KnownOne = APInt(IT->getBitWidth(), 0); + KnownZero = APInt(IT->getBitWidth(), 0); + llvm::computeKnownBits(V, KnownZero, KnownOne, DL, 0); +} + +bool HoistableValueFinder::noCommonBits(Value *LHS, Value *RHS) const { + assert(LHS->getType() == RHS->getType() && + "LHS and RHS should have the same type"); + APInt LHSKnownOne, LHSKnownZero, RHSKnownOne, RHSKnownZero; + computeKnownBits(LHS, LHSKnownOne, LHSKnownZero); + computeKnownBits(RHS, RHSKnownOne, RHSKnownZero); + return (LHSKnownZero | RHSKnownZero).isAllOnesValue(); +} + +int64_t ConstantOffsetFinder::FindConstOffset( + GetElementPtrInst *GEP, unsigned OperandIdx, + SmallVectorImpl &UserChain) { + ConstantOffsetFinder Finder(GEP->getModule()->getDataLayout()); + // Indices of an inbound GEP are guaranteed to be non-negative. + if (!Finder.find(GEP->getOperand(OperandIdx), GEP->isInBounds(), UserChain)) + return 0; + + // Sign and zero extend the constant as determined by UserChain. + // Also invert the constant if it appears on the RHS of a subtract. + APInt ExtValue = Finder.ConstantOffset; + for (unsigned i = 0; i < UserChain.size(); ++i) { + Value *V = UserChain[i].value; + unsigned BitWidth = cast(V->getType())->getBitWidth(); + if (isa(V)) + ExtValue = ExtValue.sext(BitWidth); + else if (isa(V)) + ExtValue = ExtValue.zext(BitWidth); + else if (BinaryOperator *BO = dyn_cast(V)) { + if ((BO->getOpcode() == Instruction::Sub) && + (UserChain[i].operandNo == 1)) + ExtValue = -ExtValue; + } + } + return ExtValue.getSExtValue(); +} + +bool ConstantOffsetFinder::visit(Value *V, bool &ContinueSearch) { + // Don't keep searching if we've already found a constant. This + // shortcut might cause us to miss opportunities of combining the + // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + // + b) + 9. However, such cases are probably already handled by + // -instcombine, given this pass runs after the standard + // optimizations. + if (ConstantOffset != 0) { + ContinueSearch = false; + return false; + } - APInt ConstantOffset(BitWidth, 0); if (ConstantInt *CI = dyn_cast(V)) { // Hooray, we found it! ConstantOffset = CI->getValue(); - } else if (BinaryOperator *BO = dyn_cast(V)) { - // Trace into subexpressions for more hoisting opportunities. - if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) { - ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended); - } - } else if (isa(V)) { - ConstantOffset = find(U->getOperand(0), /* SignExtended */ true, - ZeroExtended, NonNegative).sext(BitWidth); - } else if (isa(V)) { - // As an optimization, we can clear the SignExtended flag because - // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll. - // - // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0. 
- ConstantOffset = - find(U->getOperand(0), /* SignExtended */ false, - /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth); + if (ConstantOffset != 0) { + ContinueSearch = false; + return true; + } + } + + ContinueSearch = true; + return false; +} + +bool LICMRanker::isImmovableInstruction(Instruction *I) { + switch (I->getOpcode()) { + case Instruction::PHI: + case Instruction::LandingPad: + case Instruction::Alloca: + case Instruction::Load: + case Instruction::Invoke: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Call: + return true; + default: + return false; + } +} + +int LICMRanker::getRank(Value *V) { + Instruction *I = dyn_cast(V); + if (!I) + return 0; + + if (RankMap.find(I) != RankMap.end()) + return RankMap[I]; + + int Rank = 0; + if (Loop *L = LI->getLoopFor(I->getParent())) { + int MaxRank = L->getLoopDepth(); + if (isImmovableInstruction(I)) + Rank = MaxRank; + else { + for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank < MaxRank; + ++i) + Rank = std::max(Rank, getRank(I->getOperand(i))); + Rank = std::min(MaxRank, Rank); + } + } + RankMap[I] = Rank; + return Rank; +} + +bool LoopInvariantFinder::FindLoopInvariantValue( + GetElementPtrInst *GEP, unsigned OperandIdx, LICMRanker &Ranker, + SmallVectorImpl &UserChain) { + int BaseRank = 0; + gep_type_iterator GTI = gep_type_begin(*GEP); + for (unsigned I = 0; I < OperandIdx; ++I, ++GTI) { + if (isa(*GTI)) + BaseRank = std::max(BaseRank, Ranker.getRank(GEP->getOperand(I))); } - // If we found a non-zero constant offset, add it to the path for - // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't - // help this optimization. - if (ConstantOffset != 0) - UserChain.push_back(U); - return ConstantOffset; + Value *GEPOperand = GEP->getOperand(OperandIdx); + // If the operand as a whole is at least as invariant as the base + // then reassociation and splitting at this operand will not enable + // any further LICM. + if (Ranker.getRank(GEPOperand) <= BaseRank) + return false; + + LoopInvariantFinder Finder(BaseRank, Ranker.getRank(GEPOperand) - 1, Ranker, + GEP->getModule()->getDataLayout()); + // Indices of an inbound GEP are guaranteed to be non-negative. + return Finder.find(GEPOperand, GEP->isInBounds(), UserChain); +} + +bool LoopInvariantFinder::visit(Value *V, bool &ContinueSearch) { + // Considerations for candidate values: + // 1) We want the value to be as loop invariant as possible. + // 2) We don't want to break up an expression which is more loop + // invariant (lower rank) than the expression the value will be + // reassociated with (indicated by BaseRank). + // 3) We don't want to duplicate any code unnecessarily so only recurse + // into expressions which are not used elsewhere. + bool Found = false; + if ((Ranker.getRank(V) <= MaximumRank) && + (!Candidate || (Ranker.getRank(V) < Ranker.getRank(Candidate)))) { + Candidate = V; + Found = true; + } + + // To avoid duplicating code during extraction, don't recurse into + // expressions which have more than one use. Also, don't recurse + // into expressions which are already loop invariant (at least as + // loop invariant as BaseRank). + ContinueSearch = true; + if (!V->hasOneUse() || (Ranker.getRank(V) <= BaseRank)) + ContinueSearch = false; + else if (BinaryOperator *BO = dyn_cast(V)) { + // Don't recurse into subtract expressions. 
+ if (BO->getOpcode() == Instruction::Sub) + ContinueSearch = false; + } + + return Found; } -Value *ConstantOffsetExtractor::applyExts(Value *V) { +Value *ValueExtractor::applyExts(Value *V) { Value *Current = V; // ExtInsts is built in the use-def order. Therefore, we apply them to V // in the reversed order. @@ -526,71 +854,79 @@ return Current; } -Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { - distributeExtsAndCloneChain(UserChain.size() - 1); +Value *ValueExtractor::rebuildWithoutValue( + SmallVectorImpl &UserChain, Value *&ExtractedValue) { + assert(UserChain.size() > 0); + ExtInsts.clear(); + distributeExtsAndCloneChain(UserChain, UserChain.size() - 1); + // Remove all nullptrs (used to be s/zext) from UserChain. unsigned NewSize = 0; for (auto I = UserChain.begin(), E = UserChain.end(); I != E; ++I) { - if (*I != nullptr) { + if (I->value != nullptr) { UserChain[NewSize] = *I; NewSize++; } } UserChain.resize(NewSize); - return removeConstOffset(UserChain.size() - 1); + ExtractedValue = UserChain[0].value; + return removeHoistableValue(UserChain, UserChain.size() - 1); } -Value * -ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) { - User *U = UserChain[ChainIndex]; +Value *ValueExtractor::distributeExtsAndCloneChain( + SmallVectorImpl &UserChain, unsigned ChainIndex) { + Value *V = UserChain[ChainIndex].value; if (ChainIndex == 0) { - assert(isa(U)); - // If U is a ConstantInt, applyExts will return a ConstantInt as well. - return UserChain[ChainIndex] = cast(applyExts(U)); + UserChain[ChainIndex] = {applyExts(V), 0}; + return UserChain[ChainIndex].value; } - if (CastInst *Cast = dyn_cast(U)) { + if (CastInst *Cast = dyn_cast(V)) { assert((isa(Cast) || isa(Cast)) && "We only traced into two types of CastInst: sext and zext"); ExtInsts.push_back(Cast); - UserChain[ChainIndex] = nullptr; - return distributeExtsAndCloneChain(ChainIndex - 1); + UserChain[ChainIndex] = {nullptr, 0}; + return distributeExtsAndCloneChain(UserChain, ChainIndex - 1); } - // Function find only trace into BinaryOperator and CastInst. - BinaryOperator *BO = cast(U); - // OpNo = which operand of BO is UserChain[ChainIndex - 1] - unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1); - Value *TheOther = applyExts(BO->getOperand(1 - OpNo)); - Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1); + // Function find only traces into BinaryOperator and CastInst. + BinaryOperator *BO = cast(V); + Value *TheOther = + applyExts(BO->getOperand(1 - UserChain[ChainIndex].operandNo)); + Value *NextInChain = distributeExtsAndCloneChain(UserChain, ChainIndex - 1); BinaryOperator *NewBO = nullptr; - if (OpNo == 0) { + if (UserChain[ChainIndex].operandNo == 0) NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther, BO->getName(), IP); - } else { + else NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain, BO->getName(), IP); - } - return UserChain[ChainIndex] = NewBO; -} -Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { - if (ChainIndex == 0) { - assert(isa(UserChain[ChainIndex])); - return ConstantInt::getNullValue(UserChain[ChainIndex]->getType()); - } + UserChain[ChainIndex] = {NewBO, UserChain[ChainIndex].operandNo}; + return UserChain[ChainIndex].value; +} - BinaryOperator *BO = cast(UserChain[ChainIndex]); - unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 
0 : 1); - assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]); - Value *NextInChain = removeConstOffset(ChainIndex - 1); - Value *TheOther = BO->getOperand(1 - OpNo); +Value *ValueExtractor::removeHoistableValue( + SmallVectorImpl &UserChain, unsigned ChainIndex) { + if (ChainIndex == 0) + return ConstantInt::getNullValue(UserChain[ChainIndex].value->getType()); + + // S/Zext operations have been removed from the chain so only binary + // operators remain. + BinaryOperator *BO = cast(UserChain[ChainIndex].value); + assert(BO->getOperand(UserChain[ChainIndex].operandNo) == + UserChain[ChainIndex - 1].value && + "Invalid operand number in UserChain"); + Value *NextInChain = removeHoistableValue(UserChain, ChainIndex - 1); + Value *TheOther = BO->getOperand(1 - UserChain[ChainIndex].operandNo); // If NextInChain is 0 and not the LHS of a sub, we can simplify the // sub-expression to be just TheOther. if (ConstantInt *CI = dyn_cast(NextInChain)) { - if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0)) + if (CI->isZero() && + !(BO->getOpcode() == Instruction::Sub && + UserChain[ChainIndex].operandNo == 0)) return TheOther; } @@ -608,7 +944,7 @@ // // Replacing the "or" with "add" is fine, because // a | (b + 5) = a + (b + 5) = (a + b) + 5 - if (OpNo == 0) { + if (UserChain[ChainIndex].operandNo == 0) { return BinaryOperator::CreateAdd(NextInChain, TheOther, BO->getName(), IP); } else { @@ -623,7 +959,7 @@ "distributeExtsAndCloneChain clones each BinaryOperator in " "UserChain, so no one should be used more than " "once"); - BO->setOperand(OpNo, NextInChain); + BO->setOperand(UserChain[ChainIndex].operandNo, NextInChain); BO->setHasNoSignedWrap(false); BO->setHasNoUnsignedWrap(false); // Make sure it appears after all instructions we've inserted so far. @@ -631,45 +967,15 @@ return BO; } -Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP) { - ConstantOffsetExtractor Extractor(GEP); - // Find a non-zero constant offset first. - APInt ConstantOffset = - Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, - GEP->isInBounds()); - if (ConstantOffset == 0) - return nullptr; - // Separates the constant offset from the GEP index. - return Extractor.rebuildWithoutConstOffset(); -} - -int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) { - // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative. 
- return ConstantOffsetExtractor(GEP) - .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, - GEP->isInBounds()) - .getSExtValue(); -} - -void ConstantOffsetExtractor::ComputeKnownBits(Value *V, APInt &KnownOne, - APInt &KnownZero) const { - IntegerType *IT = cast(V->getType()); - KnownOne = APInt(IT->getBitWidth(), 0); - KnownZero = APInt(IT->getBitWidth(), 0); - const DataLayout &DL = IP->getModule()->getDataLayout(); - llvm::computeKnownBits(V, KnownZero, KnownOne, DL, 0); -} - -bool ConstantOffsetExtractor::NoCommonBits(Value *LHS, Value *RHS) const { - assert(LHS->getType() == RHS->getType() && - "LHS and RHS should have the same type"); - APInt LHSKnownOne, LHSKnownZero, RHSKnownOne, RHSKnownZero; - ComputeKnownBits(LHS, LHSKnownOne, LHSKnownZero); - ComputeKnownBits(RHS, RHSKnownOne, RHSKnownZero); - return (LHSKnownZero | RHSKnownZero).isAllOnesValue(); +Value *ValueExtractor::Extract(SmallVectorImpl &UserChain, + Instruction *InsertionPoint, Value *&Remainder) { + ValueExtractor Extractor(InsertionPoint); + Value *Extracted; + Remainder = Extractor.rebuildWithoutValue(UserChain, Extracted); + return Extracted; } -bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize( +bool ReassociateGEPs::canonicalizeArrayIndicesToPointerSize( GetElementPtrInst *GEP) { bool Changed = false; const DataLayout &DL = GEP->getModule()->getDataLayout(); @@ -688,27 +994,28 @@ return Changed; } -int64_t -SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, - bool &NeedsExtraction) { +int64_t ReassociateGEPs::accumulateByteOffset(GetElementPtrInst *GEP, + bool &NeedsExtraction, + bool accumulateFieldOffsets) { NeedsExtraction = false; int64_t AccumulativeByteOffset = 0; gep_type_iterator GTI = gep_type_begin(*GEP); const DataLayout &DL = GEP->getModule()->getDataLayout(); + SmallVector UserChain; for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { if (isa(*GTI)) { // Tries to extract a constant offset from this GEP index. int64_t ConstantOffset = - ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP); + ConstantOffsetFinder::FindConstOffset(GEP, I, UserChain); if (ConstantOffset != 0) { NeedsExtraction = true; - // A GEP may have multiple indices. We accumulate the extracted + // A GEP may have multiple indices. We accumulate the extracted // constant offset to a byte offset, and later offset the remainder of // the original GEP with this byte offset. AccumulativeByteOffset += ConstantOffset * DL.getTypeAllocSize(GTI.getIndexedType()); } - } else if (LowerGEP) { + } else if (accumulateFieldOffsets) { StructType *StTy = cast(*GTI); uint64_t Field = cast(GEP->getOperand(I))->getZExtValue(); // Skip field 0 as the offset is always 0. 
@@ -722,24 +1029,24 @@ return AccumulativeByteOffset; } -void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( - GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) { - IRBuilder<> Builder(Variadic); - const DataLayout &DL = Variadic->getModule()->getDataLayout(); - Type *IntPtrTy = DL.getIntPtrType(Variadic->getType()); +void ReassociateGEPs::lowerToSingleIndexGEPs(GetElementPtrInst *GEP, + int64_t Offset) { + IRBuilder<> Builder(GEP); + const DataLayout &DL = GEP->getModule()->getDataLayout(); + Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); Type *I8PtrTy = - Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace()); - Value *ResultPtr = Variadic->getOperand(0); + Builder.getInt8PtrTy(GEP->getType()->getPointerAddressSpace()); + Value *ResultPtr = GEP->getOperand(0); if (ResultPtr->getType() != I8PtrTy) ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); - gep_type_iterator GTI = gep_type_begin(*Variadic); + gep_type_iterator GTI = gep_type_begin(*GEP); // Create an ugly GEP for each sequential index. We don't create GEPs for // structure indices, as they are accumulated in the constant offset index. - for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) { + for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { if (isa(*GTI)) { - Value *Idx = Variadic->getOperand(I); + Value *Idx = GEP->getOperand(I); // Skip zero indices. if (ConstantInt *CI = dyn_cast(Idx)) if (CI->isZero()) @@ -757,37 +1064,38 @@ } } // Create an ugly GEP with a single index for each index. - ResultPtr = Builder.CreateGEP(ResultPtr, Idx, "uglygep"); + ResultPtr = + Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep"); } } // Create a GEP with the constant offset index. - if (AccumulativeByteOffset != 0) { - Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset); - ResultPtr = Builder.CreateGEP(ResultPtr, Offset, "uglygep"); - } - if (ResultPtr->getType() != Variadic->getType()) - ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType()); - - Variadic->replaceAllUsesWith(ResultPtr); - Variadic->eraseFromParent(); -} - -void -SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic, - int64_t AccumulativeByteOffset) { - IRBuilder<> Builder(Variadic); - const DataLayout &DL = Variadic->getModule()->getDataLayout(); - Type *IntPtrTy = DL.getIntPtrType(Variadic->getType()); + if (Offset != 0) { + Value *OffsetValue = ConstantInt::get(IntPtrTy, Offset); + ResultPtr = Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, OffsetValue, + "uglygep"); + } + if (ResultPtr->getType() != GEP->getType()) + ResultPtr = Builder.CreateBitCast(ResultPtr, GEP->getType()); + + GEP->replaceAllUsesWith(ResultPtr); + GEP->eraseFromParent(); +} - Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy); - gep_type_iterator GTI = gep_type_begin(*Variadic); +void ReassociateGEPs::lowerToArithmetics(GetElementPtrInst *GEP, + int64_t Offset) { + IRBuilder<> Builder(GEP); + const DataLayout &DL = GEP->getModule()->getDataLayout(); + Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); + + Value *ResultPtr = Builder.CreatePtrToInt(GEP->getOperand(0), IntPtrTy); + gep_type_iterator GTI = gep_type_begin(*GEP); // Create ADD/SHL/MUL arithmetic operations for each sequential indices. We // don't create arithmetics for structure indices, as they are accumulated // in the constant offset index. 
- for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) { + for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { if (isa(*GTI)) { - Value *Idx = Variadic->getOperand(I); + Value *Idx = GEP->getOperand(I); // Skip zero indices. if (ConstantInt *CI = dyn_cast(Idx)) if (CI->isZero()) @@ -810,51 +1118,91 @@ } // Create an ADD for the constant offset index. - if (AccumulativeByteOffset != 0) { - ResultPtr = Builder.CreateAdd( - ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset)); + if (Offset != 0) { + ResultPtr = + Builder.CreateAdd(ResultPtr, ConstantInt::get(IntPtrTy, Offset)); } - ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType()); - Variadic->replaceAllUsesWith(ResultPtr); - Variadic->eraseFromParent(); + ResultPtr = Builder.CreateIntToPtr(ResultPtr, GEP->getType()); + GEP->replaceAllUsesWith(ResultPtr); + GEP->eraseFromParent(); } -bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { - // Skip vector GEPs. - if (GEP->getType()->isVectorTy()) - return false; +// Split the GEP at the specified index. IdxPart1 and IdxPart2 are +// the two components of the index operand at which the gep is split. +// For example, with: +// %idx = add %a, %b +// = gep %base, 42, %idx, %foo, %bar +// The gep can be split at operand 2 (%idx) with IdxPart1 equal to %a +// and IdxPart2 equals to %b. The resultant code is: +// %invgep = gep %base, 42, %a +// = gep %invgep, %b, %foo, %bar +// The potential advantage is that %invgep may be loop invariant and can +// be hoisted out of loops. +static std::pair splitGEPAtIndex(GetElementPtrInst *GEP, + unsigned OperandIndex, + Value *IdxPart1, + Value *IdxPart2) { + IRBuilder<> Builder(GEP); + SmallVector GEPOperands; + for (unsigned i = 1; i < OperandIndex; ++i) + GEPOperands.push_back(GEP->getOperand(i)); + GEPOperands.push_back(IdxPart1); + Value *InvariantGEP = + Builder.CreateGEP(GEP->getOperand(0), GEPOperands, "invgep"); + + GEPOperands.clear(); + GEPOperands.push_back(IdxPart2); + for (unsigned i = OperandIndex + 1; i < GEP->getNumOperands(); i++) + GEPOperands.push_back(GEP->getOperand(i)); + Value *RemainderGEP = Builder.CreateGEP(InvariantGEP, GEPOperands, "vargep"); + GEP->replaceAllUsesWith(RemainderGEP); + GEP->eraseFromParent(); - // The backend can already nicely handle the case where all indices are - // constant. - if (GEP->hasAllConstantIndices()) - return false; + return std::make_pair(InvariantGEP, RemainderGEP); +} - bool Changed = canonicalizeArrayIndicesToPointerSize(GEP); +bool ReassociateGEPs::splitGEPForLICM( + GetElementPtrInst *GEP, LICMRanker &Ranker, + SmallVectorImpl &AddedGEPs) { + int GEPRank = Ranker.getRank(GEP); + if (GEPRank == 0) + // GEP is loop invariant. Nothing to do here. + return false; - bool NeedsExtraction; - int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction); + gep_type_iterator GTI = gep_type_begin(*GEP); + for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { + if (!isa(*GTI)) + continue; - if (!NeedsExtraction) - return Changed; - // If LowerGEP is disabled, before really splitting the GEP, check whether the - // backend supports the addressing mode we are about to produce. If no, this - // splitting probably won't be beneficial. - // If LowerGEP is enabled, even the extracted constant offset can not match - // the addressing mode, we can still do optimizations to other lowered parts - // of variable indices. Therefore, we don't check for addressing modes in that - // case. 
- if (!LowerGEP) { - TargetTransformInfo &TTI = - getAnalysis().getTTI( - *GEP->getParent()->getParent()); - if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(), - /*BaseGV=*/nullptr, AccumulativeByteOffset, - /*HasBaseReg=*/true, /*Scale=*/0)) { - return Changed; + SmallVector UserChain; + if (LoopInvariantFinder::FindLoopInvariantValue(GEP, I, Ranker, + UserChain)) { + Value *Remainder = nullptr; + Value *Extracted = ValueExtractor::Extract(UserChain, GEP, Remainder); + std::pair SplitGEPs = + splitGEPAtIndex(GEP, I, Extracted, Remainder); + + // Newly created GEPs may be Constant (not GetElementPtrInst). + if (GetElementPtrInst *NewGEP = + dyn_cast(SplitGEPs.first)) + AddedGEPs.push_back(NewGEP); + // Now try to reassociate the second GEP which may include + // opportunities for further reassociation. + if (GetElementPtrInst *NewGEP = + dyn_cast(SplitGEPs.second)) { + if (!splitGEPForLICM(NewGEP, Ranker, AddedGEPs)) + // If the second GEP was not split, then add it to the vector + // of new GEPs. + AddedGEPs.push_back(NewGEP); + } + return true; } } + return false; +} +void ReassociateGEPs::stripConstantsFromGEP(GetElementPtrInst *GEP) { // Remove the constant offset in each sequential index. The resultant GEP // computes the variadic base. // Notice that we don't remove struct field indices here. If LowerGEP is @@ -867,13 +1215,14 @@ if (isa(*GTI)) { // Splits this GEP index into a variadic part and a constant offset, and // uses the variadic part as the new index. - Value *NewIdx = ConstantOffsetExtractor::Extract(GEP->getOperand(I), GEP); - if (NewIdx != nullptr) { - GEP->setOperand(I, NewIdx); + SmallVector UserChain; + if (ConstantOffsetFinder::FindConstOffset(GEP, I, UserChain)) { + Value *Remainder; + ValueExtractor::Extract(UserChain, GEP, Remainder); + GEP->setOperand(I, Remainder); } } } - // Clear the inbounds attribute because the new index may be off-bound. // e.g., // @@ -894,21 +1243,39 @@ // TODO(jingyue): do some range analysis to keep as many inbounds as // possible. GEPs with inbounds are more friendly to alias analysis. GEP->setIsInBounds(false); +} - // Lowers a GEP to either GEPs with a single index or arithmetic operations. - if (LowerGEP) { - // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to - // arithmetic operations if the target uses alias analysis in codegen. - if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA()) - lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset); - else - lowerToArithmetics(GEP, AccumulativeByteOffset); - return true; - } +bool ReassociateGEPs::splitGEP(GetElementPtrInst *GEP, LICMRanker &Ranker) { + bool NeedsExtraction; + int64_t AccumulativeByteOffset = + accumulateByteOffset(GEP, NeedsExtraction, false); + + SmallVector AddedGEPs; + if (!NeedsExtraction) + return splitGEPForLICM(GEP, Ranker, AddedGEPs); + + // If LowerGEP is disabled, before really splitting the GEP, check whether the + // backend supports the addressing mode we are about to produce. If no, this + // splitting probably won't be beneficial. + // If LowerGEP is enabled, even the extracted constant offset can not match + // the addressing mode, we can still do optimizations to other lowered parts + // of variable indices. Therefore, we don't check for addressing modes in that + // case. 
+ TargetTransformInfo &TTI = + getAnalysis().getTTI( + *GEP->getParent()->getParent()); + if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(), + /*BaseGV=*/nullptr, AccumulativeByteOffset, + /*HasBaseReg=*/true, /*Scale=*/0)) + return splitGEPForLICM(GEP, Ranker, AddedGEPs); + + stripConstantsFromGEP(GEP); // No need to create another GEP if the accumulative byte offset is 0. - if (AccumulativeByteOffset == 0) + if (AccumulativeByteOffset == 0) { + splitGEPForLICM(GEP, Ranker, AddedGEPs); return true; + } // Offsets the base with the accumulative byte offset. // @@ -938,7 +1305,7 @@ // %uglygep = gep %0, // %new.gep = bitcast %uglygep to // ... %new.gep ... - Instruction *NewGEP = GEP->clone(); + GetElementPtrInst *NewGEP = cast(GEP->clone()); NewGEP->insertBefore(GEP); // Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned = @@ -948,11 +1315,12 @@ int64_t ElementTypeSizeOfGEP = static_cast( DL.getTypeAllocSize(GEP->getType()->getElementType())); Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); + Instruction *NewDef; if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) { // Very likely. As long as %gep is natually aligned, the byte offset we // extracted should be a multiple of sizeof(*%gep). int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP; - NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP, + NewDef = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP, ConstantInt::get(IntPtrTy, Index, true), GEP->getName(), GEP); } else { @@ -970,35 +1338,81 @@ // sizeof(int64). // // Emit an uglygep in this case. - Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(), - GEP->getPointerAddressSpace()); - NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP); - NewGEP = GetElementPtrInst::Create( - Type::getInt8Ty(GEP->getContext()), NewGEP, + Type *I8PtrTy = + Type::getInt8PtrTy(GEP->getContext(), GEP->getPointerAddressSpace()); + NewDef = new BitCastInst(NewGEP, I8PtrTy, "", GEP); + NewDef = GetElementPtrInst::Create( + Type::getInt8Ty(GEP->getContext()), NewDef, ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep", GEP); if (GEP->getType() != I8PtrTy) - NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP); + NewDef = new BitCastInst(NewDef, GEP->getType(), GEP->getName(), GEP); } - GEP->replaceAllUsesWith(NewGEP); + GEP->replaceAllUsesWith(NewDef); GEP->eraseFromParent(); + splitGEPForLICM(NewGEP, Ranker, AddedGEPs); return true; } -bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { +bool ReassociateGEPs::splitAndLowerGEP(GetElementPtrInst *GEP, + LICMRanker &Ranker) { + bool NeedsExtraction; + int64_t AccumulativeByteOffset = + accumulateByteOffset(GEP, NeedsExtraction, true); + + SmallVector GEPsToLower; + if (NeedsExtraction) + stripConstantsFromGEP(GEP); + + splitGEPForLICM(GEP, Ranker, GEPsToLower); + if (GEPsToLower.empty()) + GEPsToLower.push_back(GEP); + + for (unsigned i = 0; i < GEPsToLower.size(); ++i) { + GetElementPtrInst *NewGEP = GEPsToLower[i]; + // Only add the byte offset to the last GEP. + int64_t Offset = (i == GEPsToLower.size() - 1) ? AccumulativeByteOffset : 0; + // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to + // arithmetic operations if the target uses alias analysis in codegen. 
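// Illustrative only (hypothetical names): the two lowered forms chosen below.
// The accumulated constant (20 bytes in this sketch) is applied once, to the
// last GEP in GEPsToLower, per the Offset selection above.
//
//   useAA():   %g0 = getelementptr i8, i8* %base.i8, i64 %idx.scaled
//              %g1 = getelementptr i8, i8* %g0, i64 20
//              %p  = bitcast i8* %g1 to float*
//   otherwise: %s  = add i64 %base.int, %idx.scaled
//              %t  = add i64 %s, 20
//              %p  = inttoptr i64 %t to float*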
+ if (TM && TM->getSubtargetImpl(*NewGEP->getParent()->getParent())->useAA()) + lowerToSingleIndexGEPs(NewGEP, Offset); + else + lowerToArithmetics(NewGEP, Offset); + } + return true; +} + +bool ReassociateGEPs::runOnFunction(Function &F) { if (skipOptnoneFunction(F)) return false; - if (DisableSeparateConstOffsetFromGEP) + if (DisableReassociateGEPs) return false; + LoopInfo *LI = &getAnalysis().getLoopInfo(); + LICMRanker Ranker(LI); + bool Changed = false; for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) { + for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE;) { if (GetElementPtrInst *GEP = dyn_cast(I++)) { - Changed |= splitGEP(GEP); + // Skip vector GEPs. + if (GEP->getType()->isVectorTy()) + continue; + + // The backend can already nicely handle the case where all indices are + // constant. + if (GEP->hasAllConstantIndices()) + continue; + + Changed |= canonicalizeArrayIndicesToPointerSize(GEP); + + if (LowerGEP) + Changed |= splitAndLowerGEP(GEP, Ranker); + else + Changed |= splitGEP(GEP, Ranker); } // No need to split GEP ConstantExprs because all its indices are constant // already. Index: lib/Transforms/Scalar/Scalar.cpp =================================================================== --- lib/Transforms/Scalar/Scalar.cpp +++ lib/Transforms/Scalar/Scalar.cpp @@ -73,7 +73,7 @@ initializeStructurizeCFGPass(Registry); initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); - initializeSeparateConstOffsetFromGEPPass(Registry); + initializeReassociateGEPsPass(Registry); initializeStraightLineStrengthReducePass(Registry); initializeLoadCombinePass(Registry); initializePlaceBackedgeSafepointsImplPass(Registry); Index: lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp =================================================================== --- lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -1,1010 +0,0 @@ -//===-- SeparateConstOffsetFromGEP.cpp - ------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Loop unrolling may create many similar GEPs for array accesses. -// e.g., a 2-level loop -// -// float a[32][32]; // global variable -// -// for (int i = 0; i < 2; ++i) { -// for (int j = 0; j < 2; ++j) { -// ... -// ... = a[x + i][y + j]; -// ... -// } -// } -// -// will probably be unrolled to: -// -// gep %a, 0, %x, %y; load -// gep %a, 0, %x, %y + 1; load -// gep %a, 0, %x + 1, %y; load -// gep %a, 0, %x + 1, %y + 1; load -// -// LLVM's GVN does not use partial redundancy elimination yet, and is thus -// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs -// significant slowdown in targets with limited addressing modes. For instance, -// because the PTX target does not support the reg+reg addressing mode, the -// NVPTX backend emits PTX code that literally computes the pointer address of -// each GEP, wasting tons of registers. It emits the following PTX for the -// first load and similar PTX for other loads. 
-// -// mov.u32 %r1, %x; -// mov.u32 %r2, %y; -// mul.wide.u32 %rl2, %r1, 128; -// mov.u64 %rl3, a; -// add.s64 %rl4, %rl3, %rl2; -// mul.wide.u32 %rl5, %r2, 4; -// add.s64 %rl6, %rl4, %rl5; -// ld.global.f32 %f1, [%rl6]; -// -// To reduce the register pressure, the optimization implemented in this file -// merges the common part of a group of GEPs, so we can compute each pointer -// address by adding a simple offset to the common part, saving many registers. -// -// It works by splitting each GEP into a variadic base and a constant offset. -// The variadic base can be computed once and reused by multiple GEPs, and the -// constant offsets can be nicely folded into the reg+immediate addressing mode -// (supported by most targets) without using any extra register. -// -// For instance, we transform the four GEPs and four loads in the above example -// into: -// -// base = gep a, 0, x, y -// load base -// laod base + 1 * sizeof(float) -// load base + 32 * sizeof(float) -// load base + 33 * sizeof(float) -// -// Given the transformed IR, a backend that supports the reg+immediate -// addressing mode can easily fold the pointer arithmetics into the loads. For -// example, the NVPTX backend can easily fold the pointer arithmetics into the -// ld.global.f32 instructions, and the resultant PTX uses much fewer registers. -// -// mov.u32 %r1, %tid.x; -// mov.u32 %r2, %tid.y; -// mul.wide.u32 %rl2, %r1, 128; -// mov.u64 %rl3, a; -// add.s64 %rl4, %rl3, %rl2; -// mul.wide.u32 %rl5, %r2, 4; -// add.s64 %rl6, %rl4, %rl5; -// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX -// ld.global.f32 %f2, [%rl6+4]; // much better -// ld.global.f32 %f3, [%rl6+128]; // much better -// ld.global.f32 %f4, [%rl6+132]; // much better -// -// Another improvement enabled by the LowerGEP flag is to lower a GEP with -// multiple indices to either multiple GEPs with a single index or arithmetic -// operations (depending on whether the target uses alias analysis in codegen). -// Such transformation can have following benefits: -// (1) It can always extract constants in the indices of structure type. -// (2) After such Lowering, there are more optimization opportunities such as -// CSE, LICM and CGP. -// -// E.g. The following GEPs have multiple indices: -// BB1: -// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3 -// load %p -// ... -// BB2: -// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2 -// load %p2 -// ... -// -// We can not do CSE for to the common part related to index "i64 %i". Lowering -// GEPs can achieve such goals. -// If the target does not use alias analysis in codegen, this pass will -// lower a GEP with multiple indices into arithmetic operations: -// BB1: -// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity -// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity -// %3 = add i64 %1, %2 ; CSE opportunity -// %4 = mul i64 %j1, length_of_struct -// %5 = add i64 %3, %4 -// %6 = add i64 %3, struct_field_3 ; Constant offset -// %p = inttoptr i64 %6 to i32* -// load %p -// ... -// BB2: -// %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity -// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity -// %9 = add i64 %7, %8 ; CSE opportunity -// %10 = mul i64 %j2, length_of_struct -// %11 = add i64 %9, %10 -// %12 = add i64 %11, struct_field_2 ; Constant offset -// %p = inttoptr i64 %12 to i32* -// load %p2 -// ... 
-// -// If the target uses alias analysis in codegen, this pass will lower a GEP -// with multiple indices into multiple GEPs with a single index: -// BB1: -// %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity -// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity -// %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity -// %4 = mul i64 %j1, length_of_struct -// %5 = getelementptr i8* %3, i64 %4 -// %6 = getelementptr i8* %5, struct_field_3 ; Constant offset -// %p = bitcast i8* %6 to i32* -// load %p -// ... -// BB2: -// %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity -// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity -// %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity -// %10 = mul i64 %j2, length_of_struct -// %11 = getelementptr i8* %9, i64 %10 -// %12 = getelementptr i8* %11, struct_field_2 ; Constant offset -// %p2 = bitcast i8* %12 to i32* -// load %p2 -// ... -// -// Lowering GEPs can also benefit other passes such as LICM and CGP. -// LICM (Loop Invariant Code Motion) can not hoist/sink a GEP of multiple -// indices if one of the index is variant. If we lower such GEP into invariant -// parts and variant parts, LICM can hoist/sink those invariant parts. -// CGP (CodeGen Prepare) tries to sink address calculations that match the -// target's addressing modes. A GEP with multiple indices may not match and will -// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of -// them. So we end up with a better addressing mode. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/Constants.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/Operator.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" -#include "llvm/IR/IRBuilder.h" - -using namespace llvm; - -static cl::opt DisableSeparateConstOffsetFromGEP( - "disable-separate-const-offset-from-gep", cl::init(false), - cl::desc("Do not separate the constant offset from a GEP instruction"), - cl::Hidden); - -namespace { - -/// \brief A helper class for separating a constant offset from a GEP index. -/// -/// In real programs, a GEP index may be more complicated than a simple addition -/// of something and a constant integer which can be trivially splitted. For -/// example, to split ((a << 3) | 5) + b, we need to search deeper for the -/// constant offset, so that we can separate the index to (a << 3) + b and 5. -/// -/// Therefore, this class looks into the expression that computes a given GEP -/// index, and tries to find a constant integer that can be hoisted to the -/// outermost level of the expression as an addition. Not every constant in an -/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a + -/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case, -/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15). -class ConstantOffsetExtractor { - public: - /// Extracts a constant offset from the given GEP index. It returns the - /// new index representing the remainder (equal to the original index minus - /// the constant offset), or nullptr if we cannot extract a constant offset. 
- /// \p Idx The given GEP index - /// \p GEP The given GEP - static Value *Extract(Value *Idx, GetElementPtrInst *GEP); - /// Looks for a constant offset from the given GEP index without extracting - /// it. It returns the numeric value of the extracted constant offset (0 if - /// failed). The meaning of the arguments are the same as Extract. - static int64_t Find(Value *Idx, GetElementPtrInst *GEP); - - private: - ConstantOffsetExtractor(Instruction *InsertionPt) : IP(InsertionPt) {} - /// Searches the expression that computes V for a non-zero constant C s.t. - /// V can be reassociated into the form V' + C. If the searching is - /// successful, returns C and update UserChain as a def-use chain from C to V; - /// otherwise, UserChain is empty. - /// - /// \p V The given expression - /// \p SignExtended Whether V will be sign-extended in the computation of the - /// GEP index - /// \p ZeroExtended Whether V will be zero-extended in the computation of the - /// GEP index - /// \p NonNegative Whether V is guaranteed to be non-negative. For example, - /// an index of an inbounds GEP is guaranteed to be - /// non-negative. Levaraging this, we can better split - /// inbounds GEPs. - APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative); - /// A helper function to look into both operands of a binary operator. - APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended, - bool ZeroExtended); - /// After finding the constant offset C from the GEP index I, we build a new - /// index I' s.t. I' + C = I. This function builds and returns the new - /// index I' according to UserChain produced by function "find". - /// - /// The building conceptually takes two steps: - /// 1) iteratively distribute s/zext towards the leaves of the expression tree - /// that computes I - /// 2) reassociate the expression tree to the form I' + C. - /// - /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute - /// sext to a, b and 5 so that we have - /// sext(a) + (sext(b) + 5). - /// Then, we reassociate it to - /// (sext(a) + sext(b)) + 5. - /// Given this form, we know I' is sext(a) + sext(b). - Value *rebuildWithoutConstOffset(); - /// After the first step of rebuilding the GEP index without the constant - /// offset, distribute s/zext to the operands of all operators in UserChain. - /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) => - /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))). - /// - /// The function also updates UserChain to point to new subexpressions after - /// distributing s/zext. e.g., the old UserChain of the above example is - /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)), - /// and the new UserChain is - /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) -> - /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)) - /// - /// \p ChainIndex The index to UserChain. ChainIndex is initially - /// UserChain.size() - 1, and is decremented during - /// the recursion. - Value *distributeExtsAndCloneChain(unsigned ChainIndex); - /// Reassociates the GEP index to the form I' + C and returns I'. - Value *removeConstOffset(unsigned ChainIndex); - /// A helper function to apply ExtInsts, a list of s/zext, to value V. - /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function - /// returns "sext i32 (zext i16 V to i32) to i64". - Value *applyExts(Value *V); - - /// Returns true if LHS and RHS have no bits in common, i.e., LHS | RHS == 0. 
- bool NoCommonBits(Value *LHS, Value *RHS) const; - /// Computes which bits are known to be one or zero. - /// \p KnownOne Mask of all bits that are known to be one. - /// \p KnownZero Mask of all bits that are known to be zero. - void ComputeKnownBits(Value *V, APInt &KnownOne, APInt &KnownZero) const; - /// A helper function that returns whether we can trace into the operands - /// of binary operator BO for a constant offset. - /// - /// \p SignExtended Whether BO is surrounded by sext - /// \p ZeroExtended Whether BO is surrounded by zext - /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound - /// array index. - bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO, - bool NonNegative); - - /// The path from the constant offset to the old GEP index. e.g., if the GEP - /// index is "a * b + (c + 5)". After running function find, UserChain[0] will - /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and - /// UserChain[2] will be the entire expression "a * b + (c + 5)". - /// - /// This path helps to rebuild the new GEP index. - SmallVector UserChain; - /// A data structure used in rebuildWithoutConstOffset. Contains all - /// sext/zext instructions along UserChain. - SmallVector ExtInsts; - Instruction *IP; /// Insertion position of cloned instructions. -}; - -/// \brief A pass that tries to split every GEP in the function into a variadic -/// base and a constant offset. It is a FunctionPass because searching for the -/// constant offset may inspect other basic blocks. -class SeparateConstOffsetFromGEP : public FunctionPass { - public: - static char ID; - SeparateConstOffsetFromGEP(const TargetMachine *TM = nullptr, - bool LowerGEP = false) - : FunctionPass(ID), TM(TM), LowerGEP(LowerGEP) { - initializeSeparateConstOffsetFromGEPPass(*PassRegistry::getPassRegistry()); - } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.setPreservesCFG(); - } - - bool runOnFunction(Function &F) override; - - private: - /// Tries to split the given GEP into a variadic base and a constant offset, - /// and returns true if the splitting succeeds. - bool splitGEP(GetElementPtrInst *GEP); - /// Lower a GEP with multiple indices into multiple GEPs with a single index. - /// Function splitGEP already split the original GEP into a variadic part and - /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the - /// variadic part into a set of GEPs with a single index and applies - /// AccumulativeByteOffset to it. - /// \p Variadic The variadic part of the original GEP. - /// \p AccumulativeByteOffset The constant offset. - void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic, - int64_t AccumulativeByteOffset); - /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form. - /// Function splitGEP already split the original GEP into a variadic part and - /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the - /// variadic part into a set of arithmetic operations and applies - /// AccumulativeByteOffset to it. - /// \p Variadic The variadic part of the original GEP. - /// \p AccumulativeByteOffset The constant offset. - void lowerToArithmetics(GetElementPtrInst *Variadic, - int64_t AccumulativeByteOffset); - /// Finds the constant offset within each index and accumulates them. If - /// LowerGEP is true, it finds in indices of both sequential and structure - /// types, otherwise it only finds in sequential indices. 
The output - /// NeedsExtraction indicates whether we successfully find a non-zero constant - /// offset. - int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction); - /// Canonicalize array indices to pointer-size integers. This helps to - /// simplify the logic of splitting a GEP. For example, if a + b is a - /// pointer-size integer, we have - /// gep base, a + b = gep (gep base, a), b - /// However, this equality may not hold if the size of a + b is smaller than - /// the pointer size, because LLVM conceptually sign-extends GEP indices to - /// pointer size before computing the address - /// (http://llvm.org/docs/LangRef.html#id181). - /// - /// This canonicalization is very likely already done in clang and - /// instcombine. Therefore, the program will probably remain the same. - /// - /// Returns true if the module changes. - /// - /// Verified in @i32_add in split-gep.ll - bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP); - - const TargetMachine *TM; - /// Whether to lower a GEP with multiple indices into arithmetic operations or - /// multiple GEPs with a single index. - bool LowerGEP; -}; -} // anonymous namespace - -char SeparateConstOffsetFromGEP::ID = 0; -INITIALIZE_PASS_BEGIN( - SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", - "Split GEPs to a variadic base and a constant offset for better CSE", false, - false) -INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END( - SeparateConstOffsetFromGEP, "separate-const-offset-from-gep", - "Split GEPs to a variadic base and a constant offset for better CSE", false, - false) - -FunctionPass * -llvm::createSeparateConstOffsetFromGEPPass(const TargetMachine *TM, - bool LowerGEP) { - return new SeparateConstOffsetFromGEP(TM, LowerGEP); -} - -bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended, - bool ZeroExtended, - BinaryOperator *BO, - bool NonNegative) { - // We only consider ADD, SUB and OR, because a non-zero constant found in - // expressions composed of these operations can be easily hoisted as a - // constant offset by reassociation. - if (BO->getOpcode() != Instruction::Add && - BO->getOpcode() != Instruction::Sub && - BO->getOpcode() != Instruction::Or) { - return false; - } - - Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1); - // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS - // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS). - if (BO->getOpcode() == Instruction::Or && !NoCommonBits(LHS, RHS)) - return false; - - // In addition, tracing into BO requires that its surrounding s/zext (if - // any) is distributable to both operands. - // - // Suppose BO = A op B. - // SignExtended | ZeroExtended | Distributable? - // --------------+--------------+---------------------------------- - // 0 | 0 | true because no s/zext exists - // 0 | 1 | zext(BO) == zext(A) op zext(B) - // 1 | 0 | sext(BO) == sext(A) op sext(B) - // 1 | 1 | zext(sext(BO)) == - // | | zext(sext(A)) op zext(sext(B)) - if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) { - // If a + b >= 0 and (a >= 0 or b >= 0), then - // sext(a + b) = sext(a) + sext(b) - // even if the addition is not marked nsw. - // - // Leveraging this invarient, we can trace into an sext'ed inbound GEP - // index if the constant offset is non-negative. - // - // Verified in @sext_add in split-gep.ll. 
- if (ConstantInt *ConstLHS = dyn_cast(LHS)) { - if (!ConstLHS->isNegative()) - return true; - } - if (ConstantInt *ConstRHS = dyn_cast(RHS)) { - if (!ConstRHS->isNegative()) - return true; - } - } - - // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B) - // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B) - if (BO->getOpcode() == Instruction::Add || - BO->getOpcode() == Instruction::Sub) { - if (SignExtended && !BO->hasNoSignedWrap()) - return false; - if (ZeroExtended && !BO->hasNoUnsignedWrap()) - return false; - } - - return true; -} - -APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO, - bool SignExtended, - bool ZeroExtended) { - // BO being non-negative does not shed light on whether its operands are - // non-negative. Clear the NonNegative flag here. - APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended, - /* NonNegative */ false); - // If we found a constant offset in the left operand, stop and return that. - // This shortcut might cause us to miss opportunities of combining the - // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9. - // However, such cases are probably already handled by -instcombine, - // given this pass runs after the standard optimizations. - if (ConstantOffset != 0) return ConstantOffset; - ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended, - /* NonNegative */ false); - // If U is a sub operator, negate the constant offset found in the right - // operand. - if (BO->getOpcode() == Instruction::Sub) - ConstantOffset = -ConstantOffset; - return ConstantOffset; -} - -APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended, - bool ZeroExtended, bool NonNegative) { - // TODO(jingyue): We could trace into integer/pointer casts, such as - // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only - // integers because it gives good enough results for our benchmarks. - unsigned BitWidth = cast(V->getType())->getBitWidth(); - - // We cannot do much with Values that are not a User, such as an Argument. - User *U = dyn_cast(V); - if (U == nullptr) return APInt(BitWidth, 0); - - APInt ConstantOffset(BitWidth, 0); - if (ConstantInt *CI = dyn_cast(V)) { - // Hooray, we found it! - ConstantOffset = CI->getValue(); - } else if (BinaryOperator *BO = dyn_cast(V)) { - // Trace into subexpressions for more hoisting opportunities. - if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative)) { - ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended); - } - } else if (isa(V)) { - ConstantOffset = find(U->getOperand(0), /* SignExtended */ true, - ZeroExtended, NonNegative).sext(BitWidth); - } else if (isa(V)) { - // As an optimization, we can clear the SignExtended flag because - // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll. - // - // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0. - ConstantOffset = - find(U->getOperand(0), /* SignExtended */ false, - /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth); - } - - // If we found a non-zero constant offset, add it to the path for - // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't - // help this optimization. - if (ConstantOffset != 0) - UserChain.push_back(U); - return ConstantOffset; -} - -Value *ConstantOffsetExtractor::applyExts(Value *V) { - Value *Current = V; - // ExtInsts is built in the use-def order. Therefore, we apply them to V - // in the reversed order. 
- for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) { - if (Constant *C = dyn_cast(Current)) { - // If Current is a constant, apply s/zext using ConstantExpr::getCast. - // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt. - Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType()); - } else { - Instruction *Ext = (*I)->clone(); - Ext->setOperand(0, Current); - Ext->insertBefore(IP); - Current = Ext; - } - } - return Current; -} - -Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() { - distributeExtsAndCloneChain(UserChain.size() - 1); - // Remove all nullptrs (used to be s/zext) from UserChain. - unsigned NewSize = 0; - for (auto I = UserChain.begin(), E = UserChain.end(); I != E; ++I) { - if (*I != nullptr) { - UserChain[NewSize] = *I; - NewSize++; - } - } - UserChain.resize(NewSize); - return removeConstOffset(UserChain.size() - 1); -} - -Value * -ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) { - User *U = UserChain[ChainIndex]; - if (ChainIndex == 0) { - assert(isa(U)); - // If U is a ConstantInt, applyExts will return a ConstantInt as well. - return UserChain[ChainIndex] = cast(applyExts(U)); - } - - if (CastInst *Cast = dyn_cast(U)) { - assert((isa(Cast) || isa(Cast)) && - "We only traced into two types of CastInst: sext and zext"); - ExtInsts.push_back(Cast); - UserChain[ChainIndex] = nullptr; - return distributeExtsAndCloneChain(ChainIndex - 1); - } - - // Function find only trace into BinaryOperator and CastInst. - BinaryOperator *BO = cast(U); - // OpNo = which operand of BO is UserChain[ChainIndex - 1] - unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1); - Value *TheOther = applyExts(BO->getOperand(1 - OpNo)); - Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1); - - BinaryOperator *NewBO = nullptr; - if (OpNo == 0) { - NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther, - BO->getName(), IP); - } else { - NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain, - BO->getName(), IP); - } - return UserChain[ChainIndex] = NewBO; -} - -Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) { - if (ChainIndex == 0) { - assert(isa(UserChain[ChainIndex])); - return ConstantInt::getNullValue(UserChain[ChainIndex]->getType()); - } - - BinaryOperator *BO = cast(UserChain[ChainIndex]); - unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1); - assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]); - Value *NextInChain = removeConstOffset(ChainIndex - 1); - Value *TheOther = BO->getOperand(1 - OpNo); - - // If NextInChain is 0 and not the LHS of a sub, we can simplify the - // sub-expression to be just TheOther. - if (ConstantInt *CI = dyn_cast(NextInChain)) { - if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0)) - return TheOther; - } - - if (BO->getOpcode() == Instruction::Or) { - // Rebuild "or" as "add", because "or" may be invalid for the new - // epxression. - // - // For instance, given - // a | (b + 5) where a and b + 5 have no common bits, - // we can extract 5 as the constant offset. - // - // However, reusing the "or" in the new index would give us - // (a | b) + 5 - // which does not equal a | (b + 5). 
- // - // Replacing the "or" with "add" is fine, because - // a | (b + 5) = a + (b + 5) = (a + b) + 5 - if (OpNo == 0) { - return BinaryOperator::CreateAdd(NextInChain, TheOther, BO->getName(), - IP); - } else { - return BinaryOperator::CreateAdd(TheOther, NextInChain, BO->getName(), - IP); - } - } - - // We can reuse BO in this case, because the new expression shares the same - // instruction type and BO is used at most once. - assert(BO->getNumUses() <= 1 && - "distributeExtsAndCloneChain clones each BinaryOperator in " - "UserChain, so no one should be used more than " - "once"); - BO->setOperand(OpNo, NextInChain); - BO->setHasNoSignedWrap(false); - BO->setHasNoUnsignedWrap(false); - // Make sure it appears after all instructions we've inserted so far. - BO->moveBefore(IP); - return BO; -} - -Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP) { - ConstantOffsetExtractor Extractor(GEP); - // Find a non-zero constant offset first. - APInt ConstantOffset = - Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, - GEP->isInBounds()); - if (ConstantOffset == 0) - return nullptr; - // Separates the constant offset from the GEP index. - return Extractor.rebuildWithoutConstOffset(); -} - -int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP) { - // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative. - return ConstantOffsetExtractor(GEP) - .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false, - GEP->isInBounds()) - .getSExtValue(); -} - -void ConstantOffsetExtractor::ComputeKnownBits(Value *V, APInt &KnownOne, - APInt &KnownZero) const { - IntegerType *IT = cast(V->getType()); - KnownOne = APInt(IT->getBitWidth(), 0); - KnownZero = APInt(IT->getBitWidth(), 0); - const DataLayout &DL = IP->getModule()->getDataLayout(); - llvm::computeKnownBits(V, KnownZero, KnownOne, DL, 0); -} - -bool ConstantOffsetExtractor::NoCommonBits(Value *LHS, Value *RHS) const { - assert(LHS->getType() == RHS->getType() && - "LHS and RHS should have the same type"); - APInt LHSKnownOne, LHSKnownZero, RHSKnownOne, RHSKnownZero; - ComputeKnownBits(LHS, LHSKnownOne, LHSKnownZero); - ComputeKnownBits(RHS, RHSKnownOne, RHSKnownZero); - return (LHSKnownZero | RHSKnownZero).isAllOnesValue(); -} - -bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize( - GetElementPtrInst *GEP) { - bool Changed = false; - const DataLayout &DL = GEP->getModule()->getDataLayout(); - Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); - gep_type_iterator GTI = gep_type_begin(*GEP); - for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); - I != E; ++I, ++GTI) { - // Skip struct member indices which must be i32. - if (isa(*GTI)) { - if ((*I)->getType() != IntPtrTy) { - *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP); - Changed = true; - } - } - } - return Changed; -} - -int64_t -SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP, - bool &NeedsExtraction) { - NeedsExtraction = false; - int64_t AccumulativeByteOffset = 0; - gep_type_iterator GTI = gep_type_begin(*GEP); - const DataLayout &DL = GEP->getModule()->getDataLayout(); - for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { - if (isa(*GTI)) { - // Tries to extract a constant offset from this GEP index. - int64_t ConstantOffset = - ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP); - if (ConstantOffset != 0) { - NeedsExtraction = true; - // A GEP may have multiple indices. 
We accumulate the extracted - // constant offset to a byte offset, and later offset the remainder of - // the original GEP with this byte offset. - AccumulativeByteOffset += - ConstantOffset * DL.getTypeAllocSize(GTI.getIndexedType()); - } - } else if (LowerGEP) { - StructType *StTy = cast(*GTI); - uint64_t Field = cast(GEP->getOperand(I))->getZExtValue(); - // Skip field 0 as the offset is always 0. - if (Field != 0) { - NeedsExtraction = true; - AccumulativeByteOffset += - DL.getStructLayout(StTy)->getElementOffset(Field); - } - } - } - return AccumulativeByteOffset; -} - -void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs( - GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) { - IRBuilder<> Builder(Variadic); - const DataLayout &DL = Variadic->getModule()->getDataLayout(); - Type *IntPtrTy = DL.getIntPtrType(Variadic->getType()); - - Type *I8PtrTy = - Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace()); - Value *ResultPtr = Variadic->getOperand(0); - if (ResultPtr->getType() != I8PtrTy) - ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy); - - gep_type_iterator GTI = gep_type_begin(*Variadic); - // Create an ugly GEP for each sequential index. We don't create GEPs for - // structure indices, as they are accumulated in the constant offset index. - for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) { - if (isa(*GTI)) { - Value *Idx = Variadic->getOperand(I); - // Skip zero indices. - if (ConstantInt *CI = dyn_cast(Idx)) - if (CI->isZero()) - continue; - - APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(), - DL.getTypeAllocSize(GTI.getIndexedType())); - // Scale the index by element size. - if (ElementSize != 1) { - if (ElementSize.isPowerOf2()) { - Idx = Builder.CreateShl( - Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2())); - } else { - Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize)); - } - } - // Create an ugly GEP with a single index for each index. - ResultPtr = - Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep"); - } - } - - // Create a GEP with the constant offset index. - if (AccumulativeByteOffset != 0) { - Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset); - ResultPtr = - Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep"); - } - if (ResultPtr->getType() != Variadic->getType()) - ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType()); - - Variadic->replaceAllUsesWith(ResultPtr); - Variadic->eraseFromParent(); -} - -void -SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic, - int64_t AccumulativeByteOffset) { - IRBuilder<> Builder(Variadic); - const DataLayout &DL = Variadic->getModule()->getDataLayout(); - Type *IntPtrTy = DL.getIntPtrType(Variadic->getType()); - - Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy); - gep_type_iterator GTI = gep_type_begin(*Variadic); - // Create ADD/SHL/MUL arithmetic operations for each sequential indices. We - // don't create arithmetics for structure indices, as they are accumulated - // in the constant offset index. - for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) { - if (isa(*GTI)) { - Value *Idx = Variadic->getOperand(I); - // Skip zero indices. - if (ConstantInt *CI = dyn_cast(Idx)) - if (CI->isZero()) - continue; - - APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(), - DL.getTypeAllocSize(GTI.getIndexedType())); - // Scale the index by element size. 
- if (ElementSize != 1) { - if (ElementSize.isPowerOf2()) { - Idx = Builder.CreateShl( - Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2())); - } else { - Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize)); - } - } - // Create an ADD for each index. - ResultPtr = Builder.CreateAdd(ResultPtr, Idx); - } - } - - // Create an ADD for the constant offset index. - if (AccumulativeByteOffset != 0) { - ResultPtr = Builder.CreateAdd( - ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset)); - } - - ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType()); - Variadic->replaceAllUsesWith(ResultPtr); - Variadic->eraseFromParent(); -} - -bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { - // Skip vector GEPs. - if (GEP->getType()->isVectorTy()) - return false; - - // The backend can already nicely handle the case where all indices are - // constant. - if (GEP->hasAllConstantIndices()) - return false; - - bool Changed = canonicalizeArrayIndicesToPointerSize(GEP); - - bool NeedsExtraction; - int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction); - - if (!NeedsExtraction) - return Changed; - // If LowerGEP is disabled, before really splitting the GEP, check whether the - // backend supports the addressing mode we are about to produce. If no, this - // splitting probably won't be beneficial. - // If LowerGEP is enabled, even the extracted constant offset can not match - // the addressing mode, we can still do optimizations to other lowered parts - // of variable indices. Therefore, we don't check for addressing modes in that - // case. - if (!LowerGEP) { - TargetTransformInfo &TTI = - getAnalysis().getTTI( - *GEP->getParent()->getParent()); - if (!TTI.isLegalAddressingMode(GEP->getType()->getElementType(), - /*BaseGV=*/nullptr, AccumulativeByteOffset, - /*HasBaseReg=*/true, /*Scale=*/0)) { - return Changed; - } - } - - // Remove the constant offset in each sequential index. The resultant GEP - // computes the variadic base. - // Notice that we don't remove struct field indices here. If LowerGEP is - // disabled, a structure index is not accumulated and we still use the old - // one. If LowerGEP is enabled, a structure index is accumulated in the - // constant offset. LowerToSingleIndexGEPs or lowerToArithmetics will later - // handle the constant offset and won't need a new structure index. - gep_type_iterator GTI = gep_type_begin(*GEP); - for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) { - if (isa(*GTI)) { - // Splits this GEP index into a variadic part and a constant offset, and - // uses the variadic part as the new index. - Value *NewIdx = ConstantOffsetExtractor::Extract(GEP->getOperand(I), GEP); - if (NewIdx != nullptr) { - GEP->setOperand(I, NewIdx); - } - } - } - - // Clear the inbounds attribute because the new index may be off-bound. - // e.g., - // - // b = add i64 a, 5 - // addr = gep inbounds float* p, i64 b - // - // is transformed to: - // - // addr2 = gep float* p, i64 a - // addr = gep float* addr2, i64 5 - // - // If a is -4, although the old index b is in bounds, the new index a is - // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the - // inbounds keyword is not present, the offsets are added to the base - // address with silently-wrapping two's complement arithmetic". - // Therefore, the final code will be a semantically equivalent. - // - // TODO(jingyue): do some range analysis to keep as many inbounds as - // possible. 
GEPs with inbounds are more friendly to alias analysis. - GEP->setIsInBounds(false); - - // Lowers a GEP to either GEPs with a single index or arithmetic operations. - if (LowerGEP) { - // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to - // arithmetic operations if the target uses alias analysis in codegen. - if (TM && TM->getSubtargetImpl(*GEP->getParent()->getParent())->useAA()) - lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset); - else - lowerToArithmetics(GEP, AccumulativeByteOffset); - return true; - } - - // No need to create another GEP if the accumulative byte offset is 0. - if (AccumulativeByteOffset == 0) - return true; - - // Offsets the base with the accumulative byte offset. - // - // %gep ; the base - // ... %gep ... - // - // => add the offset - // - // %gep2 ; clone of %gep - // %new.gep = gep %gep2, - // %gep ; will be removed - // ... %gep ... - // - // => replace all uses of %gep with %new.gep and remove %gep - // - // %gep2 ; clone of %gep - // %new.gep = gep %gep2, - // ... %new.gep ... - // - // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an - // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep): - // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the - // type of %gep. - // - // %gep2 ; clone of %gep - // %0 = bitcast %gep2 to i8* - // %uglygep = gep %0, - // %new.gep = bitcast %uglygep to - // ... %new.gep ... - Instruction *NewGEP = GEP->clone(); - NewGEP->insertBefore(GEP); - - // Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned = - // unsigned.. Therefore, we cast ElementTypeSizeOfGEP to signed because it is - // used with unsigned integers later. - const DataLayout &DL = GEP->getModule()->getDataLayout(); - int64_t ElementTypeSizeOfGEP = static_cast( - DL.getTypeAllocSize(GEP->getType()->getElementType())); - Type *IntPtrTy = DL.getIntPtrType(GEP->getType()); - if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) { - // Very likely. As long as %gep is natually aligned, the byte offset we - // extracted should be a multiple of sizeof(*%gep). - int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP; - NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP, - ConstantInt::get(IntPtrTy, Index, true), - GEP->getName(), GEP); - } else { - // Unlikely but possible. For example, - // #pragma pack(1) - // struct S { - // int a[3]; - // int64 b[8]; - // }; - // #pragma pack() - // - // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After - // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is - // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of - // sizeof(int64). - // - // Emit an uglygep in this case. 
- Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(), - GEP->getPointerAddressSpace()); - NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP); - NewGEP = GetElementPtrInst::Create( - Type::getInt8Ty(GEP->getContext()), NewGEP, - ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep", - GEP); - if (GEP->getType() != I8PtrTy) - NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP); - } - - GEP->replaceAllUsesWith(NewGEP); - GEP->eraseFromParent(); - - return true; -} - -bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) { - if (skipOptnoneFunction(F)) - return false; - - if (DisableSeparateConstOffsetFromGEP) - return false; - - bool Changed = false; - for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) { - for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ) { - if (GetElementPtrInst *GEP = dyn_cast(I++)) { - Changed |= splitGEP(GEP); - } - // No need to split GEP ConstantExprs because all its indices are constant - // already. - } - } - return Changed; -} Index: lib/Transforms/Scalar/StraightLineStrengthReduce.cpp =================================================================== --- lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -11,7 +11,7 @@ // strength reduction, this algorithm is designed to reduce arithmetic // redundancy in straight-line code instead of loops. It has proven to be // effective in simplifying arithmetic statements derived from an unrolled loop. -// It can also simplify the logic of SeparateConstOffsetFromGEP. +// It can also simplify the logic of ReassociateGEPs. // // There are many optimizations we can perform in the domain of SLSR. This file // for now contains only an initial step. Specifically, we look for strength Index: test/CodeGen/AArch64/aarch64-gep-opt.ll =================================================================== --- test/CodeGen/AArch64/aarch64-gep-opt.ll +++ test/CodeGen/AArch64/aarch64-gep-opt.ll @@ -4,7 +4,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64-linux-gnueabi" -; Following test cases test enabling SeparateConstOffsetFromGEP pass in AArch64 +; Following test cases test enabling ReassociateGEPs pass in AArch64 ; backend. If useAA() returns true, it will lower a GEP with multiple indices ; into GEPs with a single index, otherwise it will lower it into a ; "ptrtoint+arithmetics+inttoptr" form. @@ -150,7 +150,7 @@ ; CHECK-UseAA-LABEL: @test-struct_2( ; CHECK-UseAA: getelementptr i8, i8* %{{[a-zA-Z0-9]+}}, i64 -40 -; Test that when a index is added from two constant, SeparateConstOffsetFromGEP +; Test that when a index is added from two constant, ReassociateGEPs ; pass does not generate incorrect result. define void @test_const_add([3 x i32]* %in) { %inc = add nsw i32 2, 1 Index: test/CodeGen/PowerPC/ppc64-gep-opt.ll =================================================================== --- test/CodeGen/PowerPC/ppc64-gep-opt.ll +++ test/CodeGen/PowerPC/ppc64-gep-opt.ll @@ -4,7 +4,7 @@ target datalayout = "E-m:e-i64:64-n32:64" target triple = "powerpc64-unknown-linux-gnu" -; Following test cases test enabling SeparateConstOffsetFromGEP pass in the PPC +; Following test cases test enabling ReassociateGEPs pass in the PPC ; backend. If useAA() returns true, it will lower a GEP with multiple indices ; into GEPs with a single index, otherwise it will lower it into a ; "ptrtoint+arithmetics+inttoptr" form. 
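; A hedged sketch (struct layout and names hypothetical, not taken from this
; file) of the struct-field handling these tests rely on: when lowering, the
; field offset is accumulated into the single constant index of an i8 GEP.
;
;   %p = getelementptr inbounds %struct.S, %struct.S* %s, i64 %i, i32 2
;
; with useAA() becomes
;
;   %0 = bitcast %struct.S* %s to i8*
;   %1 = getelementptr i8, i8* %0, i64 %i.scaled   ; %i * sizeof(%struct.S)
;   %2 = getelementptr i8, i8* %1, i64 8           ; assumed 8-byte field offset
;   %p = bitcast i8* %2 to i32*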
@@ -141,7 +141,7 @@ ; CHECK-UseAA-LABEL: @test-struct_2( ; CHECK-UseAA: getelementptr i8, i8* %{{[a-zA-Z0-9]+}}, i64 -40 -; Test that when a index is added from two constant, SeparateConstOffsetFromGEP +; Test that when a index is added from two constant, ReassociateGEPs ; pass does not generate incorrect result. define void @test_const_add([3 x i32]* %in) { %inc = add nsw i32 2, 1 Index: test/Transforms/ReassociateGEPs/NVPTX/split-gep-and-gvn.ll =================================================================== --- test/Transforms/ReassociateGEPs/NVPTX/split-gep-and-gvn.ll +++ test/Transforms/ReassociateGEPs/NVPTX/split-gep-and-gvn.ll @@ -1,11 +1,11 @@ ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX -; RUN: opt < %s -S -separate-const-offset-from-gep -gvn -dce | FileCheck %s --check-prefix=IR +; RUN: opt < %s -S -reassociate-geps -gvn -dce | FileCheck %s --check-prefix=IR -; Verifies the SeparateConstOffsetFromGEP pass. +; Verifies the ReassociateGEPs pass. ; The following code computes ; *output = array[x][y] + array[x][y+1] + array[x+1][y] + array[x+1][y+1] ; -; We expect SeparateConstOffsetFromGEP to transform it to +; We expect ReassociateGEPs to transform it to ; ; float *base = &a[x][y]; ; *output = base[0] + base[1] + base[32] + base[33]; @@ -60,7 +60,7 @@ ; the order of "sext" and "add" when computing the array indices. @sum_of_array ; computes add before sext, e.g., array[sext(x + 1)][sext(y + 1)], while ; @sum_of_array2 computes sext before add, -; e.g., array[sext(x) + 1][sext(y) + 1]. SeparateConstOffsetFromGEP should be +; e.g., array[sext(x) + 1][sext(y) + 1]. ReassociateGEPs should be ; able to extract constant offsets from both forms. define void @sum_of_array2(i32 %x, i32 %y, float* nocapture %output) { .preheader: Index: test/Transforms/ReassociateGEPs/NVPTX/split-gep.ll =================================================================== --- test/Transforms/ReassociateGEPs/NVPTX/split-gep.ll +++ test/Transforms/ReassociateGEPs/NVPTX/split-gep.ll @@ -1,6 +1,6 @@ -; RUN: opt < %s -separate-const-offset-from-gep -dce -S | FileCheck %s +; RUN: opt < %s -reassociate-geps -dce -S | FileCheck %s -; Several unit tests for -separate-const-offset-from-gep. The transformation +; Several unit tests for -reassociate-geps. The transformation ; heavily relies on TargetTransformInfo, so we put these tests under ; target-specific folders. @@ -245,7 +245,7 @@ %or = or i64 %add, 1 ; CHECK: [[OR:%or[0-9]*]] = add i64 %shl, 1 ; ((a << 2) + 12) and 1 have no common bits. Therefore, - ; SeparateConstOffsetFromGEP is able to extract the 12. + ; ReassociateGEPs is able to extract the 12. ; TODO(jingyue): We could reassociate the expression to combine 12 and 1. %p = getelementptr float, float* %ptr, i64 %or ; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr float, float* %ptr, i64 [[OR]] Index: test/Transforms/ReassociateGEPs/NVPTX/split-licm-gep.ll =================================================================== --- test/Transforms/ReassociateGEPs/NVPTX/split-licm-gep.ll +++ test/Transforms/ReassociateGEPs/NVPTX/split-licm-gep.ll @@ -0,0 +1,161 @@ +; RUN: opt < %s -reassociate-geps -dce -licm -S | FileCheck %s + +; Several unit tests for -reassociate-geps focusing on splitting GEPs for +; LICM. The transformation heavily relies on TargetTransformInfo, so we put +; these tests under target-specific folders. + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +; target triple is necessary; otherwise TargetTransformInfo rejects any +; addressing mode. 
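; As a sketch of the sign-extension handling exercised by @simple_licm below
; (mirroring its CHECK lines): because the index addition is nsw, sext can be
; distributed over it, so the loop-invariant half gets its own GEP that -licm
; can hoist.
;
;   %idx      = add nsw i32 %a, %i
;   %idx.sext = sext i32 %idx to i64
;   %p        = getelementptr i32, i32* %input, i64 %idx.sext
;
; becomes
;
;   %a.sext = sext i32 %a to i64
;   %invgep = getelementptr i32, i32* %input, i64 %a.sext   ; hoisted out of the loop
;   ...
;   %i.sext = sext i32 %i to i64
;   %p      = getelementptr i32, i32* %invgep, i64 %i.sext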
+target triple = "nvptx64-unknown-unknown" + +define void @simple_licm(i32* %input, i32* %output, i32 %a) { +; CHECK-LABEL: @simple_licm +entry: +; CHECK: entry: +; CHECK-NEXT: [[SEXT1:%[a-zA-Z0-9]+]] = sext i32 %a to i64 +; CHECK-NEXT: [[GEP1:%[a-zA-Z0-9]+]] = getelementptr i32, i32* %input, i64 [[SEXT1]] + br label %loop + +loop: +; CHECK: [[SEXT2:%[a-zA-Z0-9]+]] = sext i32 %i to i64 +; CHECK-NEXT: [[GEP2:%[a-zA-Z0-9]+]] = getelementptr i32, i32* [[GEP1]], i64 [[SEXT2]] + %i = phi i32 [ 0, %entry ], [ %i.next, %loop ] + %idx = add nsw i32 %a, %i + %idx.sext = sext i32 %idx to i64 + %arrayidx = getelementptr i32, i32* %input, i64 %idx.sext + %0 = load i32, i32* %arrayidx + + ; Dummy store to prevent DCE. + store i32 %0, i32* %output + + %i.next = add nuw nsw i32 %i, 1 + %exitcond = icmp eq i32 %i.next, 1000000 + br i1 %exitcond, label %loop.exit, label %loop + +loop.exit: + ret void +} + +; GEP index with both a loop-invariant subexpression and a constant. +; Constant should be hoisted first followed by the loop-invariant +; reassociation and splitting. +define void @gep_with_const(i32* %input, i32* %output, i64 %a) { +; CHECK-LABEL: @gep_with_const +entry: +; CHECK: entry: +; CHECK-NEXT: [[GEP1:%[a-zA-Z0-9]+]] = getelementptr i32, i32* %input, i64 %a + br label %loop + +loop: +; CHECK: loop: +; CHECK: [[GEP2:%[a-zA-Z0-9]+]] = getelementptr i32, i32* [[GEP1]], i64 %i +; CHECK: %{{[a-zA-Z0-9]+}} = getelementptr i32, i32* [[GEP2]], i64 1234 + %i = phi i64 [ 0, %entry ], [ %i.next, %loop ] + %sum = add nsw i64 %a, 1234 + %idx = add nsw i64 %sum, %i + %arrayidx = getelementptr i32, i32* %input, i64 %idx + %0 = load i32, i32* %arrayidx + + ; Dummy store to prevent DCE. + store i32 %0, i32* %output + + %i.next = add nuw nsw i64 %i, 1 + %exitcond = icmp eq i64 %i.next, 1000000 + br i1 %exitcond, label %loop.exit, label %loop + +loop.exit: + ret void +} + +; GEP with an already invariant index should not be split. +define void @already_invariant_gep(i32* %input, i32* %output, i64 %a) { +; CHECK-LABEL: @already_invariant_gep( +entry: +; CHECK: getelementptr i32, i32* %input, i64 %idx + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %i.next, %loop ] + %idx = add nsw i64 %a, %a + %arrayidx = getelementptr i32, i32* %input, i64 %idx + %0 = load i32, i32* %arrayidx + + ; Dummy store to prevent DCE. + store i32 %0, i32* %output + + %i.next = add nuw nsw i64 %i, 1 + %exitcond = icmp eq i64 %i.next, 1000000 + br i1 %exitcond, label %loop.exit, label %loop + +loop.exit: + ret void +} + +; GEP with an index which is not loop invariant should not be split. +define void @no_loop_invariance(i32* %input, i32* %output) { +; CHECK-LABEL: @no_loop_invariance( +entry: + br label %loop + +loop: +; CHECK: loop: +; CHECK: getelementptr i32, i32* %input, i64 %idx + %i = phi i64 [ 0, %entry ], [ %i.next, %loop ] + %idx = add nsw i64 %i, %i + %arrayidx = getelementptr i32, i32* %input, i64 %idx + %0 = load i32, i32* %arrayidx + + ; Dummy store to prevent DCE. + store i32 %0, i32* %output + + %i.next = add nuw nsw i64 %i, 1 + %exitcond = icmp eq i64 %i.next, 1000000 + br i1 %exitcond, label %loop.exit, label %loop + +loop.exit: + ret void +} + +; Doubly nested loop containing GEP instruction with multiple indices. GEP can +; be split at two of the indices forming two new loop-invariant geps which are +; hoisted to different loop-depths. 
+define void @double_loop([5 x [6 x i32]]* %input, i32* %output, i64 %a) { +; CHECK-LABEL: @double_loop( +entry: +; CHECK: entry: +; CHECK-NEXT: [[GEP1:%[a-zA-Z0-9]+]] = getelementptr [5 x [6 x i32]], [5 x [6 x i32]]* %input, i64 %a, i64 %a + br label %loop1 + +loop1: + %i = phi i64 [ 0, %entry ], [ %i.next, %loop2.exit ] +; CHECK: [[GEP2:%[a-zA-Z0-9]+]] = getelementptr [6 x i32], [6 x i32]* [[GEP1]], i64 %i, i64 %i +; CHECK-NEXT: br label %loop2 + br label %loop2 + +loop2: + %j = phi i64 [ 0, %loop1 ], [ %j.next, %loop2 ] +; CHECK: %j = phi +; CHECK-NEXT: [[GEP3:%[a-zA-Z0-9]+]] = getelementptr i32, i32* [[GEP2]], i64 %j +; CHECK-NEXT: %{{[a-zA-Z0-9]+}} = load i32, i32* [[GEP3]] + + %idx1 = add nsw i64 %a, %i + %idx2 = add nsw i64 %i, %j + %arrayidx = getelementptr [5 x [6 x i32]], [5 x [6 x i32]]* %input, i64 %a, i64 %idx1, i64 %idx2 + %0 = load i32, i32* %arrayidx + + ; Dummy store to prevent DCE. + store i32 %0, i32* %output + + %j.next = add nuw nsw i64 %j, 1 + %exitcond2 = icmp eq i64 %j.next, 1000000 + br i1 %exitcond2, label %loop2.exit, label %loop2 + +loop2.exit: + %i.next = add nuw nsw i64 %i, 1 + %exitcond1 = icmp eq i64 %i.next, 1000000 + br i1 %exitcond1, label %loop1.exit, label %loop1 + +loop1.exit: + ret void +} Index: test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lit.local.cfg =================================================================== --- test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lit.local.cfg +++ test/Transforms/SeparateConstOffsetFromGEP/NVPTX/lit.local.cfg @@ -1,3 +0,0 @@ -if not 'NVPTX' in config.root.targets: - config.unsupported = True - Index: test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll =================================================================== --- test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll +++ test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep-and-gvn.ll @@ -1,196 +0,0 @@ -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s --check-prefix=PTX -; RUN: opt < %s -S -separate-const-offset-from-gep -gvn -dce | FileCheck %s --check-prefix=IR - -; Verifies the SeparateConstOffsetFromGEP pass. -; The following code computes -; *output = array[x][y] + array[x][y+1] + array[x+1][y] + array[x+1][y+1] -; -; We expect SeparateConstOffsetFromGEP to transform it to -; -; float *base = &a[x][y]; -; *output = base[0] + base[1] + base[32] + base[33]; -; -; so the backend can emit PTX that uses fewer virtual registers. 
- -target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" -target triple = "nvptx64-unknown-unknown" - -@array = internal addrspace(3) constant [32 x [32 x float]] zeroinitializer, align 4 - -define void @sum_of_array(i32 %x, i32 %y, float* nocapture %output) { -.preheader: - %0 = sext i32 %y to i64 - %1 = sext i32 %x to i64 - %2 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0 - %3 = addrspacecast float addrspace(3)* %2 to float* - %4 = load float, float* %3, align 4 - %5 = fadd float %4, 0.000000e+00 - %6 = add i32 %y, 1 - %7 = sext i32 %6 to i64 - %8 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %7 - %9 = addrspacecast float addrspace(3)* %8 to float* - %10 = load float, float* %9, align 4 - %11 = fadd float %5, %10 - %12 = add i32 %x, 1 - %13 = sext i32 %12 to i64 - %14 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %0 - %15 = addrspacecast float addrspace(3)* %14 to float* - %16 = load float, float* %15, align 4 - %17 = fadd float %11, %16 - %18 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %7 - %19 = addrspacecast float addrspace(3)* %18 to float* - %20 = load float, float* %19, align 4 - %21 = fadd float %17, %20 - store float %21, float* %output, align 4 - ret void -} -; PTX-LABEL: sum_of_array( -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}} - -; IR-LABEL: @sum_of_array( -; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 1 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 32 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 33 - -; @sum_of_array2 is very similar to @sum_of_array. The only difference is in -; the order of "sext" and "add" when computing the array indices. @sum_of_array -; computes add before sext, e.g., array[sext(x + 1)][sext(y + 1)], while -; @sum_of_array2 computes sext before add, -; e.g., array[sext(x) + 1][sext(y) + 1]. SeparateConstOffsetFromGEP should be -; able to extract constant offsets from both forms. 
-define void @sum_of_array2(i32 %x, i32 %y, float* nocapture %output) { -.preheader: - %0 = sext i32 %y to i64 - %1 = sext i32 %x to i64 - %2 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0 - %3 = addrspacecast float addrspace(3)* %2 to float* - %4 = load float, float* %3, align 4 - %5 = fadd float %4, 0.000000e+00 - %6 = add i64 %0, 1 - %7 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %6 - %8 = addrspacecast float addrspace(3)* %7 to float* - %9 = load float, float* %8, align 4 - %10 = fadd float %5, %9 - %11 = add i64 %1, 1 - %12 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %0 - %13 = addrspacecast float addrspace(3)* %12 to float* - %14 = load float, float* %13, align 4 - %15 = fadd float %10, %14 - %16 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %6 - %17 = addrspacecast float addrspace(3)* %16 to float* - %18 = load float, float* %17, align 4 - %19 = fadd float %15, %18 - store float %19, float* %output, align 4 - ret void -} -; PTX-LABEL: sum_of_array2( -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}} - -; IR-LABEL: @sum_of_array2( -; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 1 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 32 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 33 - - -; This function loads -; array[zext(x)][zext(y)] -; array[zext(x)][zext(y +nuw 1)] -; array[zext(x +nuw 1)][zext(y)] -; array[zext(x +nuw 1)][zext(y +nuw 1)]. -; -; This function is similar to @sum_of_array, but it -; 1) extends array indices using zext instead of sext; -; 2) annotates the addition with "nuw"; otherwise, zext(x + 1) => zext(x) + 1 -; may be invalid. 
-define void @sum_of_array3(i32 %x, i32 %y, float* nocapture %output) { -.preheader: - %0 = zext i32 %y to i64 - %1 = zext i32 %x to i64 - %2 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0 - %3 = addrspacecast float addrspace(3)* %2 to float* - %4 = load float, float* %3, align 4 - %5 = fadd float %4, 0.000000e+00 - %6 = add nuw i32 %y, 1 - %7 = zext i32 %6 to i64 - %8 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %7 - %9 = addrspacecast float addrspace(3)* %8 to float* - %10 = load float, float* %9, align 4 - %11 = fadd float %5, %10 - %12 = add nuw i32 %x, 1 - %13 = zext i32 %12 to i64 - %14 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %0 - %15 = addrspacecast float addrspace(3)* %14 to float* - %16 = load float, float* %15, align 4 - %17 = fadd float %11, %16 - %18 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %13, i64 %7 - %19 = addrspacecast float addrspace(3)* %18 to float* - %20 = load float, float* %19, align 4 - %21 = fadd float %17, %20 - store float %21, float* %output, align 4 - ret void -} -; PTX-LABEL: sum_of_array3( -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}} - -; IR-LABEL: @sum_of_array3( -; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 1 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 32 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 33 - - -; This function loads -; array[zext(x)][zext(y)] -; array[zext(x)][zext(y)] -; array[zext(x) + 1][zext(y) + 1] -; array[zext(x) + 1][zext(y) + 1]. -; -; We expect the generated code to reuse the computation of -; &array[zext(x)][zext(y)]. See the expected IR and PTX for details. 
-define void @sum_of_array4(i32 %x, i32 %y, float* nocapture %output) { -.preheader: - %0 = zext i32 %y to i64 - %1 = zext i32 %x to i64 - %2 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %0 - %3 = addrspacecast float addrspace(3)* %2 to float* - %4 = load float, float* %3, align 4 - %5 = fadd float %4, 0.000000e+00 - %6 = add i64 %0, 1 - %7 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %1, i64 %6 - %8 = addrspacecast float addrspace(3)* %7 to float* - %9 = load float, float* %8, align 4 - %10 = fadd float %5, %9 - %11 = add i64 %1, 1 - %12 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %0 - %13 = addrspacecast float addrspace(3)* %12 to float* - %14 = load float, float* %13, align 4 - %15 = fadd float %10, %14 - %16 = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %11, i64 %6 - %17 = addrspacecast float addrspace(3)* %16 to float* - %18 = load float, float* %17, align 4 - %19 = fadd float %15, %18 - store float %19, float* %output, align 4 - ret void -} -; PTX-LABEL: sum_of_array4( -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG:%(rd|r)[0-9]+]]{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+4{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+128{{\]}} -; PTX: ld.shared.f32 {{%f[0-9]+}}, {{\[}}[[BASE_REG]]+132{{\]}} - -; IR-LABEL: @sum_of_array4( -; IR: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]] addrspace(3)* @array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 1 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 32 -; IR: getelementptr float, float addrspace(3)* [[BASE_PTR]], i64 33 Index: test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll =================================================================== --- test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll +++ test/Transforms/SeparateConstOffsetFromGEP/NVPTX/split-gep.ll @@ -1,279 +0,0 @@ -; RUN: opt < %s -separate-const-offset-from-gep -dce -S | FileCheck %s - -; Several unit tests for -separate-const-offset-from-gep. The transformation -; heavily relies on TargetTransformInfo, so we put these tests under -; target-specific folders. - -target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" -; target triple is necessary; otherwise TargetTransformInfo rejects any -; addressing mode. -target triple = "nvptx64-unknown-unknown" - -%struct.S = type { float, double } - -@struct_array = global [1024 x %struct.S] zeroinitializer, align 16 -@float_2d_array = global [32 x [32 x float]] zeroinitializer, align 4 - -; We should not extract any struct field indices, because fields in a struct -; may have different types. -define double* @struct(i32 %i) { -entry: - %add = add nsw i32 %i, 5 - %idxprom = sext i32 %add to i64 - %p = getelementptr inbounds [1024 x %struct.S], [1024 x %struct.S]* @struct_array, i64 0, i64 %idxprom, i32 1 - ret double* %p -} -; CHECK-LABEL: @struct( -; CHECK: getelementptr [1024 x %struct.S], [1024 x %struct.S]* @struct_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i32 1 - -; We should be able to trace into sext(a + b) if a + b is non-negative -; (e.g., used as an index of an inbounds GEP) and one of a and b is -; non-negative. 
-define float* @sext_add(i32 %i, i32 %j) { -entry: - %0 = add i32 %i, 1 - %1 = sext i32 %0 to i64 ; inbound sext(i + 1) = sext(i) + 1 - %2 = add i32 %j, -2 - ; However, inbound sext(j + -2) != sext(j) + -2, e.g., j = INT_MIN - %3 = sext i32 %2 to i64 - %p = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %1, i64 %3 - ret float* %p -} -; CHECK-LABEL: @sext_add( -; CHECK-NOT: = add -; CHECK: add i32 %j, -2 -; CHECK: sext -; CHECK: getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; CHECK: getelementptr float, float* %{{[a-zA-Z0-9]+}}, i64 32 - -; We should be able to trace into sext/zext if it can be distributed to both -; operands, e.g., sext (add nsw a, b) == add nsw (sext a), (sext b) -; -; This test verifies we can transform -; gep base, a + sext(b +nsw 1), c + zext(d +nuw 1) -; to -; gep base, a + sext(b), c + zext(d); gep ..., 1 * 32 + 1 -define float* @ext_add_no_overflow(i64 %a, i32 %b, i64 %c, i32 %d) { - %b1 = add nsw i32 %b, 1 - %b2 = sext i32 %b1 to i64 - %i = add i64 %a, %b2 ; i = a + sext(b +nsw 1) - %d1 = add nuw i32 %d, 1 - %d2 = zext i32 %d1 to i64 - %j = add i64 %c, %d2 ; j = c + zext(d +nuw 1) - %p = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %j - ret float* %p -} -; CHECK-LABEL: @ext_add_no_overflow( -; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; CHECK: getelementptr float, float* [[BASE_PTR]], i64 33 - -; Verifies we handle nested sext/zext correctly. -define void @sext_zext(i32 %a, i32 %b, float** %out1, float** %out2) { -entry: - %0 = add nsw nuw i32 %a, 1 - %1 = sext i32 %0 to i48 - %2 = zext i48 %1 to i64 ; zext(sext(a +nsw nuw 1)) = zext(sext(a)) + 1 - %3 = add nsw i32 %b, 2 - %4 = sext i32 %3 to i48 - %5 = zext i48 %4 to i64 ; zext(sext(b +nsw 2)) != zext(sext(b)) + 2 - %p1 = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %2, i64 %5 - store float* %p1, float** %out1 - %6 = add nuw i32 %a, 3 - %7 = zext i32 %6 to i48 - %8 = sext i48 %7 to i64 ; sext(zext(a +nuw 3)) = zext(a +nuw 3) = zext(a) + 3 - %9 = add nsw i32 %b, 4 - %10 = zext i32 %9 to i48 - %11 = sext i48 %10 to i64 ; sext(zext(b +nsw 4)) != zext(b) + 4 - %p2 = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %8, i64 %11 - store float* %p2, float** %out2 - ret void -} -; CHECK-LABEL: @sext_zext( -; CHECK: [[BASE_PTR_1:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; CHECK: getelementptr float, float* [[BASE_PTR_1]], i64 32 -; CHECK: [[BASE_PTR_2:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; CHECK: getelementptr float, float* [[BASE_PTR_2]], i64 96 - -; Similar to @ext_add_no_overflow, we should be able to trace into s/zext if -; its operand is an OR and the two operands of the OR have no common bits. 
-define float* @sext_or(i64 %a, i32 %b) { -entry: - %b1 = shl i32 %b, 2 - %b2 = or i32 %b1, 1 ; (b << 2) and 1 have no common bits - %b3 = or i32 %b1, 4 ; (b << 2) and 4 may have common bits - %b2.ext = zext i32 %b2 to i64 - %b3.ext = sext i32 %b3 to i64 - %i = add i64 %a, %b2.ext - %j = add i64 %a, %b3.ext - %p = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %j - ret float* %p -} -; CHECK-LABEL: @sext_or( -; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}} -; CHECK: getelementptr float, float* [[BASE_PTR]], i64 32 - -; The subexpression (b + 5) is used in both "i = a + (b + 5)" and "*out = b + -; 5". When extracting the constant offset 5, make sure "*out = b + 5" isn't -; affected. -define float* @expr(i64 %a, i64 %b, i64* %out) { -entry: - %b5 = add i64 %b, 5 - %i = add i64 %b5, %a - %p = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 0 - store i64 %b5, i64* %out - ret float* %p -} -; CHECK-LABEL: @expr( -; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 0 -; CHECK: getelementptr float, float* [[BASE_PTR]], i64 160 -; CHECK: store i64 %b5, i64* %out - -; d + sext(a +nsw (b +nsw (c +nsw 8))) => (d + sext(a) + sext(b) + sext(c)) + 8 -define float* @sext_expr(i32 %a, i32 %b, i32 %c, i64 %d) { -entry: - %0 = add nsw i32 %c, 8 - %1 = add nsw i32 %b, %0 - %2 = add nsw i32 %a, %1 - %3 = sext i32 %2 to i64 - %i = add i64 %d, %3 - %p = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %i - ret float* %p -} -; CHECK-LABEL: @sext_expr( -; CHECK: sext i32 -; CHECK: sext i32 -; CHECK: sext i32 -; CHECK: getelementptr float, float* %{{[a-zA-Z0-9]+}}, i64 8 - -; Verifies we handle "sub" correctly. -define float* @sub(i64 %i, i64 %j) { - %i2 = sub i64 %i, 5 ; i - 5 - %j2 = sub i64 5, %j ; 5 - i - %p = getelementptr inbounds [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i2, i64 %j2 - ret float* %p -} -; CHECK-LABEL: @sub( -; CHECK: %[[j2:[a-zA-Z0-9]+]] = sub i64 0, %j -; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 %i, i64 %[[j2]] -; CHECK: getelementptr float, float* [[BASE_PTR]], i64 -155 - -%struct.Packed = type <{ [3 x i32], [8 x i64] }> ; <> means packed - -; Verifies we can emit correct uglygep if the address is not natually aligned. 
-define i64* @packed_struct(i32 %i, i32 %j) { -entry: - %s = alloca [1024 x %struct.Packed], align 16 - %add = add nsw i32 %j, 3 - %idxprom = sext i32 %add to i64 - %add1 = add nsw i32 %i, 1 - %idxprom2 = sext i32 %add1 to i64 - %arrayidx3 = getelementptr inbounds [1024 x %struct.Packed], [1024 x %struct.Packed]* %s, i64 0, i64 %idxprom2, i32 1, i64 %idxprom - ret i64* %arrayidx3 -} -; CHECK-LABEL: @packed_struct( -; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [1024 x %struct.Packed], [1024 x %struct.Packed]* %s, i64 0, i64 %{{[a-zA-Z0-9]+}}, i32 1, i64 %{{[a-zA-Z0-9]+}} -; CHECK: [[CASTED_PTR:%[a-zA-Z0-9]+]] = bitcast i64* [[BASE_PTR]] to i8* -; CHECK: %uglygep = getelementptr i8, i8* [[CASTED_PTR]], i64 100 -; CHECK: bitcast i8* %uglygep to i64* - -; We shouldn't be able to extract the 8 from "zext(a +nuw (b + 8))", -; because "zext(b + 8) != zext(b) + 8" -define float* @zext_expr(i32 %a, i32 %b) { -entry: - %0 = add i32 %b, 8 - %1 = add nuw i32 %a, %0 - %i = zext i32 %1 to i64 - %p = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %i - ret float* %p -} -; CHECK-LABEL: zext_expr( -; CHECK: getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %i - -; Per http://llvm.org/docs/LangRef.html#id181, the indices of a off-bound gep -; should be considered sign-extended to the pointer size. Therefore, -; gep base, (add i32 a, b) != gep (gep base, i32 a), i32 b -; because -; sext(a + b) != sext(a) + sext(b) -; -; This test verifies we do not illegitimately extract the 8 from -; gep base, (i32 a + 8) -define float* @i32_add(i32 %a) { -entry: - %i = add i32 %a, 8 - %p = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i32 %i - ret float* %p -} -; CHECK-LABEL: @i32_add( -; CHECK: getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %{{[a-zA-Z0-9]+}} -; CHECK-NOT: getelementptr - -; Verifies that we compute the correct constant offset when the index is -; sign-extended and then zero-extended. The old version of our code failed to -; handle this case because it simply computed the constant offset as the -; sign-extended value of the constant part of the GEP index. -define float* @apint(i1 %a) { -entry: - %0 = add nsw nuw i1 %a, 1 - %1 = sext i1 %0 to i4 - %2 = zext i4 %1 to i64 ; zext (sext i1 1 to i4) to i64 = 15 - %p = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %2 - ret float* %p -} -; CHECK-LABEL: @apint( -; CHECK: [[BASE_PTR:%[a-zA-Z0-9]+]] = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %{{[a-zA-Z0-9]+}} -; CHECK: getelementptr float, float* [[BASE_PTR]], i64 15 - -; Do not trace into binary operators other than ADD, SUB, and OR. -define float* @and(i64 %a) { -entry: - %0 = shl i64 %a, 2 - %1 = and i64 %0, 1 - %p = getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array, i64 0, i64 0, i64 %1 - ret float* %p -} -; CHECK-LABEL: @and( -; CHECK: getelementptr [32 x [32 x float]], [32 x [32 x float]]* @float_2d_array -; CHECK-NOT: getelementptr - -; The code that rebuilds an OR expression used to be buggy, and failed on this -; test. -define float* @shl_add_or(i64 %a, float* %ptr) { -; CHECK-LABEL: @shl_add_or( -entry: - %shl = shl i64 %a, 2 - %add = add i64 %shl, 12 - %or = or i64 %add, 1 -; CHECK: [[OR:%or[0-9]*]] = add i64 %shl, 1 - ; ((a << 2) + 12) and 1 have no common bits. 
Therefore, - ; SeparateConstOffsetFromGEP is able to extract the 12. - ; TODO(jingyue): We could reassociate the expression to combine 12 and 1. - %p = getelementptr float, float* %ptr, i64 %or -; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr float, float* %ptr, i64 [[OR]] -; CHECK: getelementptr float, float* [[PTR]], i64 12 - ret float* %p -; CHECK-NEXT: ret -} - -; The source code used to be buggy in checking -; (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) -; where AccumulativeByteOffset is signed but ElementTypeSizeOfGEP is unsigned. -; The compiler would promote AccumulativeByteOffset to unsigned, causing -; unexpected results. For example, while -64 % (int64_t)24 != 0, -; -64 % (uint64_t)24 == 0. -%struct3 = type { i64, i32 } -%struct2 = type { %struct3, i32 } -%struct1 = type { i64, %struct2 } -%struct0 = type { i32, i32, i64*, [100 x %struct1] } -define %struct2* @sign_mod_unsign(%struct0* %ptr, i64 %idx) { -; CHECK-LABEL: @sign_mod_unsign( -entry: - %arrayidx = add nsw i64 %idx, -2 -; CHECK-NOT: add - %ptr2 = getelementptr inbounds %struct0, %struct0* %ptr, i64 0, i32 3, i64 %arrayidx, i32 1 -; CHECK: [[PTR:%[a-zA-Z0-9]+]] = getelementptr %struct0, %struct0* %ptr, i64 0, i32 3, i64 %idx, i32 1 -; CHECK: [[PTR1:%[a-zA-Z0-9]+]] = bitcast %struct2* [[PTR]] to i8* -; CHECK: getelementptr i8, i8* [[PTR1]], i64 -64 -; CHECK: bitcast - ret %struct2* %ptr2 -; CHECK-NEXT: ret -}