Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -346,6 +346,8 @@
                              bool HasBaseReg, int64_t Scale,
                              unsigned AddrSpace = 0) const;
 
+  unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) const;
+
   /// \brief Return true if it's free to truncate a value of type Ty1 to type
   /// Ty2. e.g. On x86 it's free to truncate a i32 value in register EAX to i16
   /// by referencing its sub-register AX.
@@ -635,6 +637,7 @@
   virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                    int64_t BaseOffset, bool HasBaseReg,
                                    int64_t Scale, unsigned AddrSpace) = 0;
+  virtual unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) = 0;
   virtual bool isTruncateFree(Type *Ty1, Type *Ty2) = 0;
   virtual bool isProfitableToHoist(Instruction *I) = 0;
   virtual bool isTypeLegal(Type *Ty) = 0;
@@ -789,6 +792,9 @@
     return Impl.getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
                                      Scale, AddrSpace);
   }
+  unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) override {
+    return Impl.increasedLSRFormulaCost(I, Offset);
+  }
   bool isTruncateFree(Type *Ty1, Type *Ty2) override {
     return Impl.isTruncateFree(Ty1, Ty2);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -226,6 +226,8 @@
     return -1;
   }
 
+  unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) { return 0; }
+
   bool isTruncateFree(Type *Ty1, Type *Ty2) { return false; }
 
   bool isProfitableToHoist(Instruction *I) { return true; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -139,6 +139,10 @@
     return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
   }
 
+  unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) {
+    return getTLI()->increasedLSRFormulaCost(I, Offset);
+  }
+
   bool isTruncateFree(Type *Ty1, Type *Ty2) {
     return getTLI()->isTruncateFree(Ty1, Ty2);
   }
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -1598,6 +1598,10 @@
     return -1;
   }
 
+  virtual unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) const {
+    return 0;
+  }
+
   /// Return true if the specified immediate is legal icmp immediate, that is
   /// the target has icmp instructions which can compare a register against the
   /// immediate without having to materialize the immediate into a register.
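The hunks above thread the hook through the usual TTI layering: the TargetTransformInfo facade dispatches to the polymorphic model, TargetTransformInfoImpl supplies the zero-cost default, and BasicTTIImpl forwards to TargetLowering, so a target only needs to override the single virtual method. A minimal sketch of the intended query pattern from pass code, assuming only the API added above (the helper name is illustrative, not part of this patch):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Hypothetical helper: ask the target how many extra instructions it
// expects when 'UserInst' addresses memory at the given immediate offset.
// A result of 0 means the offset folds into the access for free.
static unsigned fixupPenalty(const TargetTransformInfo &TTI,
                             Instruction *UserInst, int64_t Offset) {
  return TTI.increasedLSRFormulaCost(UserInst, Offset);
}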
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -144,6 +144,11 @@
   return Cost;
 }
 
+unsigned TargetTransformInfo::increasedLSRFormulaCost(Instruction *I,
+                                                      int64_t Offset) const {
+  return TTIImpl->increasedLSRFormulaCost(I, Offset);
+}
+
 bool TargetTransformInfo::isTruncateFree(Type *Ty1, Type *Ty2) const {
   return TTIImpl->isTruncateFree(Ty1, Ty2);
 }
Index: lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.h
+++ lib/Target/SystemZ/SystemZISelLowering.h
@@ -388,6 +388,7 @@
   bool isLegalAddImmediate(int64_t Imm) const override;
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
                              Type *Ty, unsigned AS) const override;
+  unsigned increasedLSRFormulaCost(Instruction *I, int64_t Offset) const override;
   bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS,
                                       unsigned Align,
                                       bool *Fast) const override;
Index: lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- lib/Target/SystemZ/SystemZISelLowering.cpp
+++ lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/IR/Intrinsics.h"
 
 #include <cctype>
@@ -524,6 +525,23 @@
   return AM.Scale == 0 || AM.Scale == 1;
 }
 
+unsigned SystemZTargetLowering::increasedLSRFormulaCost(Instruction *I,
+                                                        int64_t Offset) const {
+  if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    // There is little support for big offsets with FP / vector types.
+    if (!isUInt<12>(Offset) &&
+        (LI->getType()->isFloatingPointTy() || LI->getType()->isVectorTy()))
+      return 1;
+  }
+  else if (const StoreInst *StI = dyn_cast<StoreInst>(I)) {
+    // There is no vector store with big offset.
+    if (!isUInt<12>(Offset) && StI->getOperand(0)->getType()->isVectorTy())
+      return 1;
+  }
+
+  return 0;
+}
+
 bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
   if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
     return false;
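The SystemZ override above treats any offset outside the unsigned 12-bit displacement range [0, 4095] on an FP or vector load, or on a vector store, as costing one extra instruction. A standalone model of that heuristic, in plain C++ rather than LLVM code, purely for illustration:

#include <cstdint>

// Standalone model of the SystemZ heuristic. Offsets that fit an unsigned
// 12-bit displacement fold into the memory instruction; outside that range
// an FP/vector load or a vector store is charged one extra instruction.
static bool fitsInUInt12(int64_t Offset) {
  return Offset >= 0 && Offset < (int64_t(1) << 12);
}

enum class Access { IntLoad, FPLoad, VecLoad, VecStore, IntStore };

static unsigned increasedCost(Access Kind, int64_t Offset) {
  if (fitsInUInt12(Offset))
    return 0;
  switch (Kind) {
  case Access::FPLoad:   // little support for big offsets with FP types
  case Access::VecLoad:  // ... or with vector types
  case Access::VecStore: // no vector store with a big offset
    return 1;
  default:
    return 0;
  }
}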
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -858,6 +858,38 @@
 
 namespace {
 
+/// An operand value in an instruction which is to be replaced with some
+/// equivalent, possibly strength-reduced, replacement.
+struct LSRFixup {
+  /// The instruction which will be updated.
+  Instruction *UserInst;
+
+  /// The operand of the instruction which will be replaced. The operand may be
+  /// used more than once; every instance will be replaced.
+  Value *OperandValToReplace;
+
+  /// If this user is to use the post-incremented value of an induction
+  /// variable, this variable is non-null and holds the loop associated with
+  /// the induction variable.
+  PostIncLoopSet PostIncLoops;
+
+  /// The index of the LSRUse describing the expression which this fixup needs,
+  /// minus an offset (below).
+  size_t LUIdx;
+
+  /// A constant offset to be added to the LSRUse expression. This allows
+  /// multiple fixups to share the same LSRUse with different offsets, for
+  /// example in an unrolled loop.
+  int64_t Offset;
+
+  bool isUseFullyOutsideLoop(const Loop *L) const;
+
+  LSRFixup();
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
 /// This class is used to measure and compare candidate formulae.
 class Cost {
   /// TODO: Some of these could be merged. Also, a lexical ordering
@@ -902,6 +934,7 @@
                    const SmallVectorImpl<int64_t> &Offsets,
                    ScalarEvolution &SE, DominatorTree &DT,
                    const LSRUse &LU,
+                   const SmallVector<LSRFixup, 16> &Fixups,
                    SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
 
   void print(raw_ostream &OS) const;
@@ -986,60 +1019,6 @@
   }
 }
 
-void Cost::RateFormula(const TargetTransformInfo &TTI,
-                       const Formula &F,
-                       SmallPtrSetImpl<const SCEV *> &Regs,
-                       const DenseSet<const SCEV *> &VisitedRegs,
-                       const Loop *L,
-                       const SmallVectorImpl<int64_t> &Offsets,
-                       ScalarEvolution &SE, DominatorTree &DT,
-                       const LSRUse &LU,
-                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
-  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
-  // Tally up the registers.
-  if (const SCEV *ScaledReg = F.ScaledReg) {
-    if (VisitedRegs.count(ScaledReg)) {
-      Lose();
-      return;
-    }
-    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
-    if (isLoser())
-      return;
-  }
-  for (const SCEV *BaseReg : F.BaseRegs) {
-    if (VisitedRegs.count(BaseReg)) {
-      Lose();
-      return;
-    }
-    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
-    if (isLoser())
-      return;
-  }
-
-  // Determine how many (unfolded) adds we'll need inside the loop.
-  size_t NumBaseParts = F.getNumRegs();
-  if (NumBaseParts > 1)
-    // Do not count the base and a possible second register if the target
-    // allows to fold 2 registers.
-    NumBaseAdds +=
-        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
-  NumBaseAdds += (F.UnfoldedOffset != 0);
-
-  // Accumulate non-free scaling amounts.
-  ScaleCost += getScalingFactorCost(TTI, LU, F);
-
-  // Tally up the non-zero immediates.
-  for (int64_t O : Offsets) {
-    int64_t Offset = (uint64_t)O + F.BaseOffset;
-    if (F.BaseGV)
-      ImmCost += 64; // Handle symbolic values conservatively.
-                     // TODO: This should probably be the pointer size.
-    else if (Offset != 0)
-      ImmCost += APInt(64, Offset, true).getMinSignedBits();
-  }
-  assert(isValid() && "invalid cost");
-}
-
 /// Set this cost to a losing value.
 void Cost::Lose() {
   NumRegs = ~0u;
@@ -1082,42 +1061,6 @@
   print(errs()); errs() << '\n';
 }
 
-namespace {
-
-/// An operand value in an instruction which is to be replaced with some
-/// equivalent, possibly strength-reduced, replacement.
-struct LSRFixup {
-  /// The instruction which will be updated.
-  Instruction *UserInst;
-
-  /// The operand of the instruction which will be replaced. The operand may be
-  /// used more than once; every instance will be replaced.
-  Value *OperandValToReplace;
-
-  /// If this user is to use the post-incremented value of an induction
-  /// variable, this variable is non-null and holds the loop associated with
-  /// the induction variable.
-  PostIncLoopSet PostIncLoops;
-
-  /// The index of the LSRUse describing the expression which this fixup needs,
-  /// minus an offset (below).
-  size_t LUIdx;
-
-  /// A constant offset to be added to the LSRUse expression. This allows
-  /// multiple fixups to share the same LSRUse with different offsets, for
-  /// example in an unrolled loop.
-  int64_t Offset;
-
-  bool isUseFullyOutsideLoop(const Loop *L) const;
-
-  LSRFixup();
-
-  void print(raw_ostream &OS) const;
-  void dump() const;
-};
-
-}
-
 LSRFixup::LSRFixup()
   : UserInst(nullptr), OperandValToReplace(nullptr), LUIdx(~size_t(0)),
     Offset(0) {}
@@ -1222,6 +1165,8 @@
   int64_t MinOffset;
   int64_t MaxOffset;
 
+  SmallVector<size_t, 4> FixupIndexes;
+
   /// This records whether all of the fixups using this LSRUse are outside of
   /// the loop, in which case some special-case heuristics may be used.
   bool AllFixupsOutsideLoop;
@@ -1263,6 +1208,68 @@
   }
 }
 
+void Cost::RateFormula(const TargetTransformInfo &TTI,
+                       const Formula &F,
+                       SmallPtrSetImpl<const SCEV *> &Regs,
+                       const DenseSet<const SCEV *> &VisitedRegs,
+                       const Loop *L,
+                       const SmallVectorImpl<int64_t> &Offsets,
+                       ScalarEvolution &SE, DominatorTree &DT,
+                       const LSRUse &LU,
+                       const SmallVector<LSRFixup, 16> &Fixups,
+                       SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
+  // Tally up the registers.
+  if (const SCEV *ScaledReg = F.ScaledReg) {
+    if (VisitedRegs.count(ScaledReg)) {
+      Lose();
+      return;
+    }
+    RatePrimaryRegister(ScaledReg, Regs, L, SE, DT, LoserRegs);
+    if (isLoser())
+      return;
+  }
+  for (const SCEV *BaseReg : F.BaseRegs) {
+    if (VisitedRegs.count(BaseReg)) {
+      Lose();
+      return;
+    }
+    RatePrimaryRegister(BaseReg, Regs, L, SE, DT, LoserRegs);
+    if (isLoser())
+      return;
+  }
+
+  // Determine how many (unfolded) adds we'll need inside the loop.
+  size_t NumBaseParts = F.getNumRegs();
+  if (NumBaseParts > 1)
+    // Do not count the base and a possible second register if the target
+    // allows to fold 2 registers.
+    NumBaseAdds +=
+        NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F)));
+  NumBaseAdds += (F.UnfoldedOffset != 0);
+
+  // Accumulate non-free scaling amounts.
+  ScaleCost += getScalingFactorCost(TTI, LU, F);
+
+  // Tally up the non-zero immediates.
+  for (int64_t O : Offsets) {
+    int64_t Offset = (uint64_t)O + F.BaseOffset;
+    if (F.BaseGV)
+      ImmCost += 64; // Handle symbolic values conservatively.
+                     // TODO: This should probably be the pointer size.
+    else if (Offset != 0)
+      ImmCost += APInt(64, Offset, true).getMinSignedBits();
+  }
+  assert(isValid() && "invalid cost");
+
+  // Allow target to penalize any Instruction+Offset combination.
+  for (auto &Idx : LU.FixupIndexes) {
+    const LSRFixup &Fixup = Fixups[Idx];
+    int64_t Offset = F.BaseOffset + Fixup.Offset;
+    NumBaseAdds += TTI.increasedLSRFormulaCost(Fixup.UserInst, Offset);
+  }
+}
+
 /// Test whether this use as a formula which has the same registers as the
 /// given formula.
 bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
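Note that the definition of RateFormula is moved below LSRUse (and LSRFixup above it) so the new loop can walk LU.FixupIndexes. The penalty is charged in the same units as NumBaseAdds, so a formula whose effective per-fixup offset (F.BaseOffset + Fixup.Offset) trips the target hook competes as if it needed an extra add inside the loop. A small worked example of the accounting under the SystemZ rule, with hypothetical numbers in plain C++:

#include <cassert>
#include <cstdint>

// Worked example: base offset 4000 with FP-load fixups at +0 and +200.
// 4000 fits the 12-bit range [0, 4095]; 4200 does not, so this formula
// is charged exactly one extra base add.
int main() {
  const int64_t BaseOffset = 4000;
  const int64_t FixupOffsets[] = {0, 200};
  unsigned NumBaseAdds = 0;
  for (int64_t FO : FixupOffsets) {
    int64_t Offset = BaseOffset + FO;
    if (!(Offset >= 0 && Offset < 4096)) // mimics !isUInt<12>(Offset)
      NumBaseAdds += 1;                  // FP load with a big offset
  }
  assert(NumBaseAdds == 1);
  return 0;
}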
@@ -3014,6 +3021,7 @@
     LF.LUIdx = P.first;
     LF.Offset = P.second;
     LSRUse &LU = Uses[LF.LUIdx];
+    LU.FixupIndexes.push_back(Fixups.size() - 1);
     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
     if (!LU.WidestFixupType ||
         SE.getTypeSizeInBits(LU.WidestFixupType) <
@@ -3157,6 +3165,7 @@
     LF.LUIdx = P.first;
     LF.Offset = P.second;
     LSRUse &LU = Uses[LF.LUIdx];
+    LU.FixupIndexes.push_back(Fixups.size() - 1);
     LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
     if (!LU.WidestFixupType ||
         SE.getTypeSizeInBits(LU.WidestFixupType) <
@@ -3892,7 +3901,7 @@
       Cost CostF;
       Regs.clear();
       CostF.RateFormula(TTI, F, Regs, VisitedRegs, L, LU.Offsets, SE, DT, LU,
-                        &LoserRegs);
+                        Fixups, &LoserRegs);
       if (CostF.isLoser()) {
         // During initial formula generation, undesirable formulae are generated
         // by uses within other loops that have some non-trivial address mode or
@@ -3926,7 +3935,7 @@
         Cost CostBest;
         Regs.clear();
         CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, LU.Offsets, SE,
-                             DT, LU);
+                             DT, LU, Fixups);
         if (CostF < CostBest)
           std::swap(F, Best);
         DEBUG(dbgs() << "  Filtering out formula "; F.print(dbgs());
@@ -4266,7 +4275,7 @@
     NewCost = CurCost;
     NewRegs = CurRegs;
     NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, LU.Offsets, SE, DT,
-                        LU);
+                        LU, Fixups);
     if (NewCost < SolutionCost) {
       Workspace.push_back(&F);
       if (Workspace.size() != Uses.size()) {
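Other targets can opt into the same mechanism by overriding the TargetLowering hook. A hedged sketch of what such an override could contain for a hypothetical target whose memory instructions encode only a signed 9-bit displacement; the target, threshold, and free-function framing are all invented for illustration:

#include "llvm/IR/Instructions.h"
#include "llvm/Support/MathExtras.h"

using namespace llvm;

// Hypothetical override body (not part of this patch). In a real target it
// would be declared in the TargetLowering subclass as
//   unsigned increasedLSRFormulaCost(Instruction *I,
//                                    int64_t Offset) const override;
static unsigned myTargetIncreasedLSRFormulaCost(Instruction *I,
                                                int64_t Offset) {
  // Invented rule: stores only fold a signed 9-bit displacement, so any
  // store outside that range needs one extra address computation.
  if (isa<StoreInst>(I) && !isInt<9>(Offset))
    return 1;
  return 0;
}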