Index: include/llvm/Analysis/ScalarEvolution.h
===================================================================
--- include/llvm/Analysis/ScalarEvolution.h
+++ include/llvm/Analysis/ScalarEvolution.h
@@ -523,6 +523,10 @@
   const SCEV *getConstant(Type *Ty, uint64_t V, bool isSigned = false);
   const SCEV *getTruncateExpr(const SCEV *Op, Type *Ty);
   const SCEV *getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth = 0);
+  /// Sink a zext towards the leaves of the SCEV more aggressively using
+  /// KnownBits, which requires the original IR Value to be available.
+  const SCEV *getZeroExtendExprForValue(const Value *OpV, const SCEV *Op,
+                                        Type *Ty, unsigned Depth = 0);
   const SCEV *getSignExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth = 0);
   const SCEV *getAnyExtendExpr(const SCEV *Op, Type *Ty);
   const SCEV *getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
Index: lib/Analysis/ScalarEvolution.cpp
===================================================================
--- lib/Analysis/ScalarEvolution.cpp
+++ lib/Analysis/ScalarEvolution.cpp
@@ -1559,8 +1559,14 @@
   return false;
 }
 
-const SCEV *
-ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty, unsigned Depth) {
+const SCEV *ScalarEvolution::getZeroExtendExpr(const SCEV *Op, Type *Ty,
+                                               unsigned Depth) {
+  return getZeroExtendExprForValue(nullptr, Op, Ty, Depth);
+}
+
+const SCEV *ScalarEvolution::getZeroExtendExprForValue(const Value *OpV,
+                                                       const SCEV *Op, Type *Ty,
+                                                       unsigned Depth) {
   assert(getTypeSizeInBits(Op->getType()) < getTypeSizeInBits(Ty) &&
          "This is not an extending conversion!");
   assert(isSCEVable(Ty) &&
@@ -1777,6 +1783,46 @@
         Ops.push_back(getZeroExtendExpr(Op, Ty, Depth + 1));
       return getAddExpr(Ops, SCEV::FlagNUW, Depth + 1);
     }
+
+    // zext(C + x + y + ...) --> (zext(D) + zext((C - D) + x + y + ...))
+    // if D + (C - D + x + y + ...) can be proven not to unsigned wrap,
+    // where D is the largest such constant with D <= C (unsigned).
+    //
+    // Useful while proving that address arithmetic expressions are equal or
+    // differ by a small constant amount, see the LoadStoreVectorizer pass.
+    if (const auto *SC = dyn_cast<SCEVConstant>(SA->getOperand(0))) {
+      APInt MinValue = getUnsignedRangeMin(SA);
+      // Often address arithmetic contains expressions like
+      // (zext (add (shl X, C1), C2)), for instance, (zext (5 + (4 * X))).
+      // ConstantRange is unable to prove that it's possible to transform
+      // (5 + (4 * X)) to (1 + (4 + (4 * X))) w/o underflowing:
+      //
+      // | Expression    |  ConstantRange          |  KnownBits             |
+      // |---------------|-------------------------|------------------------|
+      // | i8 4 * X      |  [L: 0, U: 253)         |  XXXX XX00             |
+      // |               |    => Min: 0, Max: 252  |    => Min: 0, Max: 252 |
+      // |               |                         |                        |
+      // | i8 4 * X + 5  |  [L: 5, U: 2) (wrapped) |  YYYY YY01             |
+      // |   (101)       |    => Min: 0, Max: 255  |    => Min: 1, Max: 253 |
+      //
+      // On the other hand, ConstantRange [L: 1, U: 3) degrades to
+      // KnownBits 0000 00XX, losing Min: 1, Max: 2 to Min: 0, Max: 3.
+      // So use both if available:
+      if (OpV) {
+        const DataLayout &DL = getDataLayout();
+        KnownBits Known = computeKnownBits(OpV, DL, 0, &AC, nullptr, &DT);
+        MinValue = Known.One.ugt(MinValue) ? Known.One : MinValue;
+      }
+      APInt C = SC->getAPInt();
+      APInt D = MinValue.ugt(C) ? C : MinValue;
+      if (D != 0) {
+        const SCEV *SZExtD = getZeroExtendExpr(getConstant(D), Ty, Depth);
+        const SCEV *SResidual =
+            getAddExpr(getConstant(-D), SA, SCEV::FlagAnyWrap, Depth);
+        const SCEV *SZExtR = getZeroExtendExpr(SResidual, Ty, Depth + 1);
+        return getAddExpr(SZExtD, SZExtR, SCEV::FlagNUW, Depth + 1);
+      }
+    }
   }
 
   if (auto *SM = dyn_cast<SCEVMulExpr>(Op)) {
@@ -6337,8 +6383,10 @@
 
   case Instruction::Trunc:
     return getTruncateExpr(getSCEV(U->getOperand(0)), U->getType());
 
-  case Instruction::ZExt:
-    return getZeroExtendExpr(getSCEV(U->getOperand(0)), U->getType());
+  case Instruction::ZExt: {
+    Value *OpV = U->getOperand(0);
+    return getZeroExtendExprForValue(OpV, getSCEV(OpV), U->getType());
+  }
 
   case Instruction::SExt:
     if (auto BO = MatchBinaryOp(U->getOperand(0), DT)) {
Index: test/Analysis/ScalarEvolution/no-wrap-add-exprs.ll
===================================================================
--- test/Analysis/ScalarEvolution/no-wrap-add-exprs.ll
+++ test/Analysis/ScalarEvolution/no-wrap-add-exprs.ll
@@ -120,3 +120,84 @@
 
   ret void
 }
+
+@z_addr = external global [16 x i8], align 4
+@z_addr_noalign = external global [16 x i8]
+
+%union = type { [10 x [4 x float]] }
+@tmp_addr = external unnamed_addr global { %union, [2000 x i8] }
+
+define void @f3(i8* %x_addr, i8* %y_addr, i32* %tmp_addr) {
+; CHECK-LABEL: Classifying expressions for: @f3
+ entry:
+  %x = load i8, i8* %x_addr
+  %t0 = mul i8 %x, 4
+  %t1 = add i8 %t0, 5
+  %t1.zext = zext i8 %t1 to i16
+; CHECK: %t1.zext = zext i8 %t1 to i16
+; CHECK-NEXT: --> (1 + (zext i8 (4 + (4 * %x)) to i16))<nuw><nsw> U: [1,254) S: [1,257)
+
+  %q0 = mul i8 %x, 4
+  %q1 = add i8 %q0, 7
+  %q1.zext = zext i8 %q1 to i16
+; CHECK: %q1.zext = zext i8 %q1 to i16
+; CHECK-NEXT: --> (3 + (zext i8 (4 + (4 * %x)) to i16))<nuw><nsw> U: [3,256) S: [3,259)
+
+  %p0 = mul i8 %x, 4
+  %p1 = add i8 %p0, 8
+  %p1.zext = zext i8 %p1 to i16
+; CHECK: %p1.zext = zext i8 %p1 to i16
+; CHECK-NEXT: --> (zext i8 (8 + (4 * %x)) to i16) U: [0,253) S: [0,256)
+
+  %r0 = mul i8 %x, 4
+  %r1 = add i8 %r0, 254
+  %r1.zext = zext i8 %r1 to i16
+; CHECK: %r1.zext = zext i8 %r1 to i16
+; CHECK-NEXT: --> (2 + (zext i8 (-4 + (4 * %x)) to i16))<nuw><nsw> U: [2,255) S: [2,258)
+
+  %y = load i8, i8* %y_addr
+  %s0 = mul i8 %x, 32
+  %s1 = mul i8 %y, 36
+  %s2 = add i8 %s0, %s1
+  %s3 = add i8 %s2, 5
+  %s3.zext = zext i8 %s3 to i16
+; CHECK: %s3.zext = zext i8 %s3 to i16
+; CHECK-NEXT: --> (1 + (zext i8 (4 + (32 * %x) + (36 * %y)) to i16))<nuw><nsw> U: [1,254) S: [1,257)
+
+  %ptr = bitcast [16 x i8]* @z_addr to i8*
+  %int0 = ptrtoint i8* %ptr to i32
+  %int5 = add i32 %int0, 5
+  %int.zext = zext i32 %int5 to i64
+; CHECK: %int.zext = zext i32 %int5 to i64
+; CHECK-NEXT: --> (1 + (zext i32 (4 + %int0) to i64))<nuw><nsw> U: [1,4294967294) S: [1,4294967297)
+
+  %ptr_noalign = bitcast [16 x i8]* @z_addr_noalign to i8*
+  %int0_na = ptrtoint i8* %ptr_noalign to i32
+  %int5_na = add i32 %int0_na, 5
+  %int.zext_na = zext i32 %int5_na to i64
+; CHECK: %int.zext_na = zext i32 %int5_na to i64
+; CHECK-NEXT: --> (zext i32 (5 + %int0_na) to i64) U: [0,4294967296) S: [0,4294967296)
+
+  %tmp = load i32, i32* %tmp_addr
+  %mul = and i32 %tmp, -4
+  %add4 = add i32 %mul, 4
+  %add4.zext = zext i32 %add4 to i64
+  %sunkaddr3 = mul i64 %add4.zext, 4
+  %sunkaddr4 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @tmp_addr to i8*), i64 %sunkaddr3
+  %sunkaddr5 = getelementptr inbounds i8, i8* %sunkaddr4, i64 4096
+  %addr4.cast = bitcast i8* %sunkaddr5 to i32*
+  %addr4.incr = getelementptr i32, i32* %addr4.cast, i64 1
+; CHECK: %addr4.incr = getelementptr i32, i32* %addr4.cast, i64 1
+; CHECK-NEXT: --> ([[C:4100]] + ([[SIZE:4]] * (zext i32 ([[OFFSET:4]] + ([[STRIDE:4]] * (%tmp /u [[STRIDE]]))) to i64)) + @tmp_addr)
+
+  %add5 = add i32 %mul, 5
+  %add5.zext = zext i32 %add5 to i64
+  %sunkaddr0 = mul i64 %add5.zext, 4
+  %sunkaddr1 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @tmp_addr to i8*), i64 %sunkaddr0
+  %sunkaddr2 = getelementptr inbounds i8, i8* %sunkaddr1, i64 4096
+  %addr5.cast = bitcast i8* %sunkaddr2 to i32*
+; CHECK: %addr5.cast = bitcast i8* %sunkaddr2 to i32*
+; CHECK-NEXT: --> ([[C]] + ([[SIZE]] * (zext i32 ([[OFFSET]] + ([[STRIDE]] * (%tmp /u [[STRIDE]]))) to i64)) + @tmp_addr)
+
+  ret void
+}
Index: test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoadStoreVectorizer/X86/codegenprepare-produced-address-math.ll
@@ -0,0 +1,78 @@
+; RUN: opt -codegenprepare -load-store-vectorizer %s -S -o - | FileCheck %s
+; RUN: opt -load-store-vectorizer %s -S -o - | FileCheck %s
+
+target triple = "x86_64--"
+
+%union = type { { [4 x [4 x [4 x [16 x float]]]], [4 x [4 x [4 x [16 x float]]]], [10 x [10 x [4 x float]]] } }
+
+@global_pointer = external unnamed_addr global { %union, [2000 x i8] }, align 4
+
+; Function Attrs: convergent nounwind
+define void @test(i32 %base) #0 {
+; CHECK-LABEL: @test(
+; CHECK-NOT: load i32
+; CHECK: load <2 x i32>
+; CHECK-NOT: load i32
+entry:
+  %mul331 = and i32 %base, -4
+  %add350.4 = add i32 4, %mul331
+  %idx351.4 = zext i32 %add350.4 to i64
+  %arrayidx352.4 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.4
+  %tmp296.4 = bitcast float* %arrayidx352.4 to i32*
+  %add350.5 = add i32 5, %mul331
+  %idx351.5 = zext i32 %add350.5 to i64
+  %arrayidx352.5 = getelementptr inbounds { %union, [2000 x i8] }, { %union, [2000 x i8] }* @global_pointer, i64 0, i32 0, i32 0, i32 1, i64 0, i64 0, i64 0, i64 %idx351.5
+  %tmp296.5 = bitcast float* %arrayidx352.5 to i32*
+  %cnd = icmp ult i32 %base, 1000
+  br i1 %cnd, label %loads, label %exit
+
+loads:
+  ; If and only if the loads are in a different BB from the GEPs, codegenprepare
+  ; will try to turn the GEPs into math, which makes LoadStoreVectorizer's job
+  ; harder
+  %tmp297.4 = load i32, i32* %tmp296.4, align 4, !tbaa !0
+  %tmp297.5 = load i32, i32* %tmp296.5, align 4, !tbaa !0
+  br label %exit
+
+exit:
+  ret void
+}
+
+; Function Attrs: convergent nounwind
+define void @test.codegenprepared(i32 %base) #0 {
+; CHECK-LABEL: @test.codegenprepared(
+; CHECK-NOT: load i32
+; CHECK: load <2 x i32>
+; CHECK-NOT: load i32
+entry:
+  %mul331 = and i32 %base, -4
+  %add350.4 = add i32 4, %mul331
+  %idx351.4 = zext i32 %add350.4 to i64
+  %add350.5 = add i32 5, %mul331
+  %idx351.5 = zext i32 %add350.5 to i64
+  %cnd = icmp ult i32 %base, 1000
+  br i1 %cnd, label %loads, label %exit
+
+loads:                                            ; preds = %entry
+  %sunkaddr = mul i64 %idx351.4, 4
+  %sunkaddr1 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr
+  %sunkaddr2 = getelementptr inbounds i8, i8* %sunkaddr1, i64 4096
+  %0 = bitcast i8* %sunkaddr2 to i32*
+  %tmp297.4 = load i32, i32* %0, align 4, !tbaa !0
+  %sunkaddr3 = mul i64 %idx351.5, 4
+  %sunkaddr4 = getelementptr inbounds i8, i8* bitcast ({ %union, [2000 x i8] }* @global_pointer to i8*), i64 %sunkaddr3
+  %sunkaddr5 = getelementptr inbounds i8, i8* %sunkaddr4, i64 4096
+  %1 = bitcast i8* %sunkaddr5 to i32*
+  %tmp297.5 = load i32, i32* %1, align 4, !tbaa !0
+  br label %exit
+
+exit:                                             ; preds = %loads, %entry
+  ret void
+}
+
+attributes #0 = { convergent nounwind }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"float", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C++ TBAA"}
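
Note: for intuition about why peeling D off the constant is safe, the following standalone C++ sketch (not part of the patch; names are illustrative) exhaustively checks the i8 case from the ScalarEvolution.cpp comment: KnownBits proves the low two bits of (4 * X + 5) are 01, so the expression is at least 1 and splitting off D = 1 cannot underflow, which lets zext distribute over the add.

// Standalone sanity check, not part of the patch: verifies the rewrite
//   zext i8 (4*X + 5) to i16  ==  1 + (zext i8 (4*X + 4) to i16)
// for every i8 value of X. The low two bits of (4*X + 5) are always 01,
// so the value is >= 1 and subtracting D = 1 never unsigned-wraps.
#include <cstdint>
#include <cstdio>

int main() {
  for (unsigned X = 0; X < 256; ++X) {
    uint8_t Narrow = static_cast<uint8_t>(4 * X + 5); // the i8 expression
    uint16_t ZExt = Narrow;                           // zext ... to i16
    uint16_t Peeled =
        1 + static_cast<uint16_t>(static_cast<uint8_t>(4 * X + 4));
    if (ZExt != Peeled) {
      std::printf("mismatch at X = %u\n", X);
      return 1;
    }
  }
  std::printf("zext(4*X + 5) == 1 + zext(4*X + 4) for all i8 X\n");
  return 0;
}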
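The ConstantRange half of the table in the comment can be reproduced with LLVM's own APInt/ConstantRange classes; this minimal sketch (again, an illustration compiled separately from the patch, not code the patch adds) shows the wrapped set whose unsigned minimum collapses to 0, which is why getZeroExtendExprForValue also consults KnownBits when the IR Value is available.

// Illustration of the table above: the ConstantRange of i8 (4 * X) is
// [0, 253); adding the constant 5 yields the wrapped set [5, 2), whose
// unsigned min degrades to 0, so ConstantRange alone cannot justify
// peeling D = 1 off the constant.
#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include <cassert>

using namespace llvm;

int main() {
  ConstantRange Mul(APInt(8, 0), APInt(8, 253));            // i8 4 * X
  ConstantRange Add = Mul.add(ConstantRange(APInt(8, 5)));  // i8 4 * X + 5
  assert(Add.isWrappedSet());        // [5, 2), wraps past 255
  assert(Add.getUnsignedMin() == 0); // => Min: 0, Max: 255
  assert(Add.getUnsignedMax() == 255);
  return 0;
}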