Index: llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1585,7 +1585,7 @@
 ///  &A[10] - &A[0]: we should compile this to "10".  LHS/RHS are the pointer
 /// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
 Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS,
-                                               Type *Ty) {
+                                               Type *Ty, bool IsNUW) {
   // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize
   // this.
   bool Swapped = false;
@@ -1653,6 +1653,15 @@
   // Emit the offset of the GEP and an intptr_t.
   Value *Result = EmitGEPOffset(GEP1);
 
+  // If this is a single inbounds GEP and the original sub was nuw,
+  // then the final multiplication is also nuw. We match an extra add zero
+  // here, because that's what EmitGEPOffset() generates.
+  Instruction *I;
+  if (IsNUW && !GEP2 && !Swapped && GEP1->isInBounds() &&
+      match(Result, m_Add(m_Instruction(I), m_Zero())) &&
+      I->getOpcode() == Instruction::Mul)
+    I->setHasNoUnsignedWrap();
+
   // If we had a constant expression GEP on the other side offsetting the
   // pointer, subtract it from the offset we have.
   if (GEP2) {
@@ -1983,13 +1992,15 @@
   Value *LHSOp, *RHSOp;
   if (match(Op0, m_PtrToInt(m_Value(LHSOp))) &&
       match(Op1, m_PtrToInt(m_Value(RHSOp))))
-    if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
+    if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
+                                               I.hasNoUnsignedWrap()))
       return replaceInstUsesWith(I, Res);
 
   // trunc(p)-trunc(q) -> trunc(p-q)
   if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
       match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
-    if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType()))
+    if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
+                                               /* IsNUW */ false))
       return replaceInstUsesWith(I, Res);
 
   // Canonicalize a shifty way to code absolute value to the common pattern.
Index: llvm/lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -369,7 +369,8 @@
   Instruction *visitFNeg(UnaryOperator &I);
   Instruction *visitAdd(BinaryOperator &I);
   Instruction *visitFAdd(BinaryOperator &I);
-  Value *OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty);
+  Value *OptimizePointerDifference(
+      Value *LHS, Value *RHS, Type *Ty, bool isNUW);
   Instruction *visitSub(BinaryOperator &I);
   Instruction *visitFSub(BinaryOperator &I);
   Instruction *visitMul(BinaryOperator &I);
Index: llvm/test/Transforms/InstCombine/sub-gep.ll
===================================================================
--- llvm/test/Transforms/InstCombine/sub-gep.ll
+++ llvm/test/Transforms/InstCombine/sub-gep.ll
@@ -16,7 +16,7 @@
 
 define i64 @test_inbounds_nuw([0 x i32]* %base, i64 %idx) {
 ; CHECK-LABEL: @test_inbounds_nuw(
-; CHECK-NEXT:    [[P2_IDX:%.*]] = shl nsw i64 [[IDX:%.*]], 2
+; CHECK-NEXT:    [[P2_IDX:%.*]] = shl nuw nsw i64 [[IDX:%.*]], 2
 ; CHECK-NEXT:    ret i64 [[P2_IDX]]
 ;
   %p1 = getelementptr inbounds [0 x i32], [0 x i32]* %base, i64 0, i64 0
@@ -84,3 +84,18 @@
   %d = sub nuw i64 %i2, %i1
   ret i64 %d
 }
+
+define i64 @test_inbounds_nuw_multi_index([0 x [2 x i32]]* %base, i64 %idx, i64 %idx2) {
+; CHECK-LABEL: @test_inbounds_nuw_multi_index(
+; CHECK-NEXT:    [[P2_IDX:%.*]] = shl nsw i64 [[IDX:%.*]], 3
+; CHECK-NEXT:    [[P2_IDX1:%.*]] = shl nsw i64 [[IDX2:%.*]], 2
+; CHECK-NEXT:    [[P2_OFFS2:%.*]] = add i64 [[P2_IDX]], [[P2_IDX1]]
+; CHECK-NEXT:    ret i64 [[P2_OFFS2]]
+;
+  %p1 = getelementptr inbounds [0 x [2 x i32]], [0 x [2 x i32]]* %base, i64 0, i64 0, i64 0
+  %p2 = getelementptr inbounds [0 x [2 x i32]], [0 x [2 x i32]]* %base, i64 0, i64 %idx, i64 %idx2
+  %i1 = ptrtoint i32* %p1 to i64
+  %i2 = ptrtoint i32* %p2 to i64
+  %d = sub nuw i64 %i2, %i1
+  ret i64 %d
+}