Index: include/llvm/IR/Operator.h
===================================================================
--- include/llvm/IR/Operator.h
+++ include/llvm/IR/Operator.h
@@ -472,6 +472,12 @@
     return true;
   }
 
+  unsigned countNonConstantIndices() const {
+    return std::count_if(idx_begin(), idx_end(), [](const Use& use) {
+        return !isa<ConstantInt>(*use);
+      });
+  }
+
   /// \brief Accumulate the constant address offset of this GEP if possible.
   ///
   /// This routine accepts an APInt into which it will accumulate the constant
Index: lib/Transforms/InstCombine/InstCombineAddSub.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1465,12 +1465,32 @@
     }
   }
 
-  // Avoid duplicating the arithmetic if GEP2 has non-constant indices and
-  // multiple users.
-  if (!GEP1 ||
-      (GEP2 && !GEP2->hasAllConstantIndices() && !GEP2->hasOneUse()))
+  if (!GEP1)
+    // No GEP found.
     return nullptr;
 
+  if (GEP2) {
+    // (gep X, ...) - (gep X, ...)
+    //
+    // Avoid duplicating the arithmetic if there are more than one non-constant
+    // indices between the two GEPs and either GEP has a non-constant index and
+    // multiple users. If zero non-constant index, the result is a constant and
+    // there is no duplication. If one non-constant index, the result is an add
+    // or sub with a constant, which is no larger than the original code, and
+    // there's no duplicated arithmetic, even if either GEP has multiple
+    // users. If more than one non-constant indices combined, as long as the GEP
+    // with at least one non-constant index doesn't have multiple users, there
+    // is no duplication.
+    unsigned NumNonConstantIndices1 = GEP1->countNonConstantIndices();
+    unsigned NumNonConstantIndices2 = GEP2->countNonConstantIndices();
+    if (NumNonConstantIndices1 + NumNonConstantIndices2 > 1 &&
+        ((NumNonConstantIndices1 > 0 && !GEP1->hasOneUse()) ||
+         (NumNonConstantIndices2 > 0 && !GEP2->hasOneUse()))) {
+      DEBUG(dbgs() << "Rejected because ...\n");
+      return nullptr;
+    }
+  }
+
   // Emit the offset of the GEP and an intptr_t.
   Value *Result = EmitGEPOffset(GEP1);
 
Index: test/Transforms/InstCombine/sub.ll
===================================================================
--- test/Transforms/InstCombine/sub.ll
+++ test/Transforms/InstCombine/sub.ll
@@ -989,3 +989,84 @@
   %X = add i32 %B, %A
   %Y = sub i32 %A, %X
   ret i32 %Y                                                                                                                                                                                                                                             }
+
+@dummy_global1 = external global i8*
+@dummy_global2 = external global i8*
+
+define i64 @test58([100 x [100 x i8]]* %foo, i64 %i, i64 %j) {
+; CHECK-LABEL: @test58(
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[J:%.*]], 4200
+; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[I:%.*]], 4200
+; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 [[TMP2:%.*]] [[TMP1:%.*]]
+; CHECK-NEXT:    ret i64 [[TMP3]]
+;
+; Note the reassociate pass and another instcombine pass will further optimize this to
+; "%sub = i64 %i, %j, ret i64 %sub"
+;
+; gep1 and gep2 have only one use
+  %gep1 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 %i
+  %gep2 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 %j
+  %cast1 = ptrtoint i8* %gep1 to i64
+  %cast2 = ptrtoint i8* %gep2 to i64
+  %sub = sub i64 %cast1, %cast2
+  ret i64 %sub
+}
+
+define i64 @test59([100 x [100 x i8]]* %foo, i64 %i) {
+; CHECK-LABEL: @test59(
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 %i
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 0
+; CHECK-NEXT:    store i8* [[GEP1]], i8** @dummy_global1, align 8
+; CHECK-NEXT:    store i8* [[GEP2]], i8** @dummy_global2, align 8
+; CHECK-NEXT:    ret i64 %i
+;
+; gep1 and gep2 have more than one uses
+  %gep1 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 %i
+  %gep2 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 0
+  %cast1 = ptrtoint i8* %gep1 to i64
+  %cast2 = ptrtoint i8* %gep2 to i64
+  %sub = sub i64 %cast1, %cast2
+  store i8* %gep1, i8** @dummy_global1
+  store i8* %gep2, i8** @dummy_global2
+  ret i64 %sub
+}
+
+define i64 @test60([100 x [100 x i8]]* %foo, i64 %i, i64 %j) {
+; CHECK-LABEL: @test60(
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 %j, i64 %i
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 0
+; CHECK-NEXT:    [[CAST1:%.*]] = ptrtoint i8* [[GEP1]] to i64
+; CHECK-NEXT:    [[CAST2:%.*]] = ptrtoint i8* [[GEP2]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[CAST1]], [[CAST2]]
+; CHECK-NEXT:    store i8* [[GEP1]], i8** @dummy_global1, align 8
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+; gep1 has a non-constant index and more than one uses. Shouldn't duplicate the arithmetic.
+  %gep1 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 %j, i64 %i
+  %gep2 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 0
+  %cast1 = ptrtoint i8* %gep1 to i64
+  %cast2 = ptrtoint i8* %gep2 to i64
+  %sub = sub i64 %cast1, %cast2
+  store i8* %gep1, i8** @dummy_global1
+  ret i64 %sub
+}
+
+define i64 @test61([100 x [100 x i8]]* %foo, i64 %i, i64 %j) {
+; CHECK-LABEL: @test61(
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 0
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 %j, i64 %i
+; CHECK-NEXT:    [[CAST1:%.*]] = ptrtoint i8* [[GEP1]] to i64
+; CHECK-NEXT:    [[CAST2:%.*]] = ptrtoint i8* [[GEP2]] to i64
+; CHECK-NEXT:    [[SUB:%.*]] = sub i64 [[CAST1]], [[CAST2]]
+; CHECK-NEXT:    store i8* [[GEP2]], i8** @dummy_global2, align 8
+; CHECK-NEXT:    ret i64 [[SUB]]
+;
+; gep2 has a non-constant index and more than one uses. Shouldn't duplicate the arithmetic.
+  %gep1 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 42, i64 0
+  %gep2 = getelementptr inbounds [100 x [100 x i8]], [100 x [100 x i8]]* %foo, i64 0, i64 %j, i64 %i
+  %cast1 = ptrtoint i8* %gep1 to i64
+  %cast2 = ptrtoint i8* %gep2 to i64
+  %sub = sub i64 %cast1, %cast2
+  store i8* %gep2, i8** @dummy_global2
+  ret i64 %sub
+}