Index: llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -730,8 +730,8 @@
   Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
 
   bool widenLoopCompare(NarrowIVDefUse DU);
-  bool widenWithVariantLoadUse(NarrowIVDefUse DU);
-  void widenWithVariantLoadUseCodegen(NarrowIVDefUse DU);
+  bool widenWithVariantUse(NarrowIVDefUse DU);
+  void widenWithVariantUseCodegen(NarrowIVDefUse DU);
 
   void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
 };
@@ -1069,20 +1069,27 @@
   return true;
 }
 
-/// If the narrow use is an instruction whose two operands are the defining
-/// instruction of DU and a load instruction, then we have the following:
-/// if the load is hoisted outside the loop, then we do not reach this function
-/// as scalar evolution analysis works fine in widenIVUse with variables
-/// hoisted outside the loop and efficient code is subsequently generated by
-/// not emitting truncate instructions. But when the load is not hoisted
-/// (whether due to limitation in alias analysis or due to a true legality),
-/// then scalar evolution can not proceed with loop variant values and
-/// inefficient code is generated. This function handles the non-hoisted load
-/// special case by making the optimization generate the same type of code for
-/// hoisted and non-hoisted load (widen use and eliminate sign extend
-/// instruction). This special case is important especially when the induction
-/// variables are affecting addressing mode in code generation.
-bool WidenIV::widenWithVariantLoadUse(NarrowIVDefUse DU) {
+// widenIVUse avoids generating a trunc by evaluating the use as an AddRec.
+// This does not work when:
+//    1) SCEV traces back to an instruction inside the loop that SCEV cannot
+// expand, e.g. add %indvar, (load %addr)
+//    2) SCEV finds a loop variant, e.g. add %indvar, %loopvariant
+// While SCEV fails to avoid the trunc, we can still try an instruction
+// combining approach to prove the trunc is not required. This can be further
+// extended with other instruction combining checks, but for now we handle the
+// following case (sub can be "add" and "mul", "nsw + sext" can be "nuw + zext")
+//
+// Src:
+//   %c = sub nsw %b, %indvar
+//   %d = sext %c to i64
+// Dst:
+//   %indvar.ext1 = sext %indvar to i64
+//   %m = sext %b to i64
+//   %d = sub nsw i64 %m, %indvar.ext1
+// Therefore, as long as the result of add/sub/mul is extended to the wide type,
+// no trunc is required regardless of how %b is generated. This pattern is
+// common when calculating addresses on 64-bit architectures.
+bool WidenIV::widenWithVariantUse(NarrowIVDefUse DU) {
   Instruction *NarrowUse = DU.NarrowUse;
   Instruction *NarrowDef = DU.NarrowDef;
   Instruction *WideDef = DU.WideDef;
@@ -1113,12 +1120,6 @@
   else
     return false;
 
-  // We are interested in the other operand being a load instruction.
-  // But, we should look into relaxing this restriction later on.
-  auto *I = dyn_cast<Instruction>(NarrowUse->getOperand(ExtendOperIdx));
-  if (I && I->getOpcode() != Instruction::Load)
-    return false;
-
   // Verifying that Defining operand is an AddRec
   const SCEV *Op1 = SE->getSCEV(WideDef);
   const SCEVAddRecExpr *AddRecOp1 = dyn_cast<SCEVAddRecExpr>(Op1);
@@ -1150,9 +1151,9 @@
   return true;
 }
 
-/// Special Case for widening with variant Loads (see
-/// WidenIV::widenWithVariantLoadUse). This is the code generation part.
-void WidenIV::widenWithVariantLoadUseCodegen(NarrowIVDefUse DU) {
+/// Special case for widening with a loop-variant operand (see
+/// WidenIV::widenWithVariantUse). This is the code generation part.
+void WidenIV::widenWithVariantUseCodegen(NarrowIVDefUse DU) {
   Instruction *NarrowUse = DU.NarrowUse;
   Instruction *NarrowDef = DU.NarrowDef;
   Instruction *WideDef = DU.WideDef;
@@ -1300,8 +1301,8 @@
     // in WideAddRec.first does not indicate a polynomial induction expression.
     // In that case, look at the operands of the use instruction to determine
     // if we can still widen the use instead of truncating its operand.
-    if (widenWithVariantLoadUse(DU)) {
-      widenWithVariantLoadUseCodegen(DU);
+    if (widenWithVariantUse(DU)) {
+      widenWithVariantUseCodegen(DU);
       return nullptr;
     }
 
Index: llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
===================================================================
--- llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
+++ llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
@@ -419,3 +419,52 @@
   %cmp = icmp slt i32 %add, %length
   br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
 }
+
+define i32 @foo6(%struct.image* %input, i32 %length, i32* %in) {
+entry:
+  %stride = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 1
+  %0 = load i32, i32* %stride, align 4
+  %cmp17 = icmp sgt i32 %length, 1
+  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %channel = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 0
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %1 = phi i32 [ %6, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %2 = phi i32 [ 0, %entry ], [ %1, %for.cond.cleanup.loopexit ]
+  ret i32 %2
+
+; Extend foo4 so that values defined inside the loop (%3 and %or), when combined with the
+; induction variable by mul/sub/add and then extended, do not need a trunc instruction.
+; CHECK: for.body:
+; CHECK-NOT: trunc
+; CHECK:      [[TMP0:%.*]] = and i32 %length, %0
+; CHECK-NEXT: zext i32 [[TMP0]] to i64
+; CHECK:      [[TMP1:%.*]] = or i32 %length, [[TMP2:%.*]]
+; CHECK-NEXT: zext i32 [[TMP1]] to i64
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %x.018 = phi i32 [ 1, %for.body.lr.ph ], [ %add, %for.body ]
+  %add = add nuw nsw i32 %x.018, 1
+  %3 = and i32 %length, %0
+  %mul = mul nuw i32 %3, %add
+  %idx.ext = zext i32 %mul to i64
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %idx.ext
+  %4 = load i32, i32* %add.ptr, align 4
+  %mul1 = mul nuw i32 %0, %add
+  %idx.ext1 = zext i32 %mul1 to i64
+  %add.ptr1 = getelementptr inbounds i32, i32* %in, i64 %idx.ext1
+  %5 = load i32, i32* %add.ptr1, align 4
+  %or = or i32 %length, %5
+  %sub.or = sub nuw i32 %or, %add
+  %or.ext = zext i32 %sub.or to i64
+  %ptr.or = getelementptr inbounds i32, i32* %in, i64 %or.ext
+  %val.or = load i32, i32* %ptr.or
+  %6 = add i32 %4, %val.or
+  %cmp = icmp ult i32 %add, %length
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}