Index: llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -1024,6 +1024,8 @@
   Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
 
   bool widenLoopCompare(NarrowIVDefUse DU);
+  const SCEV *getExtendExpr(NarrowIVDefUse DU);
+  bool widenVariantLoadUse(NarrowIVDefUse DU);
 
   void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
 };
@@ -1220,37 +1222,58 @@
   llvm_unreachable("Unsupported opcode.");
 }
 
+/// Get the SCEV extend expression for the other operand of the use
+/// instruction of DU, i.e. the operand that is not defined by the narrow
+/// definition in DU. Returns null if that operand cannot be extended
+/// without changing the instruction's semantics.
+const SCEV *WidenIV::getExtendExpr(NarrowIVDefUse DU) {
+  Instruction *NarrowUse = DU.NarrowUse;
+  Instruction *NarrowDef = DU.NarrowDef;
+
+  // Handle the common case of add<nsw/nuw>.
+  const unsigned OpCode = NarrowUse->getOpcode();
+  // Only Add/Sub/Mul instructions are supported.
+  if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
+      OpCode != Instruction::Mul)
+    return nullptr;
+
+  // Check whether the other operand is the first or the second operand.
+  unsigned ExtendOperIdx = NarrowUse->getOperand(0) == NarrowDef ? 1 : 0;
+  assert(NarrowUse->getOperand(1 - ExtendOperIdx) == NarrowDef && "bad DU");
+
+  // The extension is only valid if the matching no-wrap flag is present.
+  const SCEV *ExtendOperExpr = nullptr;
+  const OverflowingBinaryOperator *OBO =
+      cast<OverflowingBinaryOperator>(NarrowUse);
+  ExtendKind ExtKind = getExtendKind(NarrowDef);
+  if (ExtKind == SignExtended && OBO->hasNoSignedWrap())
+    ExtendOperExpr = SE->getSignExtendExpr(
+        SE->getSCEV(NarrowUse->getOperand(ExtendOperIdx)), WideType);
+  else if (ExtKind == ZeroExtended && OBO->hasNoUnsignedWrap())
+    ExtendOperExpr = SE->getZeroExtendExpr(
+        SE->getSCEV(NarrowUse->getOperand(ExtendOperIdx)), WideType);
+  else
+    return nullptr;
+
+  return ExtendOperExpr;
+}
+
 /// No-wrap operations can transfer sign extension of their result to their
 /// operands. Generate the SCEV value for the widened operation without
 /// actually modifying the IR yet. If the expression after extending the
 /// operands is an AddRec for this loop, return the AddRec and the kind of
 /// extension used.
 WidenIV::WidenedRecTy WidenIV::getExtendedOperandRecurrence(NarrowIVDefUse DU) {
-  // Handle the common case of add<nsw/nuw>
-  const unsigned OpCode = DU.NarrowUse->getOpcode();
-  // Only Add/Sub/Mul instructions supported yet.
-  if (OpCode != Instruction::Add && OpCode != Instruction::Sub &&
-      OpCode != Instruction::Mul)
+  const SCEV *ExtendOperExpr = getExtendExpr(DU);
+  if (!ExtendOperExpr)
     return {nullptr, Unknown};
 
+  const unsigned OpCode = DU.NarrowUse->getOpcode();
+
   // One operand (NarrowDef) has already been extended to WideDef. Now determine
   // if extending the other will lead to a recurrence.
   const unsigned ExtendOperIdx = DU.NarrowUse->getOperand(0) == DU.NarrowDef ? 1 : 0;
   assert(DU.NarrowUse->getOperand(1-ExtendOperIdx) == DU.NarrowDef && "bad DU");
 
-  const SCEV *ExtendOperExpr = nullptr;
-  const OverflowingBinaryOperator *OBO =
-    cast<OverflowingBinaryOperator>(DU.NarrowUse);
   ExtendKind ExtKind = getExtendKind(DU.NarrowDef);
-  if (ExtKind == SignExtended && OBO->hasNoSignedWrap())
-    ExtendOperExpr = SE->getSignExtendExpr(
-      SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
-  else if(ExtKind == ZeroExtended && OBO->hasNoUnsignedWrap())
-    ExtendOperExpr = SE->getZeroExtendExpr(
-      SE->getSCEV(DU.NarrowUse->getOperand(ExtendOperIdx)), WideType);
-  else
-    return {nullptr, Unknown};
 
   // When creating this SCEV expr, don't apply the current operations NSW or NUW
   // flags. This instruction may be guarded by control flow that the no-wrap
@@ -1368,6 +1391,112 @@
   return true;
 }
 
+/// If the narrow use is an instruction whose two operands are the defining
+/// instruction of DU and a load instruction, then we have the following:
+/// if the load is hoisted outside the loop, we never reach this function, as
+/// scalar evolution analysis works fine in widenIVUse with values hoisted
+/// outside the loop, and efficient code is subsequently generated without
+/// emitting truncate instructions. But when the load is not hoisted (whether
+/// due to a limitation in alias analysis or otherwise), scalar evolution
+/// cannot proceed with loop-variant values, and inefficient code is
+/// generated. This function handles the non-hoisted load special case by
+/// making the optimization generate the same kind of code for hoisted and
+/// non-hoisted loads (widen the use and eliminate the sign/zero extend
+/// instruction). This special case matters especially when the induction
+/// variables affect the addressing mode in code generation.
+bool WidenIV::widenVariantLoadUse(NarrowIVDefUse DU) {
+  const SCEV *ExtendOperExpr = getExtendExpr(DU);
+  if (!ExtendOperExpr)
+    return false;
+
+  Instruction *NarrowUse = DU.NarrowUse;
+  Instruction *NarrowDef = DU.NarrowDef;
+  Instruction *WideDef = DU.WideDef;
+
+  // The operand that is not defined by NarrowDef of DU. Let's call it the
+  // other operand.
+  unsigned ExtendOperIdx = NarrowUse->getOperand(0) == NarrowDef ? 1 : 0;
+
+  // We are interested in the other operand being a load instruction.
+  // But, we should look into relaxing this restriction later on.
+  auto *I = dyn_cast<Instruction>(NarrowUse->getOperand(ExtendOperIdx));
+  if (I && I->getOpcode() != Instruction::Load)
+    return false;
+
+  // Verify that the defining operand is an AddRec for this loop.
+  const SCEV *Op1 = SE->getSCEV(WideDef);
+  const SCEVAddRecExpr *AddRecOp1 = dyn_cast<SCEVAddRecExpr>(Op1);
+  if (!AddRecOp1 || AddRecOp1->getLoop() != L)
+    return false;
+
+  // Verify that the other operand is an extend of the matching kind.
+  ExtendKind ExtKind = getExtendKind(NarrowDef);
+  if (ExtKind == SignExtended) {
+    if (!isa<SCEVSignExtendExpr>(ExtendOperExpr))
+      return false;
+  } else {
+    if (!isa<SCEVZeroExtendExpr>(ExtendOperExpr))
+      return false;
+  }
+
+  LLVM_DEBUG(dbgs() << "Cloning arithmetic IVUser: " << *NarrowUse << "\n");
+
+  // Generate the widened use instruction: reuse WideDef for the IV operand
+  // and extend the other operand in place.
+  Value *LHS = (NarrowUse->getOperand(0) == NarrowDef)
+                   ? WideDef
+                   : createExtendInst(NarrowUse->getOperand(0), WideType,
+                                      ExtKind, NarrowUse);
+  Value *RHS = (NarrowUse->getOperand(1) == NarrowDef)
+                   ? WideDef
+                   : createExtendInst(NarrowUse->getOperand(1), WideType,
+                                      ExtKind, NarrowUse);
+
+  auto *NarrowBO = cast<BinaryOperator>(NarrowUse);
+  auto *WideBO = BinaryOperator::Create(NarrowBO->getOpcode(), LHS, RHS,
+                                        NarrowBO->getName());
+  IRBuilder<> Builder(NarrowUse);
+  Builder.Insert(WideBO);
+  WideBO->copyIRFlags(NarrowBO);
+
+  // getExtendExpr guarantees ExtKind is SignExtended or ZeroExtended here.
+  ExtendKindMap[NarrowUse] = ExtKind;
+
+  // Check if widening the use eliminates sign/zero extend instructions. The
+  // widening is only useful if every user of NarrowUse is such an extend.
+  bool WideningUseful = false;
+  for (Use &U : NarrowUse->uses()) {
+    auto *User = cast<Instruction>(U.getUser());
+
+    if (((isa<SExtInst>(User) && (getExtendKind(NarrowUse) == SignExtended)) ||
+         (isa<ZExtInst>(User) && (getExtendKind(NarrowUse) == ZeroExtended))) &&
+        (User->getType() == WideType)) {
+      LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *User << " replaced by "
+                        << *WideBO << "\n");
+      ++NumElimExt;
+      User->replaceAllUsesWith(WideBO);
+      DeadInsts.emplace_back(User);
+      WideningUseful = true;
+    } else {
+      WideningUseful = false;
+      break;
+    }
+  }
+
+  // In case the widened instruction is not consumed by a sign/zero extend
+  // instruction and is hence not useful, throw it away.
+  if (!WideningUseful) {
+    LLVM_DEBUG(dbgs() << "Widening use not useful: " << *WideBO << "\n");
+    DeadInsts.emplace_back(WideBO);
+  }
+  return WideningUseful;
+}
+
 /// Determine whether an individual user of the narrow IV can be widened. If so,
 /// return the wide clone of the user.
 Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
@@ -1465,6 +1594,14 @@
   if (widenLoopCompare(DU))
     return nullptr;
 
+  // We are about to generate a truncate instruction here because the scalar
+  // evolution expression computed earlier in WideAddRec.first does not
+  // indicate a polynomial induction expression. In that case, look at the
+  // operands of the use instruction to determine whether we can still widen
+  // the use instead of truncating its operand.
+  if (widenVariantLoadUse(DU))
+    return nullptr;
+
   // This user does not evaluate to a recurrence after widening, so don't
   // follow it. Instead insert a Trunc to kill off the original use,
   // eventually isolating the original narrow IV so it can be removed.
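
To make the intent of widenVariantLoadUse concrete, here is a minimal
before/after sketch of the IR shape it targets, assuming an i32 IV widened to
i64 with a sign-extended kind; the value names (%iv, %v, %p, %in) are
hypothetical and only illustrate the pattern:

  ; Before: the product of a loop-variant load and the narrow IV is
  ; sign-extended on every iteration to form an address.
  %iv.next = add nuw nsw i32 %iv, 1
  %v = load i32, i32* %p, align 4
  %mul = mul nsw i32 %v, %iv.next
  %idx = sext i32 %mul to i64
  %addr = getelementptr inbounds i32, i32* %in, i64 %idx

  ; After: the binary operator is cloned at the wide type, the load result is
  ; extended once as an operand, and the sext of the product is eliminated.
  %iv.next = add nuw nsw i64 %iv, 1
  %v = load i32, i32* %p, align 4
  %v.ext = sext i32 %v to i64
  %mul = mul nsw i64 %v.ext, %iv.next
  %addr = getelementptr inbounds i32, i32* %in, i64 %mul

This is the shape exercised by the new tests below.
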
Index: llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
===================================================================
--- llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
+++ llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll
@@ -273,3 +273,87 @@
   %call = call i32 @dummy(i32* getelementptr inbounds ([100 x i32], [100 x i32]* @a, i32 0, i32 0), i32* getelementptr inbounds ([100 x i32], [100 x i32]* @b, i32 0, i32 0))
   ret i32 0
 }
+
+%struct.image = type { i32, i32 }
+
+define i32 @foo4(%struct.image* %input, i32 %length, i32* %in) {
+entry:
+  %stride = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 1
+  %0 = load i32, i32* %stride, align 4
+  %cmp17 = icmp sgt i32 %length, 1
+  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %channel = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 0
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %1 = phi i32 [ %6, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %2 = phi i32 [ 0, %entry ], [ %1, %for.cond.cleanup.loopexit ]
+  ret i32 %2
+
+; Both mul instructions below are widened instead of generating truncate
+; instructions for them, regardless of whether the load operand of the mul is
+; inside or outside the loop (both cases are present here).
+; CHECK: for.body:
+; CHECK-NOT: trunc
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %x.018 = phi i32 [ 1, %for.body.lr.ph ], [ %add, %for.body ]
+  %add = add nuw nsw i32 %x.018, 1
+  %3 = load i32, i32* %channel, align 8
+  %mul = mul nsw i32 %3, %add
+  %idx.ext = sext i32 %mul to i64
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %idx.ext
+  %4 = load i32, i32* %add.ptr, align 4
+  %mul1 = mul nsw i32 %0, %add
+  %idx.ext1 = sext i32 %mul1 to i64
+  %add.ptr1 = getelementptr inbounds i32, i32* %in, i64 %idx.ext1
+  %5 = load i32, i32* %add.ptr1, align 4
+  %6 = add i32 %4, %5
+  %cmp = icmp slt i32 %add, %length
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+define i32 @foo5(%struct.image* %input, i32 %length, i32* %in) {
+entry:
+  %stride = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 1
+  %0 = load i32, i32* %stride, align 4
+  %cmp17 = icmp sgt i32 %length, 1
+  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %channel = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 0
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  %1 = phi i32 [ %7, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %2 = phi i32 [ 0, %entry ], [ %1, %for.cond.cleanup.loopexit ]
+  ret i32 %2
+
+; This example is the same as above except that the first mul is used in two
+; places, which may result in having two versions of the multiply: an i32 and
+; an i64 version. In this case, keep the truncate instructions to avoid this
+; redundancy.
+; CHECK: for.body:
+; CHECK: trunc
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %x.018 = phi i32 [ 1, %for.body.lr.ph ], [ %add, %for.body ]
+  %add = add nuw nsw i32 %x.018, 1
+  %3 = load i32, i32* %channel, align 8
+  %mul = mul nsw i32 %3, %add
+  %idx.ext = sext i32 %mul to i64
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %idx.ext
+  %4 = load i32, i32* %add.ptr, align 4
+  %mul1 = mul nsw i32 %0, %add
+  %idx.ext1 = sext i32 %mul1 to i64
+  %add.ptr1 = getelementptr inbounds i32, i32* %in, i64 %idx.ext1
+  %5 = load i32, i32* %add.ptr1, align 4
+  %6 = add i32 %4, %5
+  %7 = add i32 %6, %mul
+  %cmp = icmp slt i32 %add, %length
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}
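
For reference, the updated test can be exercised on its own, assuming the
existing header of iv-widen-elim-ext.ll follows the usual RUN-line pattern
for this file:

  opt < llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll -indvars -S \
    | FileCheck llvm/test/Transforms/IndVarSimplify/iv-widen-elim-ext.ll

In @foo4 every user of the widened mul is a matching sext, so
widenVariantLoadUse succeeds and no trunc remains in the loop body. In @foo5
the extra i32 use of %mul (feeding %7) makes the widening unprofitable, so the
cloned wide mul is discarded and the trunc is kept.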