Index: include/llvm/Analysis/ScalarEvolutionExpressions.h
===================================================================
--- include/llvm/Analysis/ScalarEvolutionExpressions.h
+++ include/llvm/Analysis/ScalarEvolutionExpressions.h
@@ -345,6 +345,12 @@
     /// iteration number.
     const SCEV *evaluateAtIteration(const SCEV *It, ScalarEvolution &SE) const;

+    /// Return the minimum bitwidth of the iteration expr to compute this
+    /// AddRec at the given iteration without overflow. For affine AddRecs it's
+    /// the same as the AddRec type width; for non-affine AddRecs it might be
+    /// wider than the AddRec type width.
+    unsigned minIterationWidthForEvaluateAtIteration(ScalarEvolution &SE) const;
+
     /// Return the number of iterations of this loop that produce
     /// values in the specified constant range. Another way of
     /// looking at this is that it returns the first iteration number
Index: lib/Analysis/ScalarEvolution.cpp
===================================================================
--- lib/Analysis/ScalarEvolution.cpp
+++ lib/Analysis/ScalarEvolution.cpp
@@ -1108,6 +1108,16 @@
 //                      Simple SCEV method implementations
 //===----------------------------------------------------------------------===//

+static unsigned MinIterationWidthForBinomialCoefficient(unsigned K,
+                                                        Type *ResultTy,
+                                                        ScalarEvolution &SE) {
+  unsigned W = SE.getTypeSizeInBits(ResultTy);
+  unsigned T = 1;
+  for (unsigned i = 3; i <= K; ++i)
+    T += APInt(W, i).countTrailingZeros();
+  return W + T;
+}
+
 /// Compute BC(It, K). The result has width W. Assume, K > 0.
 static const SCEV *BinomialCoefficient(const SCEV *It, unsigned K,
                                        ScalarEvolution &SE,
@@ -1188,6 +1198,9 @@

   // We need at least W + T bits for the multiplication step
   unsigned CalculationBits = W + T;
+  assert(MinIterationWidthForBinomialCoefficient(K, ResultTy, SE) ==
+             CalculationBits &&
+         "must be the same!");

   // Calculate 2^T, at width T+W.
   APInt DivFactor = APInt::getOneBitSet(CalculationBits, T);
@@ -1243,6 +1256,14 @@
   return Result;
 }

+unsigned SCEVAddRecExpr::minIterationWidthForEvaluateAtIteration(
+    ScalarEvolution &SE) const {
+  if (isAffine())
+    return SE.getTypeSizeInBits(getType());
+  return MinIterationWidthForBinomialCoefficient(getNumOperands() - 1,
+                                                 getType(), SE);
+}
+
 //===----------------------------------------------------------------------===//
 //                 SCEV Expression folder implementations
 //===----------------------------------------------------------------------===//
Index: lib/Analysis/ScalarEvolutionExpander.cpp
===================================================================
--- lib/Analysis/ScalarEvolutionExpander.cpp
+++ lib/Analysis/ScalarEvolutionExpander.cpp
@@ -1476,30 +1476,15 @@
   Type *Ty = SE.getEffectiveSCEVType(S->getType());
   const Loop *L = S->getLoop();
+
+  unsigned MinCanonicalIVWidth = S->minIterationWidthForEvaluateAtIteration(SE);

   // First check for an existing canonical IV in a suitable type.
   PHINode *CanonicalIV = nullptr;
   if (PHINode *PN = L->getCanonicalInductionVariable())
-    if (SE.getTypeSizeInBits(PN->getType()) >= SE.getTypeSizeInBits(Ty))
+    if (SE.getTypeSizeInBits(PN->getType()) >= MinCanonicalIVWidth)
       CanonicalIV = PN;

-  // Rewrite an AddRec in terms of the canonical induction variable, if
-  // its type is more narrow.
-  if (CanonicalIV &&
-      SE.getTypeSizeInBits(CanonicalIV->getType()) >
-      SE.getTypeSizeInBits(Ty)) {
-    SmallVector<const SCEV *, 4> NewOps(S->getNumOperands());
-    for (unsigned i = 0, e = S->getNumOperands(); i != e; ++i)
-      NewOps[i] = SE.getAnyExtendExpr(S->op_begin()[i], CanonicalIV->getType());
-    Value *V = expand(SE.getAddRecExpr(NewOps, S->getLoop(),
-                                       S->getNoWrapFlags(SCEV::FlagNW)));
-    BasicBlock::iterator NewInsertPt =
-        findInsertPointAfter(cast<Instruction>(V), Builder.GetInsertBlock());
-    V = expandCodeFor(SE.getTruncateExpr(SE.getUnknown(V), Ty), nullptr,
-                      &*NewInsertPt);
-    return V;
-  }
-
   // {X,+,F} --> X + {0,+,F}
   if (!S->getStart()->isZero()) {
     SmallVector<const SCEV *, 4> NewOps(S->op_begin(), S->op_end());
@@ -1540,12 +1525,14 @@
     // specified loop.
     BasicBlock *Header = L->getHeader();
     pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header);
-    CanonicalIV = PHINode::Create(Ty, std::distance(HPB, HPE), "indvar",
-                                  &Header->front());
+    auto *CanonicalIVType =
+        IntegerType::get(SE.getContext(), MinCanonicalIVWidth);
+    CanonicalIV = PHINode::Create(CanonicalIVType, std::distance(HPB, HPE),
+                                  "indvar", &Header->front());
     rememberInstruction(CanonicalIV);

     SmallSet<BasicBlock *, 4> PredSeen;
-    Constant *One = ConstantInt::get(Ty, 1);
+    Constant *One = ConstantInt::get(CanonicalIVType, 1);
     for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
       BasicBlock *HP = *HPI;
       if (!PredSeen.insert(HP).second) {
@@ -1565,17 +1552,20 @@
         rememberInstruction(Add);
         CanonicalIV->addIncoming(Add, HP);
       } else {
-        CanonicalIV->addIncoming(Constant::getNullValue(Ty), HP);
+        CanonicalIV->addIncoming(Constant::getNullValue(CanonicalIVType), HP);
       }
     }
   }

-  // {0,+,1} --> Insert a canonical induction variable into the loop!
+  // {0,+,1} --> Return the canonical induction variable
   if (S->isAffine() && S->getOperand(1)->isOne()) {
-    assert(Ty == SE.getEffectiveSCEVType(CanonicalIV->getType()) &&
-           "IVs with types different from the canonical IV should "
-           "already have been handled!");
-    return CanonicalIV;
+    if (Ty == SE.getEffectiveSCEVType(CanonicalIV->getType()))
+      return CanonicalIV;
+
+    assert(SE.getTypeSizeInBits(CanonicalIV->getType()) >=
+               SE.getTypeSizeInBits(Ty) &&
+           "Canonical IV can't be narrower than the expr!");
+    return expand(SE.getTruncateExpr(SE.getUnknown(CanonicalIV), Ty));
   }

   // {0,+,F} --> {0,+,1} * F
@@ -1595,13 +1585,7 @@
   // into this folder.
   const SCEV *IH = SE.getUnknown(CanonicalIV);   // Get I as a "symbolic" SCEV.

-  // Promote S up to the canonical IV type, if the cast is foldable.
-  const SCEV *NewS = S;
-  const SCEV *Ext = SE.getNoopOrAnyExtend(S, CanonicalIV->getType());
-  if (isa<SCEVAddRecExpr>(Ext))
-    NewS = Ext;
-
-  const SCEV *V = cast<SCEVAddRecExpr>(NewS)->evaluateAtIteration(IH, SE);
+  const SCEV *V = cast<SCEVAddRecExpr>(S)->evaluateAtIteration(IH, SE);
   //cerr << "Evaluated: " << *this << "\n     to: " << *V << "\n";

   // Truncate the result down to the original type, if needed.
Index: test/Analysis/ScalarEvolution/scev-expand-canonical-iv-type.ll
===================================================================
--- /dev/null
+++ test/Analysis/ScalarEvolution/scev-expand-canonical-iv-type.ll
@@ -0,0 +1,56 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test verifies that the SCEVExpander emits a canonical IV of a proper type
+; for non-linear addrec expansion. In this test case loop-vectorize expands an
+; i8 {0,+,2,+,1} addrec in the outer loop. The expression to compute the addrec
+; at a given iteration involves division by 2 (via shr). In order to compute
+; this expression without overflow, a canonical IV of the wider type i9 should be
+; used.
+;
+; See also:
+; SCEVAddRecExpr::minIterationWidthForEvaluateAtIteration,
+; ScalarEvolutionsTest::SCEVExpandInsertCanonicalIV.
+
+define i32 @test(i8* %p) {
+; CHECK-LABEL: @test
+bb:
+  br label %outer_loop
+
+outer_loop:
+; Check that the canonical IV inserted by the expander is of i9 type
+; CHECK: outer_loop:
+; CHECK: %indvar = phi i9 [ %indvar.next, %outer_loop_cont ], [ 0, %bb ]
+  %tmp4 = phi i32 [ 0, %bb ], [ %tmp8, %outer_loop_cont ]
+  %tmp5 = phi i32 [ 0, %bb ], [ %tmp13, %outer_loop_cont ]
+  %tmp7 = phi i32 [ 1, %bb ], [ %tmp16, %outer_loop_cont ]
+  %tmp8 = add i32 %tmp7, %tmp4
+  %tmp9 = trunc i32 %tmp8 to i8
+  %tmp10 = load i8, i8* %p, align 8
+  br label %inner_loop
+
+inner_loop:
+  %tmp19 = phi i8 [ %tmp10, %outer_loop ], [ %tmp23, %inner_loop ]
+  %tmp20 = phi i32 [ %tmp5, %outer_loop ], [ %tmp22, %inner_loop ]
+  %tmp21 = phi i32 [ 1, %outer_loop ], [ %tmp24, %inner_loop ]
+  %tmp22 = add i32 %tmp20, 1
+  %tmp23 = add i8 %tmp19, %tmp9
+  %tmp24 = add nuw nsw i32 %tmp21, 1
+  %tmp25 = icmp ugt i32 %tmp21, 75
+  br i1 %tmp25, label %outer_loop_cont, label %inner_loop
+
+outer_loop_cont:
+  %tmp12 = phi i8 [ %tmp19, %inner_loop ]
+  %tmp13 = phi i32 [ %tmp22, %inner_loop ]
+  %tmp14 = phi i8 [ %tmp23, %inner_loop ]
+  store i8 %tmp14, i8* %p, align 8
+  %tmp15 = sext i8 %tmp12 to i32
+  %tmp16 = add nuw nsw i32 %tmp7, 1
+  %tmp17 = icmp ugt i32 %tmp7, 256
+  br i1 %tmp17, label %exit, label %outer_loop
+
+exit:
+  %tmp2 = phi i32 [ %tmp15, %outer_loop_cont ]
+  ret i32 %tmp2
+}
\ No newline at end of file
Index: test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
===================================================================
--- test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -28,12 +28,11 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[M]], -1
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i64 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[K:%.*]] to i64
 ; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH_US:%.*]]
 ; CHECK:       for.end.us:
 ; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDVARS_IV33:%.*]]
-; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT:    [[ADD10_US:%.*]] = add nsw i32 [[TMP4]], 3
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[ADD10_US:%.*]] = add nsw i32 [[TMP3]], 3
 ; CHECK-NEXT:    store i32 [[ADD10_US]], i32* [[ARRAYIDX9_US]], align 4, !llvm.mem.parallel_loop_access !0
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT34:%.*]] = add i64 [[INDVARS_IV33]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV35:%.*]] = trunc i64 [[INDVARS_IV_NEXT34]] to i32
@@ -41,12 +40,12 @@
 ; CHECK-NEXT:    br i1 [[EXITCOND36]], label [[FOR_END15_LOOPEXIT:%.*]], label [[FOR_BODY3_LR_PH_US]], !llvm.loop !2
 ; CHECK:       for.body3.us:
 ; CHECK-NEXT:    [[INDVARS_IV29:%.*]] = phi i64 [ [[BC_RESUME_VAL:%.*]], [[SCALAR_PH:%.*]] ], [ [[INDVARS_IV_NEXT30:%.*]], [[FOR_BODY3_US:%.*]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
-; CHECK-NEXT:    [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDVARS_IV29]] to i32
+; CHECK-NEXT:    [[ADD4_US:%.*]] = add i32 [[ADD_US:%.*]], [[TMP4]]
 ; CHECK-NEXT:    [[IDXPROM_US:%.*]] = sext i32 [[ADD4_US]] to i64
 ; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[IDXPROM_US]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT:    [[ADD5_US:%.*]] = add nsw i32 [[TMP6]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* [[ARRAYIDX_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[ADD5_US:%.*]] = add nsw i32 [[TMP5]], 1
 ; CHECK-NEXT:    store i32 [[ADD5_US]], i32* [[ARRAYIDX7_US:%.*]], align 4, !llvm.mem.parallel_loop_access !0
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT30]] = add i64 [[INDVARS_IV29]], 1
 ; CHECK-NEXT:    [[LFTR_WIDEIV31:%.*]] = trunc i64 [[INDVARS_IV_NEXT30]] to i32
@@ -54,10 +53,10 @@
 ; CHECK-NEXT:    br i1 [[EXITCOND32]], label [[FOR_END_US:%.*]], label [[FOR_BODY3_US]], !llvm.loop !3
 ; CHECK:       for.body3.lr.ph.us:
 ; CHECK-NEXT:    [[INDVARS_IV33]] = phi i64 [ [[INDVARS_IV_NEXT34]], [[FOR_END_US]] ], [ 0, [[FOR_BODY3_LR_PH_US_PREHEADER]] ]
-; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP3]], [[INDVARS_IV33]]
-; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
-; CHECK-NEXT:    [[ADD_US]] = add i32 [[TMP9]], [[K]]
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[K:%.*]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[INDVARS_IV33]] to i32
+; CHECK-NEXT:    [[ADD_US]] = add i32 [[TMP8]], [[K]]
 ; CHECK-NEXT:    [[ARRAYIDX7_US]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV33]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH]], label [[VECTOR_SCEVCHECK:%.*]]
@@ -65,40 +64,40 @@
 ; CHECK-NEXT:    [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 [[TMP0]])
 ; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
-; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[TMP8]], [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP11:%.*]] = sub i32 [[TMP8]], [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP12:%.*]] = icmp sgt i32 [[TMP11]], [[TMP8]]
-; CHECK-NEXT:    [[TMP13:%.*]] = icmp slt i32 [[TMP10]], [[TMP8]]
-; CHECK-NEXT:    [[TMP14:%.*]] = select i1 false, i1 [[TMP12]], i1 [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = or i1 [[TMP14]], [[MUL_OVERFLOW]]
-; CHECK-NEXT:    [[TMP16:%.*]] = or i1 false, [[TMP15]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add i32 [[TMP7]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP10:%.*]] = sub i32 [[TMP7]], [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp sgt i32 [[TMP10]], [[TMP7]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp slt i32 [[TMP9]], [[TMP7]]
+; CHECK-NEXT:    [[TMP13:%.*]] = select i1 false, i1 [[TMP11]], i1 [[TMP12]]
+; CHECK-NEXT:    [[TMP14:%.*]] = or i1 [[TMP13]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[TMP15:%.*]] = or i1 false, [[TMP14]]
+; CHECK-NEXT:    br i1 [[TMP15]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP17:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP17]], 0
-; CHECK-NEXT:    [[TMP19:%.*]] = add i32 [[ADD_US]], [[TMP18]]
-; CHECK-NEXT:    [[TMP20:%.*]] = sext i32 [[TMP19]] to i64
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP20]]
-; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP23]], align 4
-; CHECK-NEXT:    [[TMP24:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP24]], i32 0
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[INDEX]] to i32
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP16]], 0
+; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[ADD_US]], [[TMP17]]
+; CHECK-NEXT:    [[TMP19:%.*]] = sext i32 [[TMP18]] to i64
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP19]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP20]], i32 0
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i32* [[TMP21]] to <4 x i32>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP22]], align 4
+; CHECK-NEXT:    [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i32> [[TMP23]], i32 0
+; CHECK-NEXT:    store i32 [[TMP24]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
+; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i32> [[TMP23]], i32 1
 ; CHECK-NEXT:    store i32 [[TMP25]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP24]], i32 1
+; CHECK-NEXT:    [[TMP26:%.*]] = extractelement <4 x i32> [[TMP23]], i32 2
 ; CHECK-NEXT:    store i32 [[TMP26]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP24]], i32 2
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i32> [[TMP23]], i32 3
 ; CHECK-NEXT:    store i32 [[TMP27]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
-; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i32> [[TMP24]], i32 3
-; CHECK-NEXT:    store i32 [[TMP28]], i32* [[ARRAYIDX7_US]], align 4, !llvm.mem.parallel_loop_access !0
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_US]], label [[SCALAR_PH]]
Index: unittests/Analysis/ScalarEvolutionTest.cpp
===================================================================
--- unittests/Analysis/ScalarEvolutionTest.cpp
+++ unittests/Analysis/ScalarEvolutionTest.cpp
@@ -1444,5 +1444,234 @@
   EXPECT_EQ(S2S->getExpressionSize(), 5u);
 }

+/// This test verifies that the SCEVExpander emits a canonical IV of a proper
+/// type while expanding non-linear addrecs.
+///
+/// See also:
+/// SCEVAddRecExpr::minIterationWidthForEvaluateAtIteration
+/// scev-expand-canonical-iv-type.ll
+TEST_F(ScalarEvolutionsTest, SCEVExpandInsertCanonicalIV) {
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  // Expand the addrec produced by GetAddRec into a loop without a canonical
+  // IV. SCEVExpander will produce one to expand the addrec. Check that the
+  // type of the inserted IV is equal to ExpectedCanonicalIVWidth.
+  auto TestNoCanonicalIV = [&](
+      std::function<const SCEV *(ScalarEvolution & SE, Loop * L)> GetAddRec,
+      unsigned ExpectedCanonicalIVWidth) {
+    std::unique_ptr<Module> M =
+        parseAssemblyString("define i32 @test(i32 %limit) { "
+                            "entry: "
+                            "  br label %loop "
+                            "loop: "
+                            "  %i = phi i32 [ 1, %entry ], [ %i.inc, %loop ] "
+                            "  %i.inc = add nsw i32 %i, 1 "
+                            "  %cont = icmp slt i32 %i.inc, %limit "
+                            "  br i1 %cont, label %loop, label %exit "
+                            "exit: "
+                            "  ret i32 %i.inc "
+                            "}",
+                            Err, C);
+
+    assert(M && "Could not parse module?");
+    assert(!verifyModule(*M) && "Must have been well formed!");
+
+    runWithSE(*M, "test", [&](Function &F, LoopInfo &LI, ScalarEvolution &SE) {
+      auto &I = GetInstByName(F, "i");
+      auto *Loop = LI.getLoopFor(I.getParent());
+      EXPECT_FALSE(Loop->getCanonicalInductionVariable());
+
+      auto *AR = GetAddRec(SE, Loop);
+      SCEVExpander Exp(SE, M->getDataLayout(), "expander");
+      auto *InsertAt = I.getNextNode();
+      Exp.expandCodeFor(AR, nullptr, InsertAt);
+      PHINode *CanonicalIV = Loop->getCanonicalInductionVariable();
+      unsigned CanonicalIVBitWidth =
+          cast<IntegerType>(CanonicalIV->getType())->getBitWidth();
+      EXPECT_EQ(CanonicalIVBitWidth, ExpectedCanonicalIVWidth);
+    });
+  };
+
+  // Expand the addrec produced by GetAddRec into a loop with a canonical IV
+  // which is narrower than ExpectedCanonicalIVWidth.
+  // SCEVExpander will produce a canonical IV of a wider type to expand the
+  // addrec. Check that the type of the inserted IV is equal to
+  // ExpectedCanonicalIVWidth.
+  auto TestNarrowCanonicalIV = [&](
+      std::function<const SCEV *(ScalarEvolution & SE, Loop * L)> GetAddRec,
+      unsigned ExpectedCanonicalIVWidth) {
+    std::unique_ptr<Module> M = parseAssemblyString(
+        "define i32 @test(i32 %limit) { "
+        "entry: "
+        "  br label %loop "
+        "loop: "
+        "  %i = phi i32 [ 1, %entry ], [ %i.inc, %loop ] "
+        "  %canonical.iv = phi i8 [ 0, %entry ], [ %canonical.iv.inc, %loop ] "
+        "  %i.inc = add nsw i32 %i, 1 "
+        "  %canonical.iv.inc = add i8 %canonical.iv, 1 "
+        "  %cont = icmp slt i32 %i.inc, %limit "
+        "  br i1 %cont, label %loop, label %exit "
+        "exit: "
+        "  ret i32 %i.inc "
+        "}",
+        Err, C);
+
+    assert(M && "Could not parse module?");
+    assert(!verifyModule(*M) && "Must have been well formed!");
+
+    runWithSE(*M, "test", [&](Function &F, LoopInfo &LI, ScalarEvolution &SE) {
+      auto &I = GetInstByName(F, "i");
+
+      auto *LoopHeaderBB = I.getParent();
+      auto *Loop = LI.getLoopFor(LoopHeaderBB);
+      PHINode *CanonicalIV = Loop->getCanonicalInductionVariable();
+      EXPECT_EQ(CanonicalIV, &GetInstByName(F, "canonical.iv"));
+
+      unsigned CanonicalIVBitWidth =
+          cast<IntegerType>(CanonicalIV->getType())->getBitWidth();
+      EXPECT_LT(CanonicalIVBitWidth, ExpectedCanonicalIVWidth);
+
+      auto *AR = GetAddRec(SE, Loop);
+      SCEVExpander Exp(SE, M->getDataLayout(), "expander");
+      auto *InsertAt = I.getNextNode();
+      Exp.expandCodeFor(AR, nullptr, InsertAt);
+
+      // Loop over all of the PHI nodes, looking for the new canonical indvar.
+      PHINode *NewCanonicalIV = nullptr;
+      for (BasicBlock::iterator i = LoopHeaderBB->begin(); isa<PHINode>(i);
+           ++i) {
+        PHINode *PN = cast<PHINode>(i);
+        if (PN == &I || PN == CanonicalIV)
+          continue;
+        EXPECT_FALSE(NewCanonicalIV);
+        NewCanonicalIV = PN;
+      }
+
+      // Check that NewCanonicalIV is a canonical IV, i.e. {0,+,1}
+      BasicBlock *Incoming = nullptr, *Backedge = nullptr;
+      EXPECT_TRUE(Loop->getIncomingAndBackEdge(Incoming, Backedge));
+      auto *Start = NewCanonicalIV->getIncomingValueForBlock(Incoming);
+      EXPECT_TRUE(isa<ConstantInt>(Start));
+      EXPECT_TRUE(dyn_cast<ConstantInt>(Start)->isZero());
+      auto *Next = NewCanonicalIV->getIncomingValueForBlock(Backedge);
+      EXPECT_TRUE(isa<BinaryOperator>(Next));
+      auto *NextBinOp = dyn_cast<BinaryOperator>(Next);
+      EXPECT_EQ(NextBinOp->getOpcode(), Instruction::Add);
+      EXPECT_EQ(NextBinOp->getOperand(0), NewCanonicalIV);
+      auto *Step = NextBinOp->getOperand(1);
+      EXPECT_TRUE(isa<ConstantInt>(Step));
+      EXPECT_TRUE(dyn_cast<ConstantInt>(Step)->isOne());
+
+      unsigned NewCanonicalIVBitWidth =
+          cast<IntegerType>(NewCanonicalIV->getType())->getBitWidth();
+      EXPECT_EQ(NewCanonicalIVBitWidth, ExpectedCanonicalIVWidth);
+    });
+  };
+
+  // Expand the addrec produced by GetAddRec into a loop with a canonical IV
+  // of width ExpectedCanonicalIVWidth.
+  // To expand the addrec, SCEVExpander should use the existing canonical IV.
+  auto TestMatchingCanonicalIV = [&](
+      std::function<const SCEV *(ScalarEvolution & SE, Loop * L)> GetAddRec,
+      unsigned ExpectedCanonicalIVWidth) {
+    auto ExpectedCanonicalIVTypeStr =
+        "i" + std::to_string(ExpectedCanonicalIVWidth);
+    std::unique_ptr<Module> M = parseAssemblyString(
+        "define i32 @test(i32 %limit) { "
+        "entry: "
+        "  br label %loop "
+        "loop: "
+        "  %i = phi i32 [ 1, %entry ], [ %i.inc, %loop ] "
+        "  %canonical.iv = phi " + ExpectedCanonicalIVTypeStr +
+            " [ 0, %entry ], [ %canonical.iv.inc, %loop ] "
+        "  %i.inc = add nsw i32 %i, 1 "
+        "  %canonical.iv.inc = add " + ExpectedCanonicalIVTypeStr +
+            " %canonical.iv, 1 "
+        "  %cont = icmp slt i32 %i.inc, %limit "
+        "  br i1 %cont, label %loop, label %exit "
+        "exit: "
+        "  ret i32 %i.inc "
+        "}",
+        Err, C);
+
+    assert(M && "Could not parse module?");
+    assert(!verifyModule(*M) && "Must have been well formed!");
+
+    runWithSE(*M, "test", [&](Function &F, LoopInfo &LI, ScalarEvolution &SE) {
+      auto &I = GetInstByName(F, "i");
+      auto &CanonicalIV = GetInstByName(F, "canonical.iv");
+
+      auto *LoopHeaderBB = I.getParent();
+      auto *Loop = LI.getLoopFor(LoopHeaderBB);
+      EXPECT_EQ(&CanonicalIV, Loop->getCanonicalInductionVariable());
+      unsigned CanonicalIVBitWidth =
+          cast<IntegerType>(CanonicalIV.getType())->getBitWidth();
+      EXPECT_EQ(CanonicalIVBitWidth, ExpectedCanonicalIVWidth);
+
+      auto *AR = GetAddRec(SE, Loop);
+      SCEVExpander Exp(SE, M->getDataLayout(), "expander");
+      auto *InsertAt = I.getNextNode();
+      Exp.expandCodeFor(AR, nullptr, InsertAt);
+
+      // Loop over all of the PHI nodes, checking whether a new canonical
+      // indvar was introduced.
+      PHINode *NewCanonicalIV = nullptr;
+      for (BasicBlock::iterator i = LoopHeaderBB->begin(); isa<PHINode>(i);
+           ++i) {
+        PHINode *PN = cast<PHINode>(i);
+        if (PN == &I || PN == &CanonicalIV)
+          continue;
+        NewCanonicalIV = PN;
+      }
+      EXPECT_FALSE(NewCanonicalIV);
+    });
+  };
+
+  unsigned ARBitwidth = 16;
+  Type *ARType = IntegerType::get(C, ARBitwidth);
+
+  // Expand {5,+,1}, expect ARBitwidth canonical IV width
+  auto GetAR2 = [&](ScalarEvolution &SE, Loop *L) -> const SCEV * {
+    return SE.getAddRecExpr(SE.getConstant(APInt(ARBitwidth, 5)),
+                            SE.getOne(ARType), L, SCEV::FlagAnyWrap);
+  };
+  TestNoCanonicalIV(GetAR2, ARBitwidth);
+  TestNarrowCanonicalIV(GetAR2, ARBitwidth);
+  TestMatchingCanonicalIV(GetAR2, ARBitwidth);
+
+  // Expand {5,+,1,+,1}, expect ARBitwidth+1 canonical IV width
+  auto GetAR3 = [&](ScalarEvolution &SE, Loop *L) -> const SCEV * {
+    SmallVector<const SCEV *, 3> Ops = {SE.getConstant(APInt(ARBitwidth, 5)),
+                                        SE.getOne(ARType), SE.getOne(ARType)};
+    return SE.getAddRecExpr(Ops, L, SCEV::FlagAnyWrap);
+  };
+  TestNoCanonicalIV(GetAR3, ARBitwidth + 1);
+  TestNarrowCanonicalIV(GetAR3, ARBitwidth + 1);
+  TestMatchingCanonicalIV(GetAR3, ARBitwidth + 1);
+
+  // Expand {5,+,1,+,1,+,1}, expect ARBitwidth+1 canonical IV width
+  auto GetAR4 = [&](ScalarEvolution &SE, Loop *L) -> const SCEV * {
+    SmallVector<const SCEV *, 4> Ops = {SE.getConstant(APInt(ARBitwidth, 5)),
+                                        SE.getOne(ARType), SE.getOne(ARType),
+                                        SE.getOne(ARType)};
+    return SE.getAddRecExpr(Ops, L, SCEV::FlagAnyWrap);
+  };
+  TestNoCanonicalIV(GetAR4, ARBitwidth + 1);
+  TestNarrowCanonicalIV(GetAR4, ARBitwidth + 1);
+  TestMatchingCanonicalIV(GetAR4, ARBitwidth + 1);
+
+  // Expand {5,+,1,+,1,+,1,+,1}, expect ARBitwidth+3 canonical IV width
+  auto GetAR5 = [&](ScalarEvolution &SE, Loop *L) -> const SCEV * {
+    SmallVector<const SCEV *, 5> Ops = {SE.getConstant(APInt(ARBitwidth, 5)),
+                                        SE.getOne(ARType), SE.getOne(ARType),
+                                        SE.getOne(ARType), SE.getOne(ARType)};
+    return SE.getAddRecExpr(Ops, L, SCEV::FlagAnyWrap);
+  };
+  TestNoCanonicalIV(GetAR5, ARBitwidth + 3);
+  TestNarrowCanonicalIV(GetAR5, ARBitwidth + 3);
+  TestMatchingCanonicalIV(GetAR5, ARBitwidth + 3);
+}
+
 } // end anonymous namespace
 } // end namespace llvm
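
Aside (illustrative only, not part of the patch): a minimal standalone sketch of the width computation performed by MinIterationWidthForBinomialCoefficient, checked against the expectations encoded in the unittest above. The helper name minIterationWidthSketch and the use of __builtin_ctz are assumptions of this sketch, not code from the patch.

// A degree-K addrec of width W needs W + T bits, where T is 1 plus the sum of
// countTrailingZeros(i) for i in [3, K], mirroring the function in the patch.
#include <cassert>

static unsigned minIterationWidthSketch(unsigned K, unsigned W) {
  unsigned T = 1;
  for (unsigned i = 3; i <= K; ++i)
    T += __builtin_ctz(i); // plays the role of APInt::countTrailingZeros()
  return W + T;
}

int main() {
  // Matches the unittest: for a 16-bit addrec, {5,+,1,+,1} (K = 2) and
  // {5,+,1,+,1,+,1} (K = 3) need 17 bits; {5,+,1,+,1,+,1,+,1} (K = 4) needs 19.
  assert(minIterationWidthSketch(2, 16) == 17);
  assert(minIterationWidthSketch(3, 16) == 17);
  assert(minIterationWidthSketch(4, 16) == 19);
  return 0;
}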