Index: llvm/lib/Transforms/Scalar/LoopFlatten.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -748,12 +748,30 @@
     return false;
 
   // Check if we can widen the induction variables to avoid overflow checks.
-  if (CanWidenIV(FI, DT, LI, SE, AC, TTI))
+  bool CanFlatten = CanWidenIV(FI, DT, LI, SE, AC, TTI);
+
+  // Even after widening the IV, flattening may still not be possible or may
+  // not happen, e.g. when it is deemed unprofitable. So bail here if that is
+  // the case.
+  // TODO: IV widening without performing the actual flattening transformation
+  // is not ideal. While this codegen change should not matter much, it is an
+  // unnecessary change which is better to avoid. It's unlikely this happens
+  // often, because if it's unprofitable after widening, it should be
+  // unprofitable before widening as checked in the first round of checks. But
+  // 'RepeatedInstructionThreshold' is set to only 2, which can probably be
+  // relaxed. Because this is making a code change (the IV widening, but not
+  // the flattening), we return true here.
+  if (FI.Widened && !CanFlatten)
+    return true;
+
+  // If we have widened and can perform the transformation, do that here.
+  if (CanFlatten)
     return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI);
 
-  // Check if the new iteration variable might overflow. In this case, we
-  // need to version the loop, and select the original version at runtime if
-  // the iteration space is too large.
+  // Otherwise, if we haven't widened the IV, check if the new iteration
+  // variable might overflow. In this case, we need to version the loop, and
+  // select the original version at runtime if the iteration space is too
+  // large.
   // TODO: We currently don't version the loop.
   OverflowResult OR = checkOverflow(FI, DT, AC);
   if (OR == OverflowResult::AlwaysOverflowsHigh ||
Index: llvm/test/Transforms/LoopFlatten/widen-iv3.ll
===================================================================
--- llvm/test/Transforms/LoopFlatten/widen-iv3.ll
+++ llvm/test/Transforms/LoopFlatten/widen-iv3.ll
@@ -10,7 +10,6 @@
 define i16 @foo() {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[FLATTEN_TRIPCOUNT:%.*]] = mul i32 16, 4
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]]
 ; CHECK:       for.cond1.preheader:
 ; CHECK-NEXT:    [[INDVAR2:%.*]] = phi i32 [ [[INDVAR_NEXT3:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[ENTRY:%.*]] ]
@@ -19,7 +18,6 @@
 ; CHECK-NEXT:    [[TMP0:%.*]] = mul nsw i32 [[INDVAR2]], 16
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i16 [[I_013]], 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = zext i16 [[MUL]] to i32
-; CHECK-NEXT:    [[FLATTEN_TRUNCIV:%.*]] = trunc i32 [[INDVAR2]] to i16
 ; CHECK-NEXT:    br label [[FOR_BODY4:%.*]]
 ; CHECK:       for.cond.cleanup:
 ; CHECK-NEXT:    [[ADD5_LCSSA_LCSSA:%.*]] = phi i16 [ [[ADD5_LCSSA]], [[FOR_COND_CLEANUP3]] ]
@@ -28,22 +26,22 @@
 ; CHECK-NEXT:    [[ADD5_LCSSA]] = phi i16 [ [[ADD5:%.*]], [[FOR_BODY4]] ]
 ; CHECK-NEXT:    [[INDVAR_NEXT3]] = add i32 [[INDVAR2]], 1
 ; CHECK-NEXT:    [[INC7]] = add nuw nsw i16 [[I_013]], 1
-; CHECK-NEXT:    [[EXITCOND14_NOT:%.*]] = icmp eq i32 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]]
+; CHECK-NEXT:    [[EXITCOND14_NOT:%.*]] = icmp eq i32 [[INDVAR_NEXT3]], 4
 ; CHECK-NEXT:    br i1 [[EXITCOND14_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]]
 ; CHECK:       for.body4:
-; CHECK-NEXT:    [[INDVAR:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY4]] ], [ 0, [[FOR_COND1_PREHEADER]] ]
 ; CHECK-NEXT:    [[J_011:%.*]] = phi i16 [ 0, [[FOR_COND1_PREHEADER]] ]
-; CHECK-NEXT:    [[SUM_110:%.*]] = phi i16 [ [[SUM_012]], [[FOR_COND1_PREHEADER]] ]
+; CHECK-NEXT:    [[SUM_110:%.*]] = phi i16 [ [[SUM_012]], [[FOR_COND1_PREHEADER]] ], [ [[ADD5]], [[FOR_BODY4]] ]
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i32 [[INDVAR]], [[TMP0]]
 ; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i16 [[J_011]], [[MUL]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i16], [64 x i16]* @v, i16 0, i16 [[TMP3]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[ADD5]] = add nsw i16 [[TMP4]], [[SUM_110]]
-; CHECK-NEXT:    [[INDVAR_NEXT:%.*]] = add i32 [[INDVAR]], 1
+; CHECK-NEXT:    [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
 ; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i16 [[J_011]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INDVAR_NEXT]], 16
-; CHECK-NEXT:    br label [[FOR_COND_CLEANUP3]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]]
 ;
 entry:
   br label %for.cond1.preheader