Index: llvm/lib/Transforms/Scalar/LoopFlatten.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopFlatten.cpp +++ llvm/lib/Transforms/Scalar/LoopFlatten.cpp @@ -748,12 +748,30 @@ return false; // Check if we can widen the induction variables to avoid overflow checks. - if (CanWidenIV(FI, DT, LI, SE, AC, TTI)) + bool CanFlatten = CanWidenIV(FI, DT, LI, SE, AC, TTI); + + // It can happen that after widening of the IV, flattening may not be + // possible/happening, e.g. when it is deemed unprofitable. So bail here if + // that is the case. + // TODO: IV widening without performing the actual flattening transformation + // is not ideal. While this codegen change should not matter much, it is an + // unnecessary change which is better to avoid. It's unlikely this happens + // often, because if it's unprofitable after widening, it should be + // unprofitable before widening as checked in the first round of checks. But + // 'RepeatedInstructionThreshold' is set to only 2, which can probably be + // relaxed. Because this is making a code change (the IV widening, but not + // the flattening), we return true here. + if (FI.Widened && !CanFlatten) + return true; + + // If we have widened and can perform the transformation, do that here. + if (CanFlatten) return DoFlattenLoopPair(FI, DT, LI, SE, AC, TTI); - // Check if the new iteration variable might overflow. In this case, we - // need to version the loop, and select the original version at runtime if - // the iteration space is too large. + // Otherwise, if we haven't widened the IV, check if the new iteration + // variable might overflow. In this case, we need to version the loop, and + // select the original version at runtime if the iteration space is too + // large. // TODO: We currently don't version the loop. 
OverflowResult OR = checkOverflow(FI, DT, AC); if (OR == OverflowResult::AlwaysOverflowsHigh || Index: llvm/test/Transforms/LoopFlatten/widen-iv3.ll =================================================================== --- llvm/test/Transforms/LoopFlatten/widen-iv3.ll +++ llvm/test/Transforms/LoopFlatten/widen-iv3.ll @@ -10,7 +10,6 @@ define i16 @foo() { ; CHECK-LABEL: @foo( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[FLATTEN_TRIPCOUNT:%.*]] = mul i32 16, 4 ; CHECK-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; CHECK: for.cond1.preheader: ; CHECK-NEXT: [[INDVAR2:%.*]] = phi i32 [ [[INDVAR_NEXT3:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[ENTRY:%.*]] ] @@ -19,7 +18,6 @@ ; CHECK-NEXT: [[TMP0:%.*]] = mul nsw i32 [[INDVAR2]], 16 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i16 [[I_013]], 16 ; CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[MUL]] to i32 -; CHECK-NEXT: [[FLATTEN_TRUNCIV:%.*]] = trunc i32 [[INDVAR2]] to i16 ; CHECK-NEXT: br label [[FOR_BODY4:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: [[ADD5_LCSSA_LCSSA:%.*]] = phi i16 [ [[ADD5_LCSSA]], [[FOR_COND_CLEANUP3]] ] @@ -28,22 +26,22 @@ ; CHECK-NEXT: [[ADD5_LCSSA]] = phi i16 [ [[ADD5:%.*]], [[FOR_BODY4]] ] ; CHECK-NEXT: [[INDVAR_NEXT3]] = add i32 [[INDVAR2]], 1 ; CHECK-NEXT: [[INC7]] = add nuw nsw i16 [[I_013]], 1 -; CHECK-NEXT: [[EXITCOND14_NOT:%.*]] = icmp eq i32 [[INDVAR_NEXT3]], [[FLATTEN_TRIPCOUNT]] +; CHECK-NEXT: [[EXITCOND14_NOT:%.*]] = icmp eq i32 [[INDVAR_NEXT3]], 4 ; CHECK-NEXT: br i1 [[EXITCOND14_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]] ; CHECK: for.body4: -; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER]] ] +; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY4]] ], [ 0, [[FOR_COND1_PREHEADER]] ] ; CHECK-NEXT: [[J_011:%.*]] = phi i16 [ 0, [[FOR_COND1_PREHEADER]] ] -; CHECK-NEXT: [[SUM_110:%.*]] = phi i16 [ [[SUM_012]], [[FOR_COND1_PREHEADER]] ] +; CHECK-NEXT: [[SUM_110:%.*]] = phi i16 [ [[SUM_012]], [[FOR_COND1_PREHEADER]] ], [ [[ADD5]], [[FOR_BODY4]] ] ; 
CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[INDVAR]], [[TMP0]] ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[J_011]], [[MUL]] ; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i16 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [64 x i16], [64 x i16]* @v, i16 0, i16 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = load i16, i16* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[ADD5]] = add nsw i16 [[TMP4]], [[SUM_110]] -; CHECK-NEXT: [[INDVAR_NEXT:%.*]] = add i32 [[INDVAR]], 1 +; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 ; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i16 [[J_011]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INDVAR_NEXT]], 16 -; CHECK-NEXT: br label [[FOR_COND_CLEANUP3]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]] ; entry: br label %for.cond1.preheader