Index: llvm/lib/Transforms/Scalar/LoopFlatten.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -166,6 +166,23 @@
                    m_c_Add(m_Specific(InductionPHI), m_ConstantInt<1>()))) {
     Increment = dyn_cast<BinaryOperator>(Compare->getOperand(1));
     Limit = Compare->getOperand(0);
+  } else {
+    // The compare may have been altered by another transformation
+    // (e.g. icmp ult %inc, limit -> icmp ult %j, limit-1).
+    // In this case the increment is obtained from the InductionPHI
+    // and the limit is the RHS of the compare + 1.
+    Value *LatchValue = InductionPHI->getIncomingValueForBlock(Latch);
+    if (match(LatchValue,
+              m_c_Add(m_Specific(InductionPHI), m_ConstantInt<1>()))) {
+      // This branch is reached for any compare not matched above, so the
+      // RHS is not guaranteed to be a constant. Use dyn_cast rather than
+      // cast, and leave Increment untouched when the limit cannot be
+      // recomputed so that the check below can reject the loop.
+      if (auto *RHS = dyn_cast<ConstantInt>(Compare->getOperand(1))) {
+        Increment = dyn_cast<BinaryOperator>(LatchValue);
+        Limit = ConstantInt::get(Compare->getContext(), RHS->getValue() + 1);
+      }
+    }
   }
   if (!Increment || Increment->hasNUsesOrMore(3)) {
     LLVM_DEBUG(dbgs() << "Cound not find valid increment\n");
@@ -383,7 +400,13 @@
       LLVM_DEBUG(dbgs() << "Use is optimisable\n");
       ValidOuterPHIUses.insert(MatchedMul);
       FI.LinearIVUses.insert(U);
-    } else {
+    } else if (U == FI.InnerBranch->getCondition())
+      // The use is in the compare which is also the condition of the inner
+      // branch. In this case the compare has been altered by another
+      // transformation (e.g. icmp ult %inc, limit -> icmp ult %j, limit-1).
+      // Ignore this use as the compare gets removed later anyway.
+      continue;
+    else {
       LLVM_DEBUG(dbgs() << "Did not match expected pattern, bailing\n");
       return false;
     }
Index: llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
===================================================================
--- llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
+++ llvm/test/Transforms/LoopFlatten/loop-flatten-negative.ll
@@ -341,6 +341,70 @@
   ret i32 10
 }
 
+; test_10 and test_11 are for the case when the inner limit is a
+; defined integer (e.g. 20), then InstCombine makes the transformation:
+; icmp ult i32 %inc, 20 -> icmp ult i32 %j, 20-step.
+
+; test_10: If the step is not 1, the loop shouldn't be flattened.
+define i32 @test_10(i32* nocapture %A) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %i.017 = phi i32 [ 0, %entry ], [ %inc, %for.cond.cleanup3 ]
+  %mul = mul i32 %i.017, 20
+  br label %for.body4
+
+for.body4:
+  %j.016 = phi i32 [ 0, %for.cond1.preheader ], [ %add5, %for.body4 ]
+  %add = add i32 %j.016, %mul
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  store i32 30, i32* %arrayidx, align 4
+  %add5 = add nuw nsw i32 %j.016, 2
+  %cmp2 = icmp ult i32 %j.016, 18
+  br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
+
+for.cond.cleanup3:
+  %inc = add i32 %i.017, 1
+  %cmp = icmp ult i32 %inc, 11
+  br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond.cleanup:
+  %0 = load i32, i32* %A, align 4
+  ret i32 %0
+}
+
+; test_11: The inner induction variable is used in a compare which
+; isn't the condition of the inner branch.
+define i32 @test_11(i32* nocapture %A) {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %i.020 = phi i32 [ 0, %entry ], [ %inc7, %for.cond.cleanup3 ]
+  %mul = mul i32 %i.020, 20
+  br label %for.body4
+
+for.body4:
+  %j.019 = phi i32 [ 0, %for.cond1.preheader ], [ %inc, %for.body4 ]
+  %cmp5 = icmp ult i32 %j.019, 5
+  %cond = select i1 %cmp5, i32 30, i32 15
+  %add = add i32 %j.019, %mul
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  store i32 %cond, i32* %arrayidx, align 4
+  %inc = add nuw nsw i32 %j.019, 1
+  %cmp2 = icmp ult i32 %j.019, 19
+  br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
+
+for.cond.cleanup3:
+  %inc7 = add i32 %i.020, 1
+  %cmp = icmp ult i32 %inc7, 11
+  br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.cond.cleanup:
+  %0 = load i32, i32* %A, align 4
+  ret i32 %0
+}
 
 ; Outer loop conditional phi
 define i32 @e() {
Index: llvm/test/Transforms/LoopFlatten/loop-flatten.ll
===================================================================
--- llvm/test/Transforms/LoopFlatten/loop-flatten.ll
+++ llvm/test/Transforms/LoopFlatten/loop-flatten.ll
@@ -586,6 +586,59 @@
   ret i32 10
 }
 
+; CHECK-LABEL: test9
+; When the inner loop limit is a defined integer (e.g. 20) and the step
+; is 1, InstCombine causes the transformation:
+; icmp ult i32 %inc, 20 -> icmp ult i32 %j, 19.
+; This is an 'unoptimizable' use of the inner induction variable %j but
+; we should still flatten the loop as this compare instruction is
+; removed later anyway.
+define i32 @test9(i32* nocapture %A) {
+entry:
+  br label %for.cond1.preheader
+; CHECK: entry:
+; CHECK: %flatten.tripcount = mul i32 20, 11
+; CHECK: br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %i.017 = phi i32 [ 0, %entry ], [ %inc6, %for.cond.cleanup3 ]
+  %mul = mul i32 %i.017, 20
+  br label %for.body4
+; CHECK: for.cond1.preheader:
+; CHECK: %i.017 = phi i32 [ 0, %entry ], [ %inc6, %for.cond.cleanup3 ]
+; CHECK: %mul = mul i32 %i.017, 20
+; CHECK: br label %for.body4
+
+for.cond.cleanup3:
+  %inc6 = add i32 %i.017, 1
+  %cmp = icmp ult i32 %inc6, 11
+  br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup
+; CHECK: for.cond.cleanup3:
+; CHECK: %inc6 = add i32 %i.017, 1
+; CHECK: %cmp = icmp ult i32 %inc6, %flatten.tripcount
+; CHECK: br i1 %cmp, label %for.cond1.preheader, label %for.cond.cleanup
+
+for.body4:
+  %j.016 = phi i32 [ 0, %for.cond1.preheader ], [ %inc, %for.body4 ]
+  %add = add i32 %j.016, %mul
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %add
+  store i32 30, i32* %arrayidx, align 4
+  %inc = add nuw nsw i32 %j.016, 1
+  %cmp2 = icmp ult i32 %j.016, 19
+  br i1 %cmp2, label %for.body4, label %for.cond.cleanup3
+; CHECK: for.body4
+; CHECK: %j.016 = phi i32 [ 0, %for.cond1.preheader ]
+; CHECK: %add = add i32 %j.016, %mul
+; CHECK: %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.017
+; CHECK: store i32 30, i32* %arrayidx, align 4
+; CHECK: %inc = add nuw nsw i32 %j.016, 1
+; CHECK: %cmp2 = icmp ult i32 %j.016, 19
+; CHECK: br label %for.cond.cleanup3
+
+for.cond.cleanup:
+  %0 = load i32, i32* %A, align 4
+  ret i32 %0
+}
 
 
 declare i32 @func(i32)