Index: lib/Analysis/ScalarEvolution.cpp =================================================================== --- lib/Analysis/ScalarEvolution.cpp +++ lib/Analysis/ScalarEvolution.cpp @@ -6306,8 +6306,35 @@ // also returns true if StepV is maximally negative (eg, INT_MIN), but that // case is not handled as this code is guarded by !CountDown. if (StepV.isPowerOf2() && - GetMinTrailingZeros(Distance) >= StepV.countTrailingZeros()) - return getUDivExactExpr(Distance, Step); + GetMinTrailingZeros(Distance) >= StepV.countTrailingZeros()) { + // Here we've constrained the equation to be of the form + // + // 2^(N + k) * Distance' = (StepV == 2^N) * X (mod 2^W) ... (0) + // + // where we're operating on a W bit wide integer domain and k is + // non-negative. The smallest unsigned solution for X is the trip count. + // + // (0) is equivalent to: + // + // 2^(N + k) * Distance' - 2^N * X = L * 2^W + // <=> 2^N(2^k * Distance' - X) = L * 2^(W - N) * 2^N + // <=> 2^k * Distance' - X = L * 2^(W - N) + // <=> 2^k * Distance' = L * 2^(W - N) + X ... (1) + // + // The smallest X satisfying (1) is unsigned remainder of dividing the LHS + // by 2^(W - N). + + const auto *ModuloResult = getUDivExactExpr(Distance, Step); + + // Since SCEV does not have a URem instruction, we mask out the lower bits + // by a truncate followed by a zero extend to compute the remainder. + + unsigned NarrowWidth = StepV.getBitWidth() - StepV.countTrailingZeros(); + auto *NarrowTy = IntegerType::get(getContext(), NarrowWidth); + auto *WideTy = Distance->getType(); + + return getZeroExtendExpr(getTruncateExpr(ModuloResult, NarrowTy), WideTy); + } } // If the condition controls loop exit (the loop exits only if the expression Index: test/Analysis/ScalarEvolution/pr24757.ll =================================================================== --- /dev/null +++ test/Analysis/ScalarEvolution/pr24757.ll @@ -0,0 +1,35 @@ +; RUN: opt -analyze -scalar-evolution < %s | FileCheck %s + +; CHECK: Loop %bb1: backedge-taken count is (zext i7 (trunc i8 %a.promoted to i7) to i8) + +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.10.0" + +@a = global i8 -127, align 1 +@b = common global i32 0, align 4 + +declare void @use(i32) + +define i32 @main() { +bb: + %a.promoted = load i8, i8* @a + br label %bb1 + +bb1: ; preds = %bb1, %bb + %tmp = phi i8 [ %tmp2, %bb1 ], [ %a.promoted, %bb ] + %tmp2 = add i8 %tmp, -1 + %tmp3 = sext i8 %tmp to i32 + %tmp4 = xor i32 %tmp3, -1 + %tmp5 = sext i8 %tmp2 to i32 + %tmpf = sub nsw i32 %tmp4, %tmp5 + %tmp6 = trunc i32 %tmpf to i8 + %tmp7 = icmp eq i8 %tmp6, 0 + br i1 %tmp7, label %bb8, label %bb1 + +bb8: ; preds = %bb1 + store i8 %tmp2, i8* @a + store i32 %tmp4, i32* @b + %tmp9 = sext i8 %tmp2 to i32 + call void @use(i32 %tmp9) + ret i32 0 +}