Index: llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -6766,9 +6766,28 @@ // If this is an IV which we could replace the terminating condition, return // the final value of the alternative IV on the last iteration. auto getAlternateIVEnd = [&](PHINode &PN) -> const SCEV * { - // FIXME: This does not properly account for overflow. const SCEVAddRecExpr *AddRec = cast(SE.getSCEV(&PN)); const SCEV *BECount = SE.getBackedgeTakenCount(L); + + // Check that we can compute the value of AddRec on the exiting iteration + // without soundness problems. There are two cases to be worried about: + // 1) BECount could be 255 with type i8. Simply adding one would be + // incorrect. We may need one extra bit to represent the unsigned + // trip count. + // 2) The multiplication of stride by TC may wrap around. This is subtle + // because computing the result accounting for wrap is insufficient. + // In order to use the result in an exit test, we must also know that + // AddRec doesn't take the same value on any previous iteration. + // The simplest case to consider is a candidate IV which is narrower + // than the trip count (and thus original IV), but this can also + // happen due to non-unit strides on the candidate IVs. + ConstantRange StepCR = SE.getSignedRange(AddRec->getStepRecurrence(SE)); + ConstantRange BECountCR = SE.getUnsignedRange(BECount); + unsigned NoOverflowBitWidth = BECountCR.getActiveBits() + 1 + StepCR.getMinSignedBits(); + unsigned ARBitWidth = SE.getTypeSizeInBits(AddRec->getType()); + if (NoOverflowBitWidth > ARBitWidth) + return nullptr; + const SCEV *TermValueS = SE.getAddExpr( AddRec->getOperand(0), SE.getTruncateOrZeroExtend( Index: llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll =================================================================== --- llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll +++ llvm/test/Transforms/LoopStrengthReduce/lsr-term-fold.ll @@ -192,20 +192,18 @@ ; In this case, the integer IV has a larger bitwidth than the pointer IV. ; This means that the smaller IV may wrap around multiple times before ; the original loop exit is taken. -; FIXME: miscompile define void @iv_size(ptr %a, i128 %N) { ; CHECK-LABEL: @iv_size( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = trunc i128 [[N:%.*]] to i64 -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP1]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[A]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[UGLYGEP2:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[LSR_IV:%.*]] = phi i128 [ [[LSR_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[N:%.*]], [[ENTRY]] ] ; CHECK-NEXT: store i32 1, ptr [[LSR_IV1]], align 4 +; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i128 [[LSR_IV]], -1 ; CHECK-NEXT: [[UGLYGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4 -; CHECK-NEXT: [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND:%.*]] = icmp eq ptr [[UGLYGEP2]], [[SCEVGEP]] -; CHECK-NEXT: br i1 [[LSR_FOLD_TERM_COND_REPLACED_TERM_COND]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i128 [[LSR_IV_NEXT]], 0 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void ;