Index: lib/Transforms/Scalar/LoopPredication.cpp =================================================================== --- lib/Transforms/Scalar/LoopPredication.cpp +++ lib/Transforms/Scalar/LoopPredication.cpp @@ -174,6 +174,9 @@ using namespace llvm; +static cl::opt EnableIVTruncation("loop-predication-enable-iv-truncation", + cl::Hidden, cl::init(true)); + namespace { class LoopPredication { /// Represents an induction variable check: @@ -212,6 +215,22 @@ IRBuilder<> &Builder); bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander); + // When the IV type is wider than the range operand type, we can still do loop + // predication, by generating SCEVs for the range and latch that are of the + // same type. We achieve this by generating a SCEV truncate expression for the + // latch IV. This is done iff truncation of the IV is a safe operation, + // without loss of information. + // Another way to achieve this is by generating a wider type SCEV for the + // range check operand, however, this needs a more involved check that + // operands do not overflow. This can lead to loss of information when the + // range operand is of the form: add i32 %offset, %iv. We need to prove that + // sext(x + y) is same as sext(x) + sext(y). + // This function returns true if we can safely represent the IV type in + // the RangeCheckType without loss of information. + bool isSafeToTruncateWideIVType(Type *RangeCheckType); + // Return the loopLatchCheck corresponding to the RangeCheckType if safe to do + // so. + Optional generateLoopLatchCheck(Type *RangeCheckType); public: LoopPredication(ScalarEvolution *SE) : SE(SE){}; bool runOnLoop(Loop *L); @@ -301,6 +320,34 @@ return Builder.CreateICmp(Pred, LHSV, RHSV); } +Optional +LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) { + + auto *LatchType = LatchCheck.IV->getType(); + if (RangeCheckType == LatchType) + return LatchCheck; + // For now, bail out if latch type is narrower than range type. + if (DL->getTypeSizeInBits(LatchType) < DL->getTypeSizeInBits(RangeCheckType)) + return None; + if (!isSafeToTruncateWideIVType(RangeCheckType)) + return None; + // We can now safely identify the truncated version of the IV and limit for + // RangeCheckType. + LoopICmp NewLatchCheck; + NewLatchCheck.Pred = LatchCheck.Pred; + NewLatchCheck.IV = dyn_cast( + SE->getTruncateExpr(LatchCheck.IV, RangeCheckType)); + if (!NewLatchCheck.IV) + return None; + NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType); + DEBUG(dbgs() << "IV of type: " << *LatchType + << "can be represented as range check type:" << *RangeCheckType + << "\n"); + DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n"); + DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n"); + return NewLatchCheck; +} + /// If ICI can be widened to a loop invariant condition emits the loop /// invariant condition in the loop preheader and return it, otherwise /// returns None. @@ -325,22 +372,29 @@ return None; } auto *RangeCheckIV = RangeCheck->IV; - auto *Ty = RangeCheckIV->getType(); - if (Ty != LatchCheck.IV->getType()) { - DEBUG(dbgs() << "Type mismatch between range check and latch IVs!\n"); - return None; - } if (!RangeCheckIV->isAffine()) { DEBUG(dbgs() << "Range check IV is not affine!\n"); return None; } auto *Step = RangeCheckIV->getStepRecurrence(*SE); - if (Step != LatchCheck.IV->getStepRecurrence(*SE)) { + if (!Step->isOne()) { DEBUG(dbgs() << "Range check and latch have IVs different steps!\n"); return None; } - assert(Step->isOne() && "must be one"); + auto *Ty = RangeCheckIV->getType(); + auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty); + if (!CurrLatchCheckOpt) { + DEBUG(dbgs() << "Failed to generate a loop latch check " + "corresponding to range type: " + << *Ty << "\n"); + return None; + } + LoopICmp CurrLatchCheck = *CurrLatchCheckOpt; + // At this point the range check step and latch step should have the same + // value and type. + assert(Step == CurrLatchCheck.IV->getStepRecurrence(*SE) && + "Range and latch should have same step recurrence!"); // Generate the widened condition: // guardStart u< guardLimit && // latchLimit guardLimit - 1 - guardStart + latchStart @@ -348,8 +402,8 @@ // header comment for the reasoning. const SCEV *GuardStart = RangeCheckIV->getStart(); const SCEV *GuardLimit = RangeCheck->Limit; - const SCEV *LatchStart = LatchCheck.IV->getStart(); - const SCEV *LatchLimit = LatchCheck.Limit; + const SCEV *LatchStart = CurrLatchCheck.IV->getStart(); + const SCEV *LatchLimit = CurrLatchCheck.Limit; // guardLimit - guardStart + latchStart - 1 const SCEV *RHS = @@ -357,7 +411,7 @@ SE->getMinusSCEV(LatchStart, SE->getOne(Ty))); ICmpInst::Predicate LimitCheckPred; - switch (LatchCheck.Pred) { + switch (CurrLatchCheck.Pred) { case ICmpInst::ICMP_ULT: LimitCheckPred = ICmpInst::ICMP_ULE; break; @@ -510,6 +564,36 @@ return Result; } +// Returns true if its safe to truncate the IV to RangeCheckType. +bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) { + if (!EnableIVTruncation) + return false; + assert(DL->getTypeSizeInBits(LatchCheck.IV->getType()) > + DL->getTypeSizeInBits(RangeCheckType) && + "Expected latch check IV type to be larger than range check operand " + "type!"); + // The start and end values of the IV should be known. This is to guarantee + // that truncating the wide type will not lose information. + auto *Limit = dyn_cast(LatchCheck.Limit); + auto *Start = dyn_cast(LatchCheck.IV->getStart()); + if (!Limit || !Start) + return false; + // This check makes sure that the IV does not change sign during loop + // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE, + // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the + // IV wraps around, and the truncation of the IV would lose the range of + // iterations between 2^32 and 2^64. + bool Increasing; + if (!SE->isMonotonicPredicate(LatchCheck.IV, LatchCheck.Pred, Increasing)) + return false; + // The active bits should be less than the bits in the RangeCheckType. This + // guarantees that truncating the latch check to RangeCheckType is a safe + // operation. + auto RangeCheckTypeBitSize = DL->getTypeSizeInBits(RangeCheckType); + return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize && + Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize; +} + bool LoopPredication::runOnLoop(Loop *Loop) { L = Loop; Index: test/Transforms/LoopPredication/widened.ll =================================================================== --- /dev/null +++ test/Transforms/LoopPredication/widened.ll @@ -0,0 +1,138 @@ +; RUN: opt -S -loop-predication -loop-predication-enable-iv-truncation=true < %s 2>&1 | FileCheck %s +declare void @llvm.experimental.guard(i1, ...) + +declare i32 @length(i8*) + +declare i16 @short_length(i8*) +; Consider range check of type i16 and i32, while IV is of type i64 +; We can loop predicate this because the IV range is within i16 and within i32. +define i64 @iv_wider_type_rc_two_narrow_types(i32 %offA, i16 %offB, i8* %arrA, i8* %arrB) { +; CHECK-LABEL: iv_wider_type_rc_two_narrow_types +entry: +; CHECK-LABEL: entry: +; CHECK: [[idxB:[^ ]+]] = sub i16 %lengthB, %offB +; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i16 16, [[idxB]] +; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i16 %offB, %lengthB +; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]] +; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 %lengthA, %offA +; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 16, [[idxA]] +; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA +; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]] + %lengthA = call i32 @length(i8* %arrA) + %lengthB = call i16 @short_length(i8* %arrB) + br label %loop + +loop: +; CHECK-LABEL: loop: +; CHECK: [[invariant_check:[^ ]+]] = and i1 [[WideChkB]], [[WideChkA]] +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[invariant_check]], i32 9) + %iv = phi i64 [0, %entry ], [ %iv.next, %loop ] + %iv.trunc.32 = trunc i64 %iv to i32 + %iv.trunc.16 = trunc i64 %iv to i16 + %indexA = add i32 %iv.trunc.32, %offA + %indexB = add i16 %iv.trunc.16, %offB + %rcA = icmp ult i32 %indexA, %lengthA + %rcB = icmp ult i16 %indexB, %lengthB + %wide.chk = and i1 %rcA, %rcB + call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk, i32 9) [ "deopt"() ] + %indexA.ext = zext i32 %indexA to i64 + %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext + %eltA = load i8, i8* %addrA + %indexB.ext = zext i16 %indexB to i64 + %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext + store i8 %eltA, i8* %addrB + %iv.next = add nuw nsw i64 %iv, 1 + %latch.check = icmp ult i64 %iv.next, 16 + br i1 %latch.check, label %loop, label %exit + +exit: + ret i64 %iv +} + + +; Consider an IV of type long and an array access into int array. +; IV is of type i64 while the range check operands are of type i32 and i64. +define i64 @iv_rc_different_types(i32 %offA, i32 %offB, i8* %arrA, i8* %arrB, i64 %max) +{ +; CHECK-LABEL: iv_rc_different_types +entry: +; CHECK-LABEL: entry: +; CHECK: [[lenB:[^ ]+]] = add i32 %lengthB, -1 +; CHECK-NEXT: [[idxB:[^ ]+]] = sub i32 [[lenB]], %offB +; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i32 15, [[idxB]] +; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i32 %offB, %lengthB +; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]] +; CHECK-NEXT: [[maxMinusOne:[^ ]+]] = add i64 %max, -1 +; CHECK-NEXT: [[limit_checkMax:[^ ]+]] = icmp ule i64 15, [[maxMinusOne]] +; CHECK-NEXT: [[first_iteration_checkMax:[^ ]+]] = icmp ult i64 0, %max +; CHECK-NEXT: [[WideChkMax:[^ ]+]] = and i1 [[first_iteration_checkMax]], [[limit_checkMax]] +; CHECK-NEXT: [[lenA:[^ ]+]] = add i32 %lengthA, -1 +; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 [[lenA]], %offA +; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 15, [[idxA]] +; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA +; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]] + %lengthA = call i32 @length(i8* %arrA) + %lengthB = call i32 @length(i8* %arrB) + br label %loop + +loop: +; CHECK-LABEL: loop: +; CHECK: [[BandMax:[^ ]+]] = and i1 [[WideChkB]], [[WideChkMax]] +; CHECK: [[ABandMax:[^ ]+]] = and i1 [[BandMax]], [[WideChkA]] +; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[ABandMax]], i32 9) + %iv = phi i64 [0, %entry ], [ %iv.next, %loop ] + %iv.trunc = trunc i64 %iv to i32 + %indexA = add i32 %iv.trunc, %offA + %indexB = add i32 %iv.trunc, %offB + %rcA = icmp ult i32 %indexA, %lengthA + %rcIV = icmp ult i64 %iv, %max + %wide.chk = and i1 %rcA, %rcIV + %rcB = icmp ult i32 %indexB, %lengthB + %wide.chk.final = and i1 %wide.chk, %rcB + call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk.final, i32 9) [ "deopt"() ] + %indexA.ext = zext i32 %indexA to i64 + %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext + %eltA = load i8, i8* %addrA + %indexB.ext = zext i32 %indexB to i64 + %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext + %eltB = load i8, i8* %addrB + %result = xor i8 %eltA, %eltB + store i8 %result, i8* %addrA + %iv.next = add nuw nsw i64 %iv, 1 + %latch.check = icmp ult i64 %iv, 15 + br i1 %latch.check, label %loop, label %exit + +exit: + ret i64 %iv +} + +; cannot narrow the IV to the range type, because we lose information. +; for (i64 i= 5; i>= 2; i++) +; this loop wraps around after reaching 2^64. +define i64 @iv_rc_different_type(i32 %offA, i8* %arrA) { +; CHECK-LABEL: iv_rc_different_type +entry: + %lengthA = call i32 @length(i8* %arrA) + br label %loop + +loop: +; CHECK-LABEL: loop: +; CHECK: %rcA = icmp ult i32 %indexA, %lengthA +; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %rcA, i32 9) + %iv = phi i64 [ 5, %entry ], [ %iv.next, %loop ] + %iv.trunc.32 = trunc i64 %iv to i32 + %indexA = add i32 %iv.trunc.32, %offA + %rcA = icmp ult i32 %indexA, %lengthA + call void (i1, ...) @llvm.experimental.guard(i1 %rcA, i32 9) [ "deopt"() ] + %indexA.ext = zext i32 %indexA to i64 + %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext + %eltA = load i8, i8* %addrA + %res = add i8 %eltA, 2 + store i8 %eltA, i8* %addrA + %iv.next = add i64 %iv, 1 + %latch.check = icmp sge i64 %iv.next, 2 + br i1 %latch.check, label %loop, label %exit + +exit: + ret i64 %iv +}