Index: lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- lib/Transforms/Vectorize/LoopVectorize.cpp +++ lib/Transforms/Vectorize/LoopVectorize.cpp @@ -569,7 +569,7 @@ Value *getOrCreateTripCount(Loop *NewLoop); /// Returns (and creates if needed) the trip count of the widened loop. - Value *getOrCreateVectorTripCount(Loop *NewLoop); + Value *getOrCreateVectorTripCount(Loop *NewLoop, Loop *OrigLoop = nullptr); /// Emit a bypass check to see if the trip count would overflow, or we /// wouldn't have enough iterations to execute one vector loop. @@ -3217,7 +3217,8 @@ return TripCount; } -Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) { +Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L, + Loop *OrigLoop) { if (VectorTripCount) return VectorTripCount; @@ -3232,6 +3233,73 @@ Constant *Step = ConstantInt::get(TC->getType(), VF * UF); Value *R = Builder.CreateURem(TC, Step, "n.mod.vf"); + assert(OrigLoop && "during first calculation of VectorTripCount, the orig " + "loop should be passed in!"); + + // If the loop terminating condition uses the phi value at loop entry rather + // than the + // induction variable at the current iteration, and + // extract the result at the last element of the vectorized phi (as done in + // fixFirstOrderRecurrence), we may get an incorrect result. + // Note that this + // problem only arises when both these conditions occur: + // 1. TC evenly divides Step, and the scalar loop is not run. + // 2. Some Phi value which is a relation based on the IV phi, is + // extracted out of the loop (this is the firstOrderRecurrenceRelation). + // For correctness, the vectorized part of the loop should be TC - Step times + // and the remaining iterations should be run in the scalar remainder loop. + // Example IR: + // loop: + // %var = phi i32 [ %init, %ph ], [ %sum, %loop ] + // %indvar = phi i32 [ 0, %ph ], [ %indvar.next, %loop ] + // %sum = add i32 %indvar, %a + // %indvar.next = add i32 %indvar, 1 + // %c11 = icmp slt i32 %indvar, 95 + // br i1 %c11, label %exit, label %loop + // exit: + // %var.lcssa = phi i32 [ %var, %loop ] + // ret i32 %var.lcssa + // Here although the trip count is 96, if we run the vectorized loop + // with Step = 4, %var will be extracted as 95 + a since we extract the + // last element of the vectorized %var, whereas it should be 94 + + // a. + // This is because the exit block uses the `var` value (calculated at previous + // iteration) and not the `sum` value + // calculated at current iteration of loop. + auto IsSecondLastIterationExtracted = [&]() { + auto *ExitBlock = OrigLoop->getExitBlock(); + auto *HeaderBlock = OrigLoop->getHeader(); + auto *Cmp = dyn_cast(HeaderBlock->getTerminator()->getOperand(0)); + if (!Cmp) + return false; + // Confirmed that we are effectively running TC - 1 number of iterations. + bool LoopTermConditionUsesPhi = any_of( + Cmp->operands(), [&](Use &U) { return Legal->isInductionVariable(U); }); + if (!LoopTermConditionUsesPhi) + return false; + for (auto &I : *HeaderBlock) { + if (!isa(I)) + break; + // We only care about phi nodes that depend on the IV (i.e. secondary + // induction variables). + if ((Legal->getPrimaryInduction()) == &(cast(I))) + continue; + for (auto *U : I.users()) { + if (auto *PhiUse = dyn_cast(U)) + // At this stage, we know that the value for the secondary IV + // should be the second last element in the vectorized version (since + // we are using the phi). + // However, we don't have the capability in vector.extractelement to + // choose between secondlast and last element. So, we conservatively + // assume that the secondlastiteration will be used and generate the + // loop accordingly. + if (PhiUse->getParent() == ExitBlock) + return true; + } + } + return false; + }; + // If there is a non-reversed interleaved group that may speculatively access // memory out-of-bounds, we need to ensure that there will be at least one // iteration of the scalar epilogue loop. Thus, if the step evenly divides @@ -3239,7 +3307,11 @@ // does not evenly divide the trip count, no adjustment is necessary since // there will already be scalar iterations. Note that the minimum iterations // check ensures that N >= Step. - if (VF > 1 && Legal->requiresScalarEpilogue()) { + // Also, if the second last iteration val is extracted, we should follow this logic of + // (N - Step) iterations in vectorized loop and remaining (atmost Step) + // iterations in scalar loop. + if (VF > 1 && + (Legal->requiresScalarEpilogue() || IsSecondLastIterationExtracted())) { auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0)); R = Builder.CreateSelect(IsZero, Step, R); } @@ -3448,8 +3520,9 @@ // body. In case of overflow we want to directly jump to the scalar remainder // loop. emitMinimumIterationCountCheck(Lp, ScalarPH); - // Now, compare the new count to zero. If it is zero skip the vector loop and - // jump to the scalar loop. + // The loop step is equal to the vectorization factor (num of SIMD elements) + // times the unroll factor (num of SIMD instructions). + Value *CountRoundDown = getOrCreateVectorTripCount(Lp, OrigLoop); emitVectorLoopEnteredCheck(Lp, ScalarPH); // Generate the code to check any assumptions that we've made for SCEV // expressions. @@ -3461,9 +3534,6 @@ emitMemRuntimeChecks(Lp, ScalarPH); // Generate the induction variable. - // The loop step is equal to the vectorization factor (num of SIMD elements) - // times the unroll factor (num of SIMD instructions). - Value *CountRoundDown = getOrCreateVectorTripCount(Lp); Constant *Step = ConstantInt::get(IdxTy, VF * UF); Induction = createInductionVariable(Lp, StartIdx, CountRoundDown, Step, @@ -4070,7 +4140,13 @@ VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch()); // Extract the last vector element in the middle block. This will be the - // initial value for the recurrence when jumping to the scalar loop. + // initial value for the recurrence when jumping to the scalar loop. Note + // that the last element need not always be the correct one, see + // IsSecondLastIterationExtracted. We fix the vector trip count so that the assumption + // of last element holds true. It would have been nice to fix this here (i.e. + // extract from the correct index), but extractElement does not allow anything + // other than a constant as the index. We will not know if it is the last + // element or the element before the last, until runtime. auto *Extract = Incoming; if (VF > 1) { Builder.SetInsertPoint(LoopMiddleBlock->getTerminator()); Index: test/Transforms/LoopVectorize/first-order-recurrence.ll =================================================================== --- /dev/null +++ test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -0,0 +1,100 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -S | FileCheck %s + +; This loop has 96 iterations, which divides evenly with the Step = 4. However, the vectorized part should only be upto 192. +; The remaining scalar iterations will +; be done at the remainder loop, so that we extract the correct value for +; %val.phi = 94 + x. +; If we vectorize the loop completely, we extract the last element of the vector val.phi (which would be incorrect: 95 + x) +; CHECK-LABEL: extract_second_last_iteration +; CHECK: vector.body: +; CHECK: %vec.ind = phi <4 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] +; CHECK: %[[L1:.+]] = add <4 x i32> %vec.ind, %broadcast.splat +; CHECK: index.next = add i32 %index, 4 +; CHECK: icmp eq i32 %index.next, 92 +; CHECK: middle.block: +; CHECK: %vector.recur.extract = extractelement <4 x i32> %[[L1]], i32 3 +; CHECK: scalar.ph: +; CHECK: %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ 0, %min.iters.checked ], [ 0, %entry ] +; CHECK: %bc.resume.val = phi i32 [ 92, %middle.block ], [ 0, %entry ], [ 0, %min.iters.checked ] +; CHECK: for.body: +; CHECK: %inc.phi = phi i32 [ %bc.resume.val, %scalar.ph ], [ %inc, %for.body ] +; CHECK: %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ %addx, %for.body ] +; CHECK: %addx = add i32 %inc.phi, %x +; CHECK: %cmp = icmp eq i32 %inc.phi, 95 +; CHECK: for.end: +; CHECK: %val.phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract, %middle.block ] +; CHECK: ret i32 %val.phi.lcssa +define i32 @extract_second_last_iteration(i32* %cval, i32 %x) { +entry: + br label %for.body + +for.body: + %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ] + %inc = add i32 %inc.phi, 1 + %bc = zext i32 %inc.phi to i64 + %addx = add i32 %inc.phi, %x + %cmp = icmp eq i32 %inc.phi, 95 + br i1 %cmp, label %for.end, label %for.body + +for.end: + ret i32 %val.phi +} + +; same as test above but we also store into index = val.phi. +; FIXME: We could vectorize in this case following the same logic above (92 +; iterations vectorized and remaining is scalarized), +; but currently the legality phase of the vectorizer prevents this. +; CHECK-LABEL: storing_upto_second_last +; CHECK-NOT: vector.body +define i32 @storing_upto_second_last(i32* %cval, i32 %x) { +entry: + br label %for.body + +for.body: +; val.phi at end of loop is 94 + x. + %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ] + %inc = add nsw i32 %inc.phi, 1 + %bc = zext i32 %val.phi to i64 + %c40 = getelementptr inbounds i32, i32 * %cval, i64 %bc + %addx = add nuw nsw i32 %inc.phi, %x + store i32 %val.phi, i32* %c40 + %cmp = icmp eq i32 %inc.phi, 95 + br i1 %cmp, label %for.end, label %for.body + +for.end: + ret i32 0 +} + +; We are extracting the last element from the loop. Make sure that we don't +; unnecessarily create TC - Step iterations (i.e. 92 iterations) for the loop. +; We can completely vectorize the loop at 96 iterations and extract the correct +; value inc.phi = 95. +; CHECK-LABEL: extract_last_iteration +; CHECK: vector.body: +; CHECK: %vec.ind = phi <4 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] +; CHECK: index.next = add i32 %index, 4 +; CHECK: icmp eq i32 %index.next, 96 +; CHECK: middle.block: +; CHECK: %cmp.n = icmp eq i32 96, 96 +; CHECK: br i1 %cmp.n, label %for.end, label %scalar.ph +; CHECK: for.end: +; CHECK: %inc.phi.lcssa = phi i32 [ %inc.phi, %for.body ], [ 95, %middle.block ] +; CHECK: ret i32 %inc.phi.lcssa +define i32 @extract_last_iteration(i32* %cval, i32 %x) { +entry: + br label %for.body + +for.body: + %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ] + %inc = add i32 %inc.phi, 1 + %bc = zext i32 %inc.phi to i64 + %addx = add i32 %inc.phi, %x + %cmp = icmp eq i32 %inc.phi, 95 + br i1 %cmp, label %for.end, label %for.body + +for.end: + ret i32 %inc.phi +}