Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1176,18 +1176,17 @@
     return false;
   }
 
-  // TODO: handle reductions when tail is folded by masking.
-  if (!Reductions.empty()) {
-    reportVectorizationFailure(
-        "Loop has reductions, cannot fold tail by masking",
-        "Cannot fold tail by masking in the presence of reductions.",
-        "ReductionFoldingTailByMasking", ORE, TheLoop);
-    return false;
-  }
+  SmallPtrSet<const Value *, 8> ReductionLiveOuts;
 
-  // TODO: handle outside users when tail is folded by masking.
+  for (auto &Reduction : *getReductionVars())
+    ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
+
+  // TODO: handle non-reduction outside users when tail is folded by masking.
   for (auto *AE : AllowedExit) {
-    // Check that all users of allowed exit values are inside the loop.
+    // Check that all users of allowed exit values are inside the loop or
+    // are the live-out of a reduction.
+    if (ReductionLiveOuts.count(AE))
+      continue;
     for (User *U : AE->users()) {
       Instruction *UI = cast<Instruction>(U);
       if (TheLoop->contains(UI))
Index: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3678,6 +3678,26 @@
   setDebugLocFromInst(Builder, LoopExitInst);
 
+  // If tail is folded by masking, the vector value to leave the loop should be
+  // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
+  // instead of the former.
+  if (Cost->foldTailByMasking()) {
+    for (unsigned Part = 0; Part < UF; ++Part) {
+      Value *VecLoopExitInst =
+          VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+      Value *Sel = nullptr;
+      for (User *U : VecLoopExitInst->users()) {
+        if (isa<SelectInst>(U)) {
+          assert(!Sel && "Reduction exit feeding two selects");
+          Sel = U;
+        } else
+          assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
+      }
+      assert(Sel && "Reduction exit feeds no select");
+      VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
+    }
+  }
+
   // If the vector reduction can be performed in a smaller type, we truncate
   // then extend the loop exit value to enable InstCombine to evaluate the
   // entire expression in the smaller type.
@@ -6939,8 +6959,15 @@
   // If the tail is to be folded by masking, the primary induction variable
   // needs to be represented in VPlan for it to model early-exit masking.
-  if (CM.foldTailByMasking())
+  // Also, both the Phi and the live-out instruction of each reduction are
+  // required in order to introduce a select between them in VPlan.
+  if (CM.foldTailByMasking()) {
     NeedDef.insert(Legal->getPrimaryInduction());
+    for (auto &Reduction : *Legal->getReductionVars()) {
+      NeedDef.insert(Reduction.first);
+      NeedDef.insert(Reduction.second.getLoopExitInstr());
+    }
+  }
 
   // Collect instructions from the original loop that will become trivially dead
   // in the vectorized loop. We don't need to vectorize these instructions. For
@@ -7067,6 +7094,18 @@
   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
   delete PreEntry;
 
+  // Finally, if tail is folded by masking, introduce selects between the phi
+  // and the live-out instruction of each reduction, at the end of the latch.
+  if (CM.foldTailByMasking()) {
+    Builder.setInsertPoint(VPBB);
+    auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
+    for (auto &Reduction : *Legal->getReductionVars()) {
+      VPValue *Phi = Plan->getVPValue(Reduction.first);
+      VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
+      Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
+    }
+  }
+
   std::string PlanName;
   raw_string_ostream RSO(PlanName);
   unsigned VF = Range.Start;
Index: llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/trunk/lib/Transforms/Vectorize/VPlan.cpp
@@ -309,6 +309,14 @@
     State.set(this, V, Part);
     break;
   }
+  case Instruction::Select: {
+    Value *Cond = State.get(getOperand(0), Part);
+    Value *Op1 = State.get(getOperand(1), Part);
+    Value *Op2 = State.get(getOperand(2), Part);
+    Value *V = Builder.CreateSelect(Cond, Op1, Op2);
+    State.set(this, V, Part);
+    break;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
Index: llvm/trunk/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
===================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
@@ -78,6 +78,62 @@
   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
 }
 
+; Check that fold tail under optsize passes the reduction live-out value
+; through a select.
+; int reduction_i32(int *A, int *B, int N) {
+;   int sum = 0;
+;   for (int i = 0; i < N; ++i)
+;     sum += (A[i] + B[i]);
+;   return sum;
+; }
+;
+define i32 @reduction_i32(i32* nocapture readonly %A, i32* nocapture readonly %B, i32 %N) #0 {
+; CHECK-LABEL: @reduction_i32(
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT: [[ACCUM_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ [[ACCUM:%.*]], %vector.body ]
+; CHECK: [[ICMPULE:%.*]] = icmp ule <8 x i64>
+; CHECK: [[LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef)
+; CHECK: [[LOAD2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* {{.*}}, i32 4, <8 x i1> [[ICMPULE]], <8 x i32> undef)
+; CHECK-NEXT: [[ADD:%.*]] = add nsw <8 x i32> [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: [[ACCUM]] = add nuw nsw <8 x i32> [[ADD]], [[ACCUM_PHI]]
+; CHECK: [[LIVEOUT:%.*]] = select <8 x i1> [[ICMPULE]], <8 x i32> [[ACCUM]], <8 x i32> [[ACCUM_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK: middle.block:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[LIVEOUT]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[LIVEOUT]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX5:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF4]]
+; CHECK-NEXT: [[RDX_SHUF6:%.*]] = shufflevector <8 x i32> [[BIN_RDX5]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX7:%.*]] = add <8 x i32> [[BIN_RDX5]], [[RDX_SHUF6]]
+; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[BIN_RDX7]], i32 0
+; CHECK-NEXT: br i1 true, label %for.cond.cleanup, label %scalar.ph
+; CHECK: scalar.ph:
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: [[SUM_1_LCSSA:%.*]] = phi i32 [ {{.*}}, %for.body ], [ [[TMP17]], %middle.block ]
+; CHECK-NEXT: ret i32 [[SUM_1_LCSSA]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %sum.0 = phi i32 [ %sum.1, %for.body ], [ 0, %entry ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidxA = getelementptr inbounds i32, i32* %A, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidxA, align 4
+  %arrayidxB = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidxB, align 4
+  %add = add nsw i32 %1, %0
+  %sum.1 = add nuw nsw i32 %add, %sum.0
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %N
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:
+  ret i32 %sum.1
+}
+
 ; CHECK: !0 = distinct !{!0, !1}
 ; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK-NEXT: !2 = distinct !{!2, !3, !1}
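
Note (not part of the patch): a minimal sketch of the masked vector-loop shape this change produces for the 8-wide i32 sum reduction in the test above. Value names are hypothetical and the induction/trip-count setup is elided; it only restates what the CHECK lines expect. The reduction phi keeps the plain vector add on its backedge; the new select, keyed on the block-in mask, is what the middle block's horizontal reduction reads, so lanes masked off by tail folding contribute the phi's value rather than the masked load's undef passthrough.

  vector.body:
    %vec.phi = phi <8 x i32> [ zeroinitializer, %vector.ph ], [ %red, %vector.body ]
    %mask = icmp ule <8 x i64> %vec.iv, %btc.splat    ; block-in mask folding the tail
    %wide.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %ptr, i32 4, <8 x i1> %mask, <8 x i32> undef)
    %red = add <8 x i32> %wide.load, %vec.phi         ; vectorized LoopExitInst
    %live.out = select <8 x i1> %mask, <8 x i32> %red, <8 x i32> %vec.phi
    ...
  middle.block:
    ; the shuffle/add horizontal reduction now starts from %live.out instead of %red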