diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8828,10 +8828,8 @@ // Add the necessary canonical IV and branch recipes required to control the // loop. -static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, - TailFoldingStyle Style) { - Value *StartIdx = ConstantInt::get(IdxTy, 0); - auto *StartV = Plan.getOrAddVPValue(StartIdx); +static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL, bool HasNUW) { + auto *StartV = Plan.getOrAddVPValue(ConstantInt::get(IdxTy, 0)); // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); @@ -8841,7 +8839,6 @@ // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar // IV by VF * UF. - bool HasNUW = Style == TailFoldingStyle::None; auto *CanonicalIVIncrement = new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW : VPInstruction::CanonicalIVIncrement, @@ -8849,87 +8846,102 @@ CanonicalIVPHI->addOperand(CanonicalIVIncrement); VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); - if (useActiveLaneMaskForControlFlow(Style)) { - // Create the active lane mask instruction in the vplan preheader. - VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); - - // We can't use StartV directly in the ActiveLaneMask VPInstruction, since - // we have to take unrolling into account. Each part needs to start at - // Part * VF - auto *CanonicalIVIncrementParts = - new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW - : VPInstruction::CanonicalIVIncrementForPart, - {StartV}, DL, "index.part.next"); - Preheader->appendRecipe(CanonicalIVIncrementParts); - - // Create the ActiveLaneMask instruction using the correct start values. - VPValue *TC = Plan.getOrCreateTripCount(); - - VPValue *TripCount, *IncrementValue; - if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { - // When avoiding a runtime check, the active.lane.mask inside the loop - // uses a modified trip count and the induction variable increment is - // done after the active.lane.mask intrinsic is called. - auto *TCMinusVF = - new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL); - Preheader->appendRecipe(TCMinusVF); - IncrementValue = CanonicalIVPHI; - TripCount = TCMinusVF; - } else { - // When the loop is guarded by a runtime overflow check for the loop - // induction variable increment by VF, we can increment the value before - // the get.active.lane mask and use the unmodified tripcount. - EB->appendRecipe(CanonicalIVIncrement); - IncrementValue = CanonicalIVIncrement; - TripCount = TC; - } + EB->appendRecipe(CanonicalIVIncrement); - auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, - {CanonicalIVIncrementParts, TC}, DL, - "active.lane.mask.entry"); - Preheader->appendRecipe(EntryALM); - - // Now create the ActiveLaneMaskPhi recipe in the main loop using the - // preheader ActiveLaneMask instruction. - auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); - Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); - - // Create the active lane mask for the next iteration of the loop. - CanonicalIVIncrementParts = - new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW - : VPInstruction::CanonicalIVIncrementForPart, - {IncrementValue}, DL); - EB->appendRecipe(CanonicalIVIncrementParts); - - auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, - {CanonicalIVIncrementParts, TripCount}, DL, - "active.lane.mask.next"); - EB->appendRecipe(ALM); - LaneMaskPhi->addOperand(ALM); - - if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { - // Do the increment of the canonical IV after the active.lane.mask, because - // that value is still based off %CanonicalIVPHI - EB->appendRecipe(CanonicalIVIncrement); - } + // Add the BranchOnCount VPInstruction to the latch. + VPInstruction *BranchBack = new VPInstruction( + VPInstruction::BranchOnCount, + {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); + EB->appendRecipe(BranchBack); +} + +// Add the necessary canonical IV and branch recipes required to control the +// loop. +static void addCanonicalIVRecipesWithActiveLaneMaskForControlFlow( + VPlan &Plan, Type *IdxTy, DebugLoc DL, TailFoldingStyle Style) { + auto *StartV = Plan.getOrAddVPValue(ConstantInt::get(IdxTy, 0)); + auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL); + + // Add a VPCanonicalIVPHIRecipe starting at 0 to the header. + VPRegionBlock *TopRegion = Plan.getVectorLoopRegion(); + VPBasicBlock *Header = TopRegion->getEntryBasicBlock(); + Header->insert(CanonicalIVPHI, Header->begin()); - // We have to invert the mask here because a true condition means jumping - // to the exit block. - auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); - EB->appendRecipe(NotMask); + // Add a CanonicalIVIncrement VPInstruction to increment the scalar + // IV by VF * UF. + auto *CanonicalIVIncrement = new VPInstruction( + VPInstruction::CanonicalIVIncrement, {CanonicalIVPHI}, DL, "index.next"); + CanonicalIVPHI->addOperand(CanonicalIVIncrement); + + // Create the active lane mask instruction in the vplan preheader. + VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock(); + + // We can't use StartV directly in the ActiveLaneMask VPInstruction, since + // we have to take unrolling into account. Each part needs to start at + // Part * VF + auto *CanonicalIVIncrementParts = + new VPInstruction(VPInstruction::CanonicalIVIncrementForPart, {StartV}, + DL, "index.part.next"); + Preheader->appendRecipe(CanonicalIVIncrementParts); - VPInstruction *BranchBack = - new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); - EB->appendRecipe(BranchBack); + // Create the ActiveLaneMask instruction using the correct start values. + VPValue *TC = Plan.getOrCreateTripCount(); + + VPValue *TripCount, *IncrementValue; + VPBasicBlock *EB = TopRegion->getExitingBasicBlock(); + if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { + // When avoiding a runtime check, the active.lane.mask inside the loop + // uses a modified trip count and the induction variable increment is + // done after the active.lane.mask intrinsic is called. + auto *TCMinusVF = + new VPInstruction(VPInstruction::CalculateTripCountMinusVF, {TC}, DL); + Preheader->appendRecipe(TCMinusVF); + IncrementValue = CanonicalIVPHI; + TripCount = TCMinusVF; } else { + // When the loop is guarded by a runtime overflow check for the loop + // induction variable increment by VF, we can increment the value before + // the get.active.lane mask and use the unmodified tripcount. + EB->appendRecipe(CanonicalIVIncrement); + IncrementValue = CanonicalIVIncrement; + TripCount = TC; + } + + auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask, + {CanonicalIVIncrementParts, TC}, DL, + "active.lane.mask.entry"); + Preheader->appendRecipe(EntryALM); + + // Now create the ActiveLaneMaskPhi recipe in the main loop using the + // preheader ActiveLaneMask instruction. + auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc()); + Header->insert(LaneMaskPhi, Header->getFirstNonPhi()); + + // Create the active lane mask for the next iteration of the loop. + CanonicalIVIncrementParts = new VPInstruction( + VPInstruction::CanonicalIVIncrementForPart, {IncrementValue}, DL); + EB->appendRecipe(CanonicalIVIncrementParts); + + auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, + {CanonicalIVIncrementParts, TripCount}, DL, + "active.lane.mask.next"); + EB->appendRecipe(ALM); + LaneMaskPhi->addOperand(ALM); + + if (Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) { + // Do the increment of the canonical IV after the active.lane.mask, because + // that value is still based off %CanonicalIVPHI EB->appendRecipe(CanonicalIVIncrement); - - // Add the BranchOnCount VPInstruction to the latch. - VPInstruction *BranchBack = new VPInstruction( - VPInstruction::BranchOnCount, - {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL); - EB->appendRecipe(BranchBack); } + + // We have to invert the mask here because a true condition means jumping + // to the exit block. + auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL); + EB->appendRecipe(NotMask); + + VPInstruction *BranchBack = + new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL); + EB->appendRecipe(BranchBack); } // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the @@ -9029,9 +9041,15 @@ Instruction *DLInst = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), - DLInst ? DLInst->getDebugLoc() : DebugLoc(), - CM.getTailFoldingStyle(IVUpdateMayOverflow)); + auto Style = CM.getTailFoldingStyle(IVUpdateMayOverflow); + if (useActiveLaneMaskForControlFlow(Style)) + addCanonicalIVRecipesWithActiveLaneMaskForControlFlow( + *Plan, Legal->getWidestInductionType(), + DLInst ? DLInst->getDebugLoc() : DebugLoc(), Style); + else + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), + DLInst ? DLInst->getDebugLoc() : DebugLoc(), + Style == TailFoldingStyle::None); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9236,8 +9254,14 @@ Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator(); Term->eraseFromParent(); - addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - CM.getTailFoldingStyle()); + auto Style = CM.getTailFoldingStyle(); + if (useActiveLaneMaskForControlFlow(Style)) + addCanonicalIVRecipesWithActiveLaneMaskForControlFlow( + *Plan, Legal->getWidestInductionType(), DebugLoc(), Style); + else + addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), + Style == TailFoldingStyle::None); + return Plan; }