diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -424,6 +424,12 @@ void getShuffleMaskWithWidestElts(ArrayRef Mask, SmallVectorImpl &ScaledMask); +/// Cast the i1 mask vector \p V to a scalar integer value, +/// i.e. if any lane is set in the mask vector then the casted result +/// is non-zero, otherwise it is zero +Value *createVectorToScalarCast(Value *V, IRBuilderBase &Builder, + const TargetTransformInfo *TTI); + /// Splits and processes shuffle mask depending on the number of input and /// output registers. The function does 2 main things: 1) splits the /// source/destination vectors into real registers; 2) do the mask analysis to diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -423,6 +423,62 @@ ScaledMask.assign(InputMask.begin(), InputMask.end()); } +/// Cast the i1 mask vector \p V to a scalar integer value, +/// i.e. if any lane is set in the mask vector then the casted result +/// is non-zero, otherwise it is zero +Value *llvm::createVectorToScalarCast(Value *V, IRBuilderBase &Builder, + const TargetTransformInfo *TTI) { + VectorType *VTy = dyn_cast(V->getType()); + assert(VTy && "Expected vector Type"); + assert(VTy->getElementType()->isIntegerTy(1) && "Expected vector of I1 type"); + unsigned VF = VTy->getElementCount().getKnownMinValue(); + TypeSize WidestRegister = + TTI->getRegisterBitWidth(TargetTransformInfo::RGK_Scalar); + unsigned ScalarRegSize = WidestRegister.getKnownMinValue(); + // If ptest not feasible then generate the default + // cast based checks + if (VF <= 4) { + // Create the intermediate vector type cast if required + // i.e. 
promote <4 x i1> to <4 x i8> type + Type *VecTy = FixedVectorType::get(Builder.getIntNTy(8), VF); + V = Builder.CreateZExt(V, VecTy); + // Create the scalar type cast, by casting the vector type to the scalar + // type i.e. convert <4 x i8> type to i32 type + return Builder.CreateBitCast(V, Builder.getIntNTy(VF * 8)); + } else if (ScalarRegSize >= VF) { + // Convert <16 x i1> type to i16 type + return Builder.CreateBitCast(V, Builder.getIntNTy(VF)); + } else { + assert(((VF % ScalarRegSize) == 0) && + "VF is not divisible by ScalarRegSize"); + Value *ScalarCastRes = nullptr; + for (unsigned I = 0; I < (VF / ScalarRegSize); I++) { + // 1. SubVector Extract + // %vec.extract = shufflevector <128 x i1> %Vector, <128 x i1> %undef, <32 + // x i32> Mask + SmallVector Indices; + Indices.clear(); + for (unsigned Index = 0; Index < ScalarRegSize; Index++) + Indices.push_back(ConstantInt::get(Builder.getInt32Ty(), + (I * ScalarRegSize) + Index)); + Constant *Mask = ConstantVector::get(Indices); + Value *SubVec = + Builder.CreateShuffleVector(V, PoisonValue::get(V->getType()), Mask); + // 2. Cast to Scalar + // Convert <32 x i1> type to i32 type + Value *ScalarVal = + Builder.CreateBitCast(SubVec, Builder.getIntNTy(ScalarRegSize)); + // 3. OR With Previous Result If Any + if (ScalarCastRes) { + ScalarCastRes = Builder.CreateOr(ScalarCastRes, ScalarVal); + } else { + ScalarCastRes = ScalarVal; + } + } + return ScalarCastRes; + } +} + void llvm::processShuffleMasks( ArrayRef Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref NoInputAction, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -268,6 +268,9 @@ /// Loop Info analysis. LoopInfo *LI; + /// DominatorTree + DominatorTree *DT; + /// Target Library Info. 
const TargetLibraryInfo *TLI; @@ -298,7 +301,8 @@ VPBuilder Builder; public: - LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, + LoopVectorizationPlanner(Loop *L, LoopInfo *LI, DominatorTree *DT, + const TargetLibraryInfo *TLI, const TargetTransformInfo &TTI, LoopVectorizationLegality *Legal, LoopVectorizationCostModel &CM, @@ -306,8 +310,8 @@ PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, OptimizationRemarkEmitter *ORE) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), - PSE(PSE), Hints(Hints), ORE(ORE) {} + : OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), + IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {} /// Plan how to best vectorize, return the best VF and its cost, or /// std::nullopt if vectorization and interleaving should be avoided up front. diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -172,6 +172,10 @@ STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization"); STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized"); +cl::opt EnableBOSCCVectorization( + "enable-boscc-vectorization", cl::init(false), cl::Hidden, + cl::desc("Enable BOSCC Vectorizer")); + static cl::opt EnableEpilogueVectorization( "enable-epilogue-vectorization", cl::init(true), cl::Hidden, cl::desc("Enable vectorization of epilogue loops.")); @@ -199,6 +203,10 @@ "vectorize-memory-check-threshold", cl::init(128), cl::Hidden, cl::desc("The maximum allowed number of runtime memory checks")); +static cl::opt BOSCCInstructionInBlockThreshold( + "boscc-instructions-in-threshold", cl::init(5), cl::Hidden, + cl::desc("The minimum instructions in a block required for boscc")); + // Option prefer-predicate-over-epilogue indicates that an epilogue is undesired, // that predication is preferred, and this 
lists all options. I.e., the // vectorizer will try to fold the tail-loop (epilogue) into the vector body @@ -935,6 +943,37 @@ void printDebugTracesAtStart() override; void printDebugTracesAtEnd() override; }; + +// Block level BOSCC vectorization planner +class BOSCCBlockPlanner { + BasicBlock *BB; + VPBasicBlock *VPBB; + Loop *OrigLoop; + DominatorTree *DT; + const TargetTransformInfo *TTI; + VPBuilder &Builder; + VPBasicBlock *VPGuardBlock; + VPBasicBlock *VPJoinBlock; + VPBasicBlock *VPVecContinueBlock; + +public: + BOSCCBlockPlanner(BasicBlock *BB, VPBasicBlock *VPBB, Loop *OrigLoop, + DominatorTree *DT, const TargetTransformInfo *TTI, + VPBuilder &Builder) + : BB(BB), VPBB(VPBB), OrigLoop(OrigLoop), DT(DT), TTI(TTI), + Builder(Builder), VPGuardBlock(nullptr), VPJoinBlock(nullptr), + VPVecContinueBlock(nullptr) {} + + bool isBlockLegalForBOSCC(); + bool isBlockProfitableForBOSCC(); + bool isBlockLegalAndProfitableForBOSCC(); + VPBasicBlock *getVPGuardBlock() { return VPGuardBlock; } + VPBasicBlock *getVPJoinBlock() { return VPJoinBlock; } + VPBasicBlock *getVPVecContinueBlock() { return VPVecContinueBlock; } + VPBasicBlock *createBOSCCBlocks(); + bool needBOSCCLiveOut(Instruction *I); +}; + } // end namespace llvm /// Look for a meaningful debug location on the instruction or it's @@ -1091,6 +1130,7 @@ isa(CurRec) || isa(CurRec) || isa(CurRec) || + isa(CurRec) || isa(CurRec)) continue; @@ -8071,6 +8111,110 @@ }); } +// BOSCC Legal Check +bool BOSCCBlockPlanner::isBlockLegalForBOSCC() { + if (!EnableBOSCCVectorization) + return false; + if (ForceTargetSupportsScalableVectors) + return false; + if (TTI->enableScalableVectorization()) + return false; + BasicBlock *Latch = OrigLoop->getLoopLatch(); + return !DT->dominates(BB, Latch); +} + +// BOSCC Profitability Check +bool BOSCCBlockPlanner::isBlockProfitableForBOSCC() { + // TBD: At this point the profitability is controlled by a threshold + // This can be improved later. 
+ return (BB->sizeWithoutDebug() > BOSCCInstructionInBlockThreshold); +} + +// BOSCC Legal & Profitability Check +bool BOSCCBlockPlanner::isBlockLegalAndProfitableForBOSCC() { + return isBlockLegalForBOSCC() && isBlockProfitableForBOSCC(); +} + +// Creates the BOSCC Block layout for a given conditional block +// +// This creates the "GUARD", "BOSCC.VEC", "BOSCC.VEC.CONTINUE" & "BOSCC.JOIN" +// +// Consider below block layout +// +// | \ +// | \ +// | BB +// | / +// | / +// +// It gets transformed to: +// +// | \ +// | \ +// | BB.BOSCC.GUARD +// | | \ +// | | \ +// | | BB.BOSCC.VEC +// | | | +// | | | +// | | BB.BOSCC.VEC.CONTINUE +// | | / +// | | / +// | BB.BOSCC.JOIN +// | / +// +// BB.BOSCC.GUARD : This serves the purpose of guarding with the right condition +// BB.BOSCC.VEC : This is the vector block that corresponds to BB +// BB.BOSCC.VEC.CONTINUE: Auxiliary block to facilitate control flow +// BB.BOSCC.JOIN: Required for PHI generation for the live out from BB +// +VPBasicBlock *BOSCCBlockPlanner::createBOSCCBlocks() { + Builder.setInsertPoint(VPBB); + VPBasicBlock *SuccBlock = cast(VPBB->getSingleSuccessor()); + VPBlockUtils::disconnectBlocks(VPBB, SuccBlock); + VPBasicBlock *VPVecBlock = new VPBasicBlock(); + VPVecBlock->setParent(VPBB->getParent()); + VPVecContinueBlock = new VPBasicBlock(); + VPVecContinueBlock->setParent(VPBB->getParent()); + VPJoinBlock = new VPBasicBlock(); + VPJoinBlock->setParent(VPBB->getParent()); + VPGuardBlock = VPBB; + VPGuardBlock->setName(BB->getName() + ".boscc.guard"); + VPVecBlock->setName(BB->getName() + ".boscc.vec"); + VPVecContinueBlock->setName(BB->getName() + ".boscc.vec.continue"); + VPJoinBlock->setName(BB->getName() + ".boscc.join"); + VPGuardBlock->markBOSCCBlock(); + VPVecBlock->markBOSCCBlock(); + VPJoinBlock->markBOSCCBlock(); + VPVecContinueBlock->markBOSCCBlock(); + VPBlockUtils::insertTwoBlocksAfter(VPVecBlock, VPJoinBlock, VPGuardBlock); + VPBlockUtils::connectBlocks(VPVecBlock, VPVecContinueBlock); + 
VPBlockUtils::connectBlocks(VPVecContinueBlock, VPJoinBlock); + VPBlockUtils::connectBlocks(VPJoinBlock, SuccBlock); + return VPVecBlock; +} + +// Identify if there is a need for BOSCC liveout for the +// given instruction. +bool BOSCCBlockPlanner::needBOSCCLiveOut(Instruction *I) { + // There has to be a join block to generate live outs + if (!getVPJoinBlock()) + return false; + // If the instruction has usage across basic block + for (Use &U : I->uses()) { + Instruction *UseInst = dyn_cast(U.getUser()); + if (!UseInst || (UseInst->getParent() == I->getParent())) + continue; + return true; + } + // If a condition plays a role in terminator branch + // Required for nested conditional statements. + BranchInst *TermBr = dyn_cast(I->getParent()->getTerminator()); + if (TermBr && TermBr->isConditional() && (TermBr->getCondition() == I)) + return true; + return false; +} + bool LoopVectorizationPlanner::getDecisionAndClampRange( const std::function &Predicate, VFRange &Range) { assert(!Range.isEmpty() && "Trying to test an empty VF range."); @@ -8920,12 +9064,31 @@ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { // Relevant instructions from basic block BB will be grouped into VPRecipe // ingredients and fill a new VPBasicBlock. 
+ // Init the BOSCC Planner and check legality and profitability + BOSCCBlockPlanner BOSCCPlanner(BB, VPBB, OrigLoop, DT, &TTI, Builder); + bool BOSCCRequiredOnBlock = + BOSCCPlanner.isBlockLegalAndProfitableForBOSCC(); + if (BOSCCRequiredOnBlock) { + // Mark plan with BOSCC style vectorization + Plan->markPlanWithBOSCC(); + // Create BOSCC block layout + VPBB = BOSCCPlanner.createBOSCCBlocks(); + // Create the recipe to generate required check in guard block + VPValue *BlockInMask = RecipeBuilder.createBlockInMask(BB, *Plan); + VPBasicBlock *VPGuardBlock = BOSCCPlanner.getVPGuardBlock(); + auto *BOMRecipe = new VPBranchOnBOSCCGuardRecipe(BlockInMask, &TTI); + VPGuardBlock->appendRecipe(BOMRecipe); + } + if (VPBB != HeaderVPBB) - VPBB->setName(BB->getName()); + VPBB->setName(BB->getName() + (BOSCCRequiredOnBlock ? ".boscc" : "")); + Builder.setInsertPoint(VPBB); // Introduce each ingredient into VPlan. // TODO: Model and preserve debug intrinsics in VPlan. + MapVector BOSCCLiveOutMap; + BOSCCLiveOutMap.clear(); for (Instruction &I : BB->instructionsWithoutDebug(false)) { Instruction *Instr = &I; @@ -8934,6 +9097,10 @@ if (isa(Instr) || DeadInstructions.count(Instr)) continue; + // Check the need for BOSCC LiveOut + bool BOSCCLiveOutRequired = BOSCCRequiredOnBlock && + BOSCCPlanner.needBOSCCLiveOut(Instr); + SmallVector Operands; auto *Phi = dyn_cast(Instr); if (Phi && Phi->getParent() == OrigLoop->getHeader()) { @@ -8981,6 +9148,32 @@ Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); } else VPBB->appendRecipe(Recipe); + + // Create the BOSCC LiveOuts + if (BOSCCLiveOutRequired) { + // Create the recipe to generate required PHI nodes in Join block + VPBasicBlock *VPJoinBlock = BOSCCPlanner.getVPJoinBlock(); + Builder.setInsertPoint(VPJoinBlock); + VPValue *ScalarLiveOut = Plan->getVPValueOrAddLiveIn(Instr); + auto *LiveOutRecipe = new VPBOSCCLiveOutRecipe( + ScalarLiveOut, BOSCCPlanner.getVPGuardBlock(), + BOSCCPlanner.getVPVecContinueBlock()); + 
VPJoinBlock->appendRecipe(LiveOutRecipe); + BOSCCLiveOutMap[Instr] = LiveOutRecipe; + Builder.setInsertPoint(VPBB); + } + } + + // Update the Plan for BOSCC live outs + for (auto &Itr : BOSCCLiveOutMap) { + Plan->removeVPValueFor(Itr.first); + Plan->addVPValue(Itr.first, Itr.second); + } + + if (BOSCCRequiredOnBlock) { + // Before processing the next block, update the current + // vplan block to boscc join block + VPBB = BOSCCPlanner.getVPJoinBlock(); } VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB); @@ -9950,7 +10143,7 @@ // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints, ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -10290,9 +10483,8 @@ LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, + LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints, ORE); - // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); @@ -10677,7 +10869,7 @@ // vectorization. Until this is addressed, mark these analyses as preserved // only for non-VPlan-native path. // TODO: Preserve Loop and Dominator analyses for VPlan-native path. 
- if (!EnableVPlanNativePath) { + if (!EnableVPlanNativePath && !EnableBOSCCVectorization) { PA.preserve(); PA.preserve(); PA.preserve(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1807,6 +1807,58 @@ } }; +/// A recipe for generating the conditional branch of a BOSCC +/// guard block +class VPBranchOnBOSCCGuardRecipe : public VPRecipeBase { +private: + /// Pointer to the TTI, passed to createVectorToScalarCast for the mask + const TargetTransformInfo *TTI; + +public: + VPBranchOnBOSCCGuardRecipe(VPValue *BlockInMask, + const TargetTransformInfo *TTI) + : VPRecipeBase(VPBranchOnBOSCCGuardSC, {}), TTI(TTI) { + if (BlockInMask) // nullptr means all-one mask. + addOperand(BlockInMask); + } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPBranchOnBOSCCGuardSC; + } + + /// Generate the scalar "any lane set" check of the block mask and the + /// conditional branch. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override { + O << Indent << "BRANCH-ON-BOSCC-GUARD "; + if (VPValue *Mask = getMask()) + Mask->printAsOperand(O, SlotTracker); + else + O << " All-One"; + } +#endif + + /// Return the mask used by this recipe. Note that a full mask is represented + /// by a nullptr. + VPValue *getMask() const { + assert(getNumOperands() <= 1 && "should have either 0 or 1 operands"); + // Mask is optional. + return getNumOperands() == 1 ? getOperand(0) : nullptr; + } + + /// Returns true if the recipe uses scalars of operand \p Op. 
+ bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } +}; + /// A recipe for generating conditional branches on the bits of a mask. class VPBranchOnMaskRecipe : public VPRecipeBase { public: @@ -1850,6 +1902,52 @@ } }; +/// VPBOSCCLiveOutRecipe is a recipe for generating the phi nodes needed when +/// control converges back from a BOSCC guard branch. The phi nodes are needed in +/// order to merge values that are set under such a branch and feed their uses. +/// The phi nodes can be scalar or vector depending on the users of the value. +/// This recipe works in concert with VPBranchOnBOSCCGuardRecipe. +class VPBOSCCLiveOutRecipe : public VPRecipeBase, public VPValue { +private: + VPBasicBlock *VPGuardBlock; + VPBasicBlock *VPVecContinueBlock; + +public: + /// Construct a VPBOSCCLiveOutRecipe given \p PredV whose value needs phi + /// nodes after merging back from a BOSCC guard branch. + VPBOSCCLiveOutRecipe(VPValue *PredV, VPBasicBlock *VPGuardBlock, + VPBasicBlock *VPVecContinueBlock) + : VPRecipeBase(VPBOSCCLiveOutSC, PredV), + VPValue(VPDef::VPPredInstPHISC, nullptr, this), + VPGuardBlock(VPGuardBlock), VPVecContinueBlock(VPVecContinueBlock) {} + + ~VPBOSCCLiveOutRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPBOSCCLiveOutSC; + } + + /// Generates phi nodes for live-outs as needed to retain SSA form. + void execute(VPTransformState &State) override; + + VPBasicBlock *getVPGuardBlock() { return VPGuardBlock; } + VPBasicBlock *getVPVecContinueBlock() { return VPVecContinueBlock; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the recipe uses scalars of operand \p Op. 
+ bool usesScalars(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return true; + } +}; + /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when /// control converges back from a Branch-on-Mask. The phi nodes are needed in /// order to merge values that are set under such a branch and feed their uses. @@ -2200,10 +2298,11 @@ private: /// The VPRecipes held in the order of output instructions to generate. RecipeListTy Recipes; + bool IsBOSCCBlock; public: VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr) - : VPBlockBase(VPBasicBlockSC, Name.str()) { + : VPBlockBase(VPBasicBlockSC, Name.str()), IsBOSCCBlock(false) { if (Recipe) appendRecipe(Recipe); } @@ -2302,6 +2401,8 @@ /// Returns true if the block is exiting it's parent region. bool isExiting() const; + void markBOSCCBlock() { IsBOSCCBlock = true; } + bool isBOSCCBlock() { return IsBOSCCBlock; } private: /// Create an IR BasicBlock to hold the output instructions generated by this @@ -2467,6 +2568,8 @@ /// NOTE: This mapping is temporary and will be removed once all users have /// been modeled in VPlan directly. DenseMap SCEVToExpansion; + /// Indicates whether VPlan contains BOSCC Blocks + bool HasBOSCCBlocks = false; public: /// Construct a VPlan with original preheader \p Preheader, trip count \p TC @@ -2586,6 +2689,12 @@ return getVPValue(V); } + void removeVPValueFor(Value *V) { + assert(Value2VPValueEnabled && + "IR value to VPValue mapping may be out of date!"); + Value2VPValue.erase(V); + } + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Print this VPlan to \p O. 
void print(raw_ostream &O) const; @@ -2653,6 +2762,12 @@ VPBasicBlock *getPreheader() { return Preheader; } const VPBasicBlock *getPreheader() const { return Preheader; } + /// Return true if the VPlan contains BOSCC Blocks + bool hasBOSCCBlocks() { return HasBOSCCBlocks; } + + /// Mark this VPlan as containing BOSCC Blocks + void markPlanWithBOSCC() { HasBOSCCBlocks = true; } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -52,6 +52,8 @@ extern cl::opt EnableVPlanNativePath; } +extern cl::opt EnableBOSCCVectorization; + #define DEBUG_TYPE "vplan" #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -368,11 +370,12 @@ // block. cast(ExitingBB->getTerminator())->setSuccessor(0, NewBB); } else if (PrevVPBB && /* A */ - !((SingleHPred = getSingleHierarchicalPredecessor()) && - SingleHPred->getExitingBasicBlock() == PrevVPBB && - PrevVPBB->getSingleHierarchicalSuccessor() && - (SingleHPred->getParent() == getEnclosingLoopRegion() && - !IsLoopRegion(SingleHPred))) && /* B */ + (getPlan()->hasBOSCCBlocks() || + !((SingleHPred = getSingleHierarchicalPredecessor()) && + SingleHPred->getExitingBasicBlock() == PrevVPBB && + PrevVPBB->getSingleHierarchicalSuccessor() && + (SingleHPred->getParent() == getEnclosingLoopRegion() && + !IsLoopRegion(SingleHPred)))) && /* B */ !(Replica && getPredecessors().empty())) { /* C */ // The last IR basic block is reused, as an optimization, in three cases: // A. 
the first VPBB reuses the loop pre-header BB - when PrevVPBB is null; @@ -464,7 +467,7 @@ const VPRecipeBase *R = &VPBB->back(); auto *VPI = dyn_cast(R); bool IsCondBranch = - isa(R) || + isa(R) || isa(R) || (VPI && (VPI->getOpcode() == VPInstruction::BranchOnCond || VPI->getOpcode() == VPInstruction::BranchOnCount)); (void)IsCondBranch; @@ -754,7 +757,7 @@ } // We do not attempt to preserve DT for outer loop vectorization currently. - if (!EnableVPlanNativePath) { + if (!EnableVPlanNativePath && !EnableBOSCCVectorization) { BasicBlock *VectorHeaderBB = State->CFG.VPBB2IRBB[Header]; State->DT->addNewBlock(VectorHeaderBB, VectorPreHeader); updateDominatorTree(State->DT, VectorHeaderBB, VectorLatchBB, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" @@ -53,6 +54,7 @@ case VPBranchOnMaskSC: case VPScalarIVStepsSC: case VPPredInstPHISC: + case VPBranchOnBOSCCGuardSC: return false; case VPBlendSC: case VPReductionSC: @@ -87,6 +89,7 @@ case VPBranchOnMaskSC: case VPScalarIVStepsSC: case VPPredInstPHISC: + case VPBranchOnBOSCCGuardSC: return false; case VPBlendSC: case VPReductionSC: @@ -996,6 +999,52 @@ } #endif +// Creates the condition inside the boscc-guard block, which +// ensures when all the lanes in mask is set to zero then do +// not execute the respective vector block at runtime. +// i.e. 
+// if.then.boscc.guard: ; preds = %vector.body +// %5 = bitcast <8 x i1> %mask to i8 +// %6 = icmp ne i8 %5, 0 +// br i1 %6, label %if.then.boscc, label %if.then.boscc.join +// +// if.then.boscc: ; preds = %if.then.boscc.guard +// ;; The Vector Block +// +// if.then.boscc.join: +// +void VPBranchOnBOSCCGuardRecipe::execute(VPTransformState &State) { + VPValue *BlockInMask = getMask(); + Value *GuardMask = nullptr; + // Create a common mask by OR-ing the scalarized masks of the different + // unroll instances representing the same condition + auto *CurrentTerminator = State.CFG.PrevBB->getTerminator(); + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + Value *PartMask = nullptr; + if (BlockInMask) + PartMask = State.get(BlockInMask, Part); + else + PartMask = State.Builder.getTrue(); + assert(PartMask && "PartMask Missing"); + if (!PartMask->getType()->isVectorTy()) + PartMask = State.Builder.CreateVectorSplat(State.VF, PartMask, ""); + // Change the vector mask to scalar value, and then generate the condition + // check inside boscc-guard block + Value *ScalarCond = createVectorToScalarCast(PartMask, State.Builder, TTI); + if (!GuardMask) { + GuardMask = ScalarCond; + continue; + } + GuardMask = State.Builder.CreateOr(GuardMask, ScalarCond); + } + + Value *Cond = State.Builder.CreateICmpNE( + GuardMask, ConstantInt::get(GuardMask->getType(), 0)); + auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, Cond); + CondBr->setSuccessor(0, nullptr); + ReplaceInstWithInst(CurrentTerminator, CondBr); +} + void VPBranchOnMaskRecipe::execute(VPTransformState &State) { assert(State.Instance && "Branch on Mask works only on single instance."); @@ -1022,6 +1071,55 @@ ReplaceInstWithInst(CurrentTerminator, CondBr); } +// Create the PHI node in the join block. +// +// It will have 2 incoming values, the first representing the value from the +// vector block, and the second a poison value used when the vector block +// is not executed. +// +// i.e. 
+// if.then.boscc.guard: ; preds = %vector.body +// br i1 %6, label %if.then.boscc, label %if.then.boscc.join +// +// if.then.boscc: ; preds = %if.then.boscc.guard +// ;; The Vector Block +// br label %if.then.boscc.vec.continue +// +// if.then.boscc.vec.continue: ; preds = %if.then.boscc +// br label %if.then.boscc.join +// +// if.then.boscc.join: ; preds = %if.then.boscc.vec.continue, %if.then.boscc.guard +// %12 = phi <8 x i32> [ %SomeComputedValue, %if.then.boscc.vec.continue ], +// [ poison, %if.then.boscc.guard ] +void VPBOSCCLiveOutRecipe::execute(VPTransformState &State) { + VPValue *ScalarLiveOut = getOperand(0); + BasicBlock *GuardBlock = State.CFG.VPBB2IRBB[getVPGuardBlock()]; + BasicBlock *ContinueBlock = State.CFG.VPBB2IRBB[getVPVecContinueBlock()]; + for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) { + if (State.hasVectorValue(ScalarLiveOut, Part)) { + Value *VecVal = State.get(ScalarLiveOut, Part); + PHINode *ExitPhi = State.Builder.CreatePHI(VecVal->getType(), 2); + ExitPhi->addIncoming(VecVal, ContinueBlock); + ExitPhi->addIncoming(PoisonValue::get(VecVal->getType()), GuardBlock); + State.set(this, ExitPhi, Part); + State.reset(ScalarLiveOut, ExitPhi, Part); + } else if (State.hasScalarValue(ScalarLiveOut, {Part, 0})) { + unsigned PartItrs = vputils::isUniformAfterVectorization(ScalarLiveOut) + ? 
1 : State.VF.getKnownMinValue(); + for (unsigned Lane = 0; Lane < PartItrs; ++Lane) { + Value *ScalarVal = + State.get(ScalarLiveOut, {Part, Lane}); + PHINode *ExitPhi = State.Builder.CreatePHI(ScalarVal->getType(), 2); + ExitPhi->addIncoming(ScalarVal, ContinueBlock); + ExitPhi->addIncoming(PoisonValue::get(ScalarVal->getType()), + GuardBlock); + State.set(this, ExitPhi, {Part, Lane}); + State.reset(ScalarLiveOut, ExitPhi, {Part, Lane}); + } + } + } +} + void VPPredInstPHIRecipe::execute(VPTransformState &State) { assert(State.Instance && "Predicated instruction PHI works per instance."); Instruction *ScalarPredInst = @@ -1069,6 +1167,14 @@ } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPBOSCCLiveOutRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "BOSCC-LIVE-OUT-INSTRUCTION "; + printAsOperand(O, SlotTracker); + O << " = "; + printOperands(O, SlotTracker); +} + void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "PHI-PREDICATED-INSTRUCTION "; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -184,10 +184,13 @@ VPValue *getPredicatedMask(VPRegionBlock *R) { auto *EntryBB = dyn_cast(R->getEntry()); if (!EntryBB || EntryBB->size() != 1 || - !isa(EntryBB->begin())) + (!isa(EntryBB->begin()) && + !isa(EntryBB->begin()))) return nullptr; - - return cast(&*EntryBB->begin())->getOperand(0); + if (isa(EntryBB->begin())) + return cast(&*EntryBB->begin())->getOperand(0); + else + return cast(&*EntryBB->begin())->getOperand(0); } /// If \p R is a triangle region, return the 'then' block of the triangle. 
@@ -382,6 +385,8 @@ vp_depth_first_deep(Plan.getEntry()))) { auto *PredVPBB = dyn_cast_or_null(VPBB->getSinglePredecessor()); + if (VPBB->isBOSCCBlock()) + continue; if (PredVPBB && PredVPBB->getNumSuccessors() == 1) WorkList.push_back(VPBB); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -336,6 +336,7 @@ /// type identification. using VPRecipeTy = enum { VPBranchOnMaskSC, + VPBranchOnBOSCCGuardSC, VPDerivedIVSC, VPExpandSCEVSC, VPInstructionSC, @@ -361,6 +362,7 @@ VPWidenPHISC, VPWidenIntOrFpInductionSC, VPWidenPointerInductionSC, + VPBOSCCLiveOutSC, VPReductionPHISC, // END: SubclassID for recipes that inherit VPHeaderPHIRecipe // END: Phi-like recipes diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -147,6 +147,11 @@ if (isa(RecipeI)) NumActiveLaneMaskPhiRecipes++; + if (isa(*RecipeI)) { + RecipeI++; + continue; + } + if (IsHeaderVPBB && !isa(*RecipeI)) { errs() << "Found non-header PHI recipe in header VPBB"; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) diff --git a/llvm/test/Transforms/LoopVectorize/boscc0.ll b/llvm/test/Transforms/LoopVectorize/boscc0.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/boscc0.ll @@ -0,0 +1,132 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize -enable-boscc-vectorization -S %s | FileCheck %s +; +; for (unsigned i = 0; i < len; i++) +; if (X[i]) +; A[i] = B[i] + C[i]; +; + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable +define 
dso_local void @foo(ptr noalias %A, ptr %B, ptr %C, ptr %X, i64 noundef %len) local_unnamed_addr #0 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP9_NOT:%.*]] = icmp eq i64 [[LEN:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP9_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[LEN]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[LEN]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[LEN]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[IF_THEN_BOSCC_JOIN:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4, !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: br label [[IF_THEN_BOSCC_GUARD:%.*]] +; CHECK: if.then.boscc.guard: +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i1> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i8 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[IF_THEN_BOSCC:%.*]], label [[IF_THEN_BOSCC_JOIN]] +; CHECK: if.then.boscc: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP8]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, 
ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP10]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP11]], ptr [[TMP13]], i32 4, <8 x i1> [[TMP4]]), !tbaa [[TBAA5]] +; CHECK-NEXT: br label [[IF_THEN_BOSCC_VEC_CONTINUE:%.*]] +; CHECK: if.then.boscc.vec.continue: +; CHECK-NEXT: br label [[IF_THEN_BOSCC_JOIN]] +; CHECK: if.then.boscc.join: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[LEN]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_010:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[I_010]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_010]] 
+; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[I_010]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_010]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX3]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INC]] = add nuw i64 [[I_010]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[LEN]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; +entry: + %cmp9.not = icmp eq i64 %len, 0 + br i1 %cmp9.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.inc, %entry + ret void + +for.body: ; preds = %entry, %for.inc + %i.010 = phi i64 [ %inc, %for.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %X, i64 %i.010 + %0 = load i32, ptr %arrayidx, align 4, !tbaa !5 + %tobool.not = icmp eq i32 %0, 0 + br i1 %tobool.not, label %for.inc, label %if.then + +if.then: ; preds = %for.body + %arrayidx1 = getelementptr inbounds i32, ptr %B, i64 %i.010 + %1 = load i32, ptr %arrayidx1, align 4, !tbaa !5 + %arrayidx2 = getelementptr inbounds i32, ptr %C, i64 %i.010 + %2 = load i32, ptr %arrayidx2, align 4, !tbaa !5 + %add = add nsw i32 %2, %1 + %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %i.010 + store i32 %add, ptr %arrayidx3, align 4, !tbaa !5 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %inc = add nuw i64 %i.010, 1 + %exitcond.not = icmp eq i64 %inc, %len + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !9 +} + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+avx,+avx2,+crc32,+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" } + +!llvm.module.flags = !{!0, !1, !2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"PIE Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project.git 170277ce39677d5a64139b9848a53fd85b74d29d)"} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = distinct !{!9, !10, !11} +!10 = !{!"llvm.loop.mustprogress"} +!11 = !{!"llvm.loop.unroll.disable"} diff --git a/llvm/test/Transforms/LoopVectorize/boscc1.ll b/llvm/test/Transforms/LoopVectorize/boscc1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/boscc1.ll @@ -0,0 +1,171 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize -enable-boscc-vectorization -S %s | FileCheck %s +; +; for (unsigned i = 0; i < len; i++) +; if (X[i]) +; A[i] = B[i] + C[i]; +; else +; A[i] = D[i] * E[i]; +; + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable +define dso_local void @foo(ptr noalias %A, ptr %B, ptr %C, ptr %D, ptr %E, ptr %X, i64 noundef %len) local_unnamed_addr #0 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP16_NOT:%.*]] = icmp eq i64 [[LEN:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP16_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[LEN]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; 
CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[LEN]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[LEN]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[IF_ELSE_BOSCC_JOIN:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4, !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: br label [[IF_THEN_BOSCC_GUARD:%.*]] +; CHECK: if.then.boscc.guard: +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i1> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i8 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[IF_THEN_BOSCC:%.*]], label [[IF_THEN_BOSCC_JOIN:%.*]] +; CHECK: if.then.boscc: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP8]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP10]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: br label [[IF_THEN_BOSCC_VEC_CONTINUE:%.*]] +; CHECK: if.then.boscc.vec.continue: +; CHECK-NEXT: br label [[IF_THEN_BOSCC_JOIN]] +; CHECK: if.then.boscc.join: +; CHECK-NEXT: [[TMP12:%.*]] = phi <8 x i32> [ [[TMP11]], 
[[IF_THEN_BOSCC_VEC_CONTINUE]] ], [ poison, [[IF_THEN_BOSCC_GUARD]] ] +; CHECK-NEXT: br label [[IF_ELSE_BOSCC_GUARD:%.*]] +; CHECK: if.else.boscc.guard: +; CHECK-NEXT: [[TMP13:%.*]] = bitcast <8 x i1> [[TMP3]] to i8 +; CHECK-NEXT: [[TMP14:%.*]] = icmp ne i8 [[TMP13]], 0 +; CHECK-NEXT: br i1 [[TMP14]], label [[IF_ELSE_BOSCC:%.*]], label [[IF_ELSE_BOSCC_JOIN]] +; CHECK: if.else.boscc: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[D:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP15]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP16]], i32 4, <8 x i1> [[TMP3]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[E:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP18]], i32 4, <8 x i1> [[TMP3]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP19:%.*]] = mul nsw <8 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD2]] +; CHECK-NEXT: br label [[IF_ELSE_BOSCC_VEC_CONTINUE:%.*]] +; CHECK: if.else.boscc.vec.continue: +; CHECK-NEXT: br label [[IF_ELSE_BOSCC_JOIN]] +; CHECK: if.else.boscc.join: +; CHECK-NEXT: [[TMP20:%.*]] = phi <8 x i32> [ [[TMP19]], [[IF_ELSE_BOSCC_VEC_CONTINUE]] ], [ poison, [[IF_ELSE_BOSCC_GUARD]] ] +; CHECK-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP12]], <8 x i32> [[TMP20]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0 +; CHECK-NEXT: store <8 x i32> [[PREDPHI]], ptr [[TMP22]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: 
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[LEN]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[I_017:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[I_017]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP24]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_017]] +; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[I_017]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP26]], [[TMP25]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else: +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[I_017]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[E]], i64 [[I_017]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP28]], [[TMP27]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[ADD_SINK:%.*]] = phi i32 [ [[MUL]], [[IF_ELSE]] ], [ [[ADD]], [[IF_THEN]] ] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds 
i32, ptr [[A]], i64 [[I_017]] +; CHECK-NEXT: store i32 [[ADD_SINK]], ptr [[TMP29]], align 4 +; CHECK-NEXT: [[INC]] = add nuw i64 [[I_017]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[LEN]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; +entry: + %cmp16.not = icmp eq i64 %len, 0 + br i1 %cmp16.not, label %for.cond.cleanup, label %for.body + +for.cond.cleanup: ; preds = %for.inc, %entry + ret void + +for.body: ; preds = %entry, %for.inc + %i.017 = phi i64 [ %inc, %for.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %X, i64 %i.017 + %0 = load i32, ptr %arrayidx, align 4, !tbaa !5 + %tobool.not = icmp eq i32 %0, 0 + br i1 %tobool.not, label %if.else, label %if.then + +if.then: ; preds = %for.body + %arrayidx1 = getelementptr inbounds i32, ptr %B, i64 %i.017 + %1 = load i32, ptr %arrayidx1, align 4, !tbaa !5 + %arrayidx2 = getelementptr inbounds i32, ptr %C, i64 %i.017 + %2 = load i32, ptr %arrayidx2, align 4, !tbaa !5 + %add = add nsw i32 %2, %1 + br label %for.inc + +if.else: ; preds = %for.body + %arrayidx4 = getelementptr inbounds i32, ptr %D, i64 %i.017 + %3 = load i32, ptr %arrayidx4, align 4, !tbaa !5 + %arrayidx5 = getelementptr inbounds i32, ptr %E, i64 %i.017 + %4 = load i32, ptr %arrayidx5, align 4, !tbaa !5 + %mul = mul nsw i32 %4, %3 + br label %for.inc + +for.inc: ; preds = %if.then, %if.else + %add.sink = phi i32 [ %mul, %if.else ], [ %add, %if.then ] + %5 = getelementptr inbounds i32, ptr %A, i64 %i.017 + store i32 %add.sink, ptr %5, align 4 + %inc = add nuw i64 %i.017, 1 + %exitcond.not = icmp eq i64 %inc, %len + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !9 +} + +attributes #0 = { nofree norecurse nosync nounwind memory(argmem: readwrite) uwtable "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" 
"target-features"="+avx,+avx2,+crc32,+cx8,+fxsr,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave" "tune-cpu"="generic" } + +!llvm.module.flags = !{!0, !1, !2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"PIE Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{!"clang version 17.0.0 (https://github.com/llvm/llvm-project.git 170277ce39677d5a64139b9848a53fd85b74d29d)"} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = distinct !{!9, !10, !11} +!10 = !{!"llvm.loop.mustprogress"} +!11 = !{!"llvm.loop.unroll.disable"} diff --git a/llvm/test/Transforms/LoopVectorize/boscc2.ll b/llvm/test/Transforms/LoopVectorize/boscc2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/boscc2.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=loop-vectorize -enable-boscc-vectorization -S %s | FileCheck %s +; +; for (unsigned i = 0; i < len; i++) { +; if (X[i]) { +; A[i] = B[i] + C[i]; +; if (Y[i]) +; D[i] = E[i] * F[i]; +; } +; } +; + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable +define dso_local void @foo(ptr noalias %A, ptr %B, ptr %C, ptr noalias %D, ptr %E, ptr %F, ptr %X, ptr %Y, i32 noundef %len) local_unnamed_addr #0 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP27_NOT:%.*]] = icmp eq i32 [[LEN:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP27_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label 
[[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[IF_THEN10_BOSCC_JOIN:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4, !tbaa [[TBAA5:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: br label [[IF_THEN_BOSCC_GUARD:%.*]] +; CHECK: if.then.boscc.guard: +; CHECK-NEXT: [[TMP4:%.*]] = xor <8 x i1> [[TMP3]], +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i8 [[TMP5]], 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[IF_THEN_BOSCC:%.*]], label [[IF_THEN_BOSCC_JOIN:%.*]] +; CHECK: if.then.boscc: +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP8]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP10]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: call void 
@llvm.masked.store.v8i32.p0(<8 x i32> [[TMP11]], ptr [[TMP13]], i32 4, <8 x i1> [[TMP4]]), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[Y:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP15]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq <8 x i32> [[WIDE_MASKED_LOAD2]], zeroinitializer +; CHECK-NEXT: br label [[IF_THEN_BOSCC_VEC_CONTINUE:%.*]] +; CHECK: if.then.boscc.vec.continue: +; CHECK-NEXT: br label [[IF_THEN_BOSCC_JOIN]] +; CHECK: if.then.boscc.join: +; CHECK-NEXT: [[TMP17:%.*]] = phi <8 x i1> [ [[TMP16]], [[IF_THEN_BOSCC_VEC_CONTINUE]] ], [ poison, [[IF_THEN_BOSCC_GUARD]] ] +; CHECK-NEXT: br label [[IF_THEN10_BOSCC_GUARD:%.*]] +; CHECK: if.then10.boscc.guard: +; CHECK-NEXT: [[TMP18:%.*]] = xor <8 x i1> [[TMP17]], +; CHECK-NEXT: [[TMP19:%.*]] = select <8 x i1> [[TMP4]], <8 x i1> [[TMP18]], <8 x i1> zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = bitcast <8 x i1> [[TMP19]] to i8 +; CHECK-NEXT: [[TMP21:%.*]] = icmp ne i8 [[TMP20]], 0 +; CHECK-NEXT: br i1 [[TMP21]], label [[IF_THEN10_BOSCC:%.*]], label [[IF_THEN10_BOSCC_JOIN]] +; CHECK: if.then10.boscc: +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[E:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP23]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[F:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP25]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP26:%.*]] = mul nsw <8 x i32> [[WIDE_MASKED_LOAD4]], 
[[WIDE_MASKED_LOAD3]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[D:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP27]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP26]], ptr [[TMP28]], i32 4, <8 x i1> [[TMP19]]), !tbaa [[TBAA5]] +; CHECK-NEXT: br label [[IF_THEN10_BOSCC_VEC_CONTINUE:%.*]] +; CHECK: if.then10.boscc.vec.continue: +; CHECK-NEXT: br label [[IF_THEN10_BOSCC_JOIN]] +; CHECK: if.then10.boscc.join: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP30]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: 
[[TMP32:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[TOBOOL9_NOT:%.*]] = icmp eq i32 [[TMP33]], 0 +; CHECK-NEXT: br i1 [[TOBOOL9_NOT]], label [[FOR_INC]], label [[IF_THEN10:%.*]] +; CHECK: if.then10: +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[E]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP35]], [[TMP34]] +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX16]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; +entry: + %cmp27.not = icmp eq i32 %len, 0 + br i1 %cmp27.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %len to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.inc, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.inc + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + 
%arrayidx = getelementptr inbounds i32, ptr %X, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4, !tbaa !5 + %tobool.not = icmp eq i32 %0, 0 + br i1 %tobool.not, label %for.inc, label %if.then + +if.then: ; preds = %for.body + %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4, !tbaa !5 + %arrayidx4 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %2 = load i32, ptr %arrayidx4, align 4, !tbaa !5 + %add = add nsw i32 %2, %1 + %arrayidx6 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %add, ptr %arrayidx6, align 4, !tbaa !5 + %arrayidx8 = getelementptr inbounds i32, ptr %Y, i64 %indvars.iv + %3 = load i32, ptr %arrayidx8, align 4, !tbaa !5 + %tobool9.not = icmp eq i32 %3, 0 + br i1 %tobool9.not, label %for.inc, label %if.then10 + +if.then10: ; preds = %if.then + %arrayidx12 = getelementptr inbounds i32, ptr %E, i64 %indvars.iv + %4 = load i32, ptr %arrayidx12, align 4, !tbaa !5 + %arrayidx14 = getelementptr inbounds i32, ptr %F, i64 %indvars.iv + %5 = load i32, ptr %arrayidx14, align 4, !tbaa !5 + %mul = mul nsw i32 %5, %4 + %arrayidx16 = getelementptr inbounds i32, ptr %D, i64 %indvars.iv + store i32 %mul, ptr %arrayidx16, align 4, !tbaa !5 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then10, %if.then + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !9 +} + +attributes #0 = { argmemonly nofree norecurse nosync nounwind uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="znver3" 
"target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+clwb,+clzero,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mwaitx,+pclmul,+pku,+popcnt,+prfchw,+rdpid,+rdpru,+rdrnd,+rdseed,+sahf,+sha,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+vaes,+vpclmulqdq,+wbnoinvd,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" } + +!llvm.module.flags = !{!0, !1, !2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"PIE Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git 1fa2019828caec1172382009d5327c265427af57)"} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = distinct !{!9, !10, !11} +!10 = !{!"llvm.loop.mustprogress"} +!11 = !{!"llvm.loop.unroll.disable"}