diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -406,6 +406,11 @@
 bool widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
                           SmallVectorImpl<int> &ScaledMask);
 
+/// Cast a mask vector to a scalar value, i.e. if any lane of the mask
+/// vector is set, then the cast result is expected to be non-zero.
+Value *createVectorToScalarCast(Value *V, IRBuilderBase &Builder,
+                                const TargetTransformInfo *TTI);
+
 /// Splits and processes shuffle mask depending on the number of input and
 /// output registers. The function does 2 main things: 1) splits the
 /// source/destination vectors into real registers; 2) do the mask analysis to
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -538,6 +538,62 @@
   return true;
 }
 
+/// Cast a mask vector to a scalar value, i.e. if any lane of the mask
+/// vector is set, then the cast result is expected to be non-zero.
+Value *llvm::createVectorToScalarCast(Value *V, IRBuilderBase &Builder,
+                                      const TargetTransformInfo *TTI) {
+  VectorType *VTy = dyn_cast<VectorType>(V->getType());
+  assert(VTy && "Expected vector type");
+  assert(VTy->getElementType()->isIntegerTy(1) &&
+         "Expected vector of i1 type");
+  unsigned VF = VTy->getElementCount().getKnownMinValue();
+  TypeSize WidestRegister =
+      TTI->getRegisterBitWidth(TargetTransformInfo::RGK_Scalar);
+  unsigned ScalarRegSize = WidestRegister.getKnownMinSize();
+  // If a direct mask test is not feasible, generate the default
+  // cast-based checks.
+  if (VF <= 4) {
+    // Create the intermediate vector type cast if required,
+    // i.e. promote <4 x i1> to <4 x i8> type.
+    Type *VecTy = FixedVectorType::get(Builder.getIntNTy(8), VF);
+    V = Builder.CreateZExt(V, VecTy);
+    // Create the scalar type cast by casting the vector type to the scalar
+    // type, i.e. convert <4 x i8> type to i32 type.
+    return Builder.CreateBitCast(V, Builder.getIntNTy(VF * 8));
+  } else if (ScalarRegSize >= VF) {
+    // Convert e.g. <16 x i1> type to i16 type.
+    return Builder.CreateBitCast(V, Builder.getIntNTy(VF));
+  } else {
+    assert(((VF % ScalarRegSize) == 0) &&
+           "VF is not divisible by ScalarRegSize");
+    Value *ScalarCastRes = nullptr;
+    for (unsigned I = 0; I < (VF / ScalarRegSize); I++) {
+      // 1. Extract the subvector:
+      // %vec.extract = shufflevector <128 x i1> %Vector, <128 x i1> undef,
+      //                              <32 x i32> Mask
+      SmallVector<Constant *, 8> Indices;
+      for (unsigned Index = 0; Index < ScalarRegSize; Index++)
+        Indices.push_back(ConstantInt::get(Builder.getInt32Ty(),
                                            (I * ScalarRegSize) + Index));
+      Constant *Mask = ConstantVector::get(Indices);
+      Value *SubVec =
+          Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask);
+      // 2. Cast to scalar, i.e. convert <32 x i1> type to i32 type.
+      Value *ScalarVal =
+          Builder.CreateBitCast(SubVec, Builder.getIntNTy(ScalarRegSize));
+      // 3. OR with the previous result, if any.
+      if (ScalarCastRes) {
+        ScalarCastRes = Builder.CreateOr(ScalarCastRes, ScalarVal);
+      } else {
+        ScalarCastRes = ScalarVal;
+      }
+    }
+    return ScalarCastRes;
+  }
+}
+
 void llvm::processShuffleMasks(
     ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
     unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
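For reference, the small-VF path above produces IR of roughly the following shape for a <4 x i1> mask (an illustrative sketch, not taken from this patch's tests; value names are made up):

  %m.ext = zext <4 x i1> %mask to <4 x i8>
  %m.scalar = bitcast <4 x i8> %m.ext to i32
  ; the guard can then test: %any = icmp ne i32 %m.scalar, 0
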
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -257,6 +257,9 @@
   /// Loop Info analysis.
   LoopInfo *LI;
 
+  /// Dominator Tree analysis.
+  DominatorTree *DT;
+
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
 
@@ -284,7 +287,8 @@
   VPBuilder Builder;
 
 public:
-  LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
+  LoopVectorizationPlanner(Loop *L, LoopInfo *LI, DominatorTree *DT,
+                           const TargetLibraryInfo *TLI,
                            const TargetTransformInfo *TTI,
                            LoopVectorizationLegality *Legal,
                            LoopVectorizationCostModel &CM,
@@ -292,8 +296,8 @@
                            PredicatedScalarEvolution &PSE,
                            const LoopVectorizeHints &Hints,
                            OptimizationRemarkEmitter *ORE)
-      : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
-        PSE(PSE), Hints(Hints), ORE(ORE) {}
+      : OrigLoop(L), LI(LI), DT(DT), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
+        IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {}
 
   /// Plan how to best vectorize, return the best VF and its cost, or None if
   /// vectorization and interleaving should be avoided up front.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -175,6 +175,10 @@
 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
 STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
 
+cl::opt<bool> EnableBOSCCVectorization(
+    "enable-boscc-vectorization", cl::init(false), cl::Hidden,
+    cl::desc("Enable BOSCC vectorization."));
+
 static cl::opt<bool> EnableEpilogueVectorization(
     "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
     cl::desc("Enable vectorization of epilogue loops."));
 
@@ -202,6 +206,10 @@
     "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
     cl::desc("The maximum allowed number of runtime memory checks"));
 
+static cl::opt<unsigned> BOSCCInstructionInBlockThreshold(
+    "boscc-instructions-in-threshold", cl::init(5), cl::Hidden,
+    cl::desc("The minimum number of instructions in a block required for BOSCC"));
+
 // Option prefer-predicate-over-epilogue indicates that an epilogue is
 // undesired, that predication is preferred, and this lists all options. I.e.,
 // the vectorizer will try to fold the tail-loop (epilogue) into the vector
 // body
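Both new options are hidden cl::opt flags; assuming an opt binary built with this patch applied, they can be exercised directly, mirroring the RUN lines of the new tests below:

  opt -loop-vectorize -enable-boscc-vectorization -boscc-instructions-in-threshold=5 -S input.ll
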
@@ -890,6 +898,38 @@
   void printDebugTracesAtStart() override;
   void printDebugTracesAtEnd() override;
 };
+
+// Block-level BOSCC vectorization planner.
+class BOSCCBlockPlanner {
+private:
+  BasicBlock *BB;
+  VPBasicBlock *VPBB;
+  Loop *OrigLoop;
+  DominatorTree *DT;
+  const TargetTransformInfo *TTI;
+  VPBuilder &Builder;
+  VPBasicBlock *VPGuardBlock;
+  VPBasicBlock *VPJoinBlock;
+  VPBasicBlock *VPVecContinueBlock;
+
+public:
+  BOSCCBlockPlanner(BasicBlock *BB, VPBasicBlock *VPBB, Loop *OrigLoop,
+                    DominatorTree *DT, const TargetTransformInfo *TTI,
+                    VPBuilder &Builder)
+      : BB(BB), VPBB(VPBB), OrigLoop(OrigLoop), DT(DT), TTI(TTI),
+        Builder(Builder), VPGuardBlock(nullptr), VPJoinBlock(nullptr),
+        VPVecContinueBlock(nullptr) {}
+
+  bool isBlockLegalForBOSCC();
+  bool isBlockProfitableForBOSCC();
+  bool isBlockLegalAndProfitableForBOSCC();
+  VPBasicBlock *getVPGuardBlock() { return VPGuardBlock; }
+  VPBasicBlock *getVPJoinBlock() { return VPJoinBlock; }
+  VPBasicBlock *getVPVecContinueBlock() { return VPVecContinueBlock; }
+  VPBasicBlock *createBOSCCBlocks();
+  bool needBOSCCLiveOut(Instruction *I);
+};
+
 } // end namespace llvm
 
 /// Look for a meaningful debug location on the instruction or its
@@ -2239,7 +2279,7 @@
   // vectorization. Until this is addressed, mark these analyses as preserved
   // only for non-VPlan-native path.
   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
-  if (!EnableVPlanNativePath) {
+  if (!EnableVPlanNativePath && !EnableBOSCCVectorization) {
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
   }
@@ -7989,6 +8029,110 @@
   });
 }
 
+// BOSCC legality check.
+bool BOSCCBlockPlanner::isBlockLegalForBOSCC() {
+  if (!EnableBOSCCVectorization)
+    return false;
+  if (ForceTargetSupportsScalableVectors)
+    return false;
+  if (TTI->enableScalableVectorization())
+    return false;
+  // Blocks that dominate the latch execute unconditionally; only
+  // conditional blocks are candidates for a BOSCC guard.
+  BasicBlock *Latch = OrigLoop->getLoopLatch();
+  return !DT->dominates(BB, Latch);
+}
+
+// BOSCC profitability check.
+bool BOSCCBlockPlanner::isBlockProfitableForBOSCC() {
+  // TBD: At this point profitability is controlled by a simple instruction
+  // count threshold. This can be improved later.
+  return (BB->sizeWithoutDebug() > BOSCCInstructionInBlockThreshold);
+}
+
+// BOSCC legality & profitability check.
+bool BOSCCBlockPlanner::isBlockLegalAndProfitableForBOSCC() {
+  return isBlockLegalForBOSCC() && isBlockProfitableForBOSCC();
+}
+
+// Creates the BOSCC block layout for a given conditional block.
+//
+// This creates the "GUARD", "BOSCC.VEC", "BOSCC.VEC.CONTINUE" & "BOSCC.JOIN"
+// blocks.
+//
+// Consider the block layout below:
+//
+// | \
+// |  \
+// |  BB
+// |  /
+// | /
+//
+// It gets transformed to:
+//
+// | \
+// |  \
+// |  BB.BOSCC.GUARD
+// |  |  \
+// |  |   \
+// |  |   BB.BOSCC.VEC
+// |  |    |
+// |  |    |
+// |  |   BB.BOSCC.VEC.CONTINUE
+// |  |   /
+// |  |  /
+// |  BB.BOSCC.JOIN
+// |  /
+//
+// BB.BOSCC.GUARD: guards the vector block with the right condition.
+// BB.BOSCC.VEC: the vector block corresponding to BB.
+// BB.BOSCC.VEC.CONTINUE: auxiliary block to facilitate control flow.
+// BB.BOSCC.JOIN: required for PHI generation for the live-outs from BB.
+//
+VPBasicBlock *BOSCCBlockPlanner::createBOSCCBlocks() {
+  Builder.setInsertPoint(VPBB);
+  VPBasicBlock *SuccBlock = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
+  VPBlockUtils::disconnectBlocks(VPBB, SuccBlock);
+  VPBasicBlock *VPVecBlock = new VPBasicBlock();
+  VPVecBlock->setParent(VPBB->getParent());
+  VPVecContinueBlock = new VPBasicBlock();
+  VPVecContinueBlock->setParent(VPBB->getParent());
+  VPJoinBlock = new VPBasicBlock();
+  VPJoinBlock->setParent(VPBB->getParent());
+  VPGuardBlock = VPBB;
+  VPGuardBlock->setName(BB->getName() + ".boscc.guard");
+  VPVecBlock->setName(BB->getName() + ".boscc.vec");
+  VPVecContinueBlock->setName(BB->getName() + ".boscc.vec.continue");
+  VPJoinBlock->setName(BB->getName() + ".boscc.join");
+  VPGuardBlock->markBOSCCBlock();
+  VPVecBlock->markBOSCCBlock();
+  VPJoinBlock->markBOSCCBlock();
+  VPVecContinueBlock->markBOSCCBlock();
+  VPBlockUtils::insertTwoBlocksAfter(VPVecBlock, VPJoinBlock, VPGuardBlock);
+  VPBlockUtils::connectBlocks(VPVecBlock, VPVecContinueBlock);
+  VPBlockUtils::connectBlocks(VPVecContinueBlock, VPJoinBlock);
+  VPBlockUtils::connectBlocks(VPJoinBlock, SuccBlock);
+  return VPVecBlock;
+}
+
+// Identify whether a BOSCC live-out is needed for the given instruction.
+bool BOSCCBlockPlanner::needBOSCCLiveOut(Instruction *I) {
+  // There has to be a join block to generate live-outs.
+  if (!getVPJoinBlock())
+    return false;
+  // The instruction is used across basic blocks.
+  for (Use &U : I->uses()) {
+    Instruction *UseInst = dyn_cast<Instruction>(U.getUser());
+    if (!UseInst || (UseInst->getParent() == I->getParent()))
+      continue;
+    return true;
+  }
+  // The instruction feeds the conditional terminator branch; this is
+  // required for nested conditional statements.
+  BranchInst *TermBr = dyn_cast<BranchInst>(I->getParent()->getTerminator());
+  if (TermBr && TermBr->isConditional() && (TermBr->getCondition() == I))
+    return true;
+  return false;
+}
+
 bool LoopVectorizationPlanner::getDecisionAndClampRange(
     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
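To illustrate needBOSCCLiveOut with a sketch (scalar IR, names invented, not from the tests below): both triggering cases can occur in one guarded block, where %add is used in another block and %cmp feeds the conditional terminator of a nested branch, so each would need a PHI in the BOSCC join block:

  if.then:                      ; the guarded block
    %add = add nsw i32 %b, %c  ; used in %for.inc -> needs a live-out PHI
    %cmp = icmp ne i32 %y, 0   ; feeds the terminator -> needs a live-out PHI
    br i1 %cmp, label %if.then10, label %for.inc
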
@@ -8887,8 +9031,26 @@
       // Relevant instructions from basic block BB will be grouped into VPRecipe
       // ingredients and fill a new VPBasicBlock.
       unsigned VPBBsForBB = 0;
+
+      // Initialize the BOSCC planner and check legality and profitability.
+      BOSCCBlockPlanner BOSCCPlanner(BB, VPBB, OrigLoop, DT, TTI, Builder);
+      bool BOSCCRequiredOnBlock =
+          BOSCCPlanner.isBlockLegalAndProfitableForBOSCC();
+      if (BOSCCRequiredOnBlock) {
+        // Mark the plan as using BOSCC-style vectorization.
+        Plan->markPlanWithBOSCC();
+        // Create the BOSCC block layout.
+        VPBB = BOSCCPlanner.createBOSCCBlocks();
+        // Create the recipe to generate the required check in the guard block.
+        VPValue *BlockInMask = RecipeBuilder.createBlockInMask(BB, Plan);
+        VPBasicBlock *VPGuardBlock = BOSCCPlanner.getVPGuardBlock();
+        auto *GuardRecipe = new VPBranchOnBOSCCGuardRecipe(BlockInMask, TTI);
+        VPGuardBlock->appendRecipe(GuardRecipe);
+      }
+
       if (VPBB != HeaderVPBB)
-        VPBB->setName(BB->getName());
+        VPBB->setName(BB->getName() + (BOSCCRequiredOnBlock ? ".boscc" : ""));
+
       Builder.setInsertPoint(VPBB);
 
       // Introduce each ingredient into VPlan.
@@ -8901,6 +9063,10 @@
         if (isa<DbgInfoIntrinsic>(Instr) || DeadInstructions.count(Instr))
           continue;
 
+        // Check the need for a BOSCC live-out.
+        bool BOSCCLiveOutRequired = BOSCCRequiredOnBlock &&
+                                    BOSCCPlanner.needBOSCCLiveOut(Instr);
+
         SmallVector<VPValue *, 4> Operands;
         auto *Phi = dyn_cast<PHINode>(Instr);
         if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
@@ -8950,6 +9116,19 @@
         }
         RecipeBuilder.setRecipe(Instr, Recipe);
         VPBB->appendRecipe(Recipe);
+        // Create the BOSCC live-outs.
+        if (BOSCCLiveOutRequired) {
+          // Create the recipe to generate the required PHI nodes in the
+          // join block.
+          VPBasicBlock *VPJoinBlock = BOSCCPlanner.getVPJoinBlock();
+          Builder.setInsertPoint(VPJoinBlock);
+          VPValue *ScalarLiveOut = Plan->getOrAddVPValue(Instr);
+          auto *LiveOutRecipe = new VPBOSCCLiveOutRecipe(
+              ScalarLiveOut, BOSCCPlanner.getVPGuardBlock(),
+              BOSCCPlanner.getVPVecContinueBlock());
+          VPJoinBlock->appendRecipe(LiveOutRecipe);
+          // RecipeBuilder.resetRecipe(Instr, LiveOutRecipe);
+          Builder.setInsertPoint(VPBB);
+        }
         continue;
       }
 
@@ -8957,6 +9136,21 @@
       // replicated. This may create a successor for VPBB.
       VPBasicBlock *NextVPBB =
          RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
+
+      // Create the BOSCC live-outs.
+      if (BOSCCLiveOutRequired) {
+        // Create the recipe to generate the required PHI nodes in the
+        // join block.
+        VPBasicBlock *VPJoinBlock = BOSCCPlanner.getVPJoinBlock();
+        Builder.setInsertPoint(VPJoinBlock);
+        VPValue *ScalarLiveOut = Plan->getOrAddVPValue(Instr);
+        auto *LiveOutRecipe = new VPBOSCCLiveOutRecipe(
+            ScalarLiveOut, BOSCCPlanner.getVPGuardBlock(),
+            BOSCCPlanner.getVPVecContinueBlock());
+        VPJoinBlock->appendRecipe(LiveOutRecipe);
+        // RecipeBuilder.resetRecipe(Instr, LiveOutRecipe);
+        Builder.setInsertPoint(VPBB);
+      }
+
       if (NextVPBB != VPBB) {
         VPBB = NextVPBB;
         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
@@ -8964,6 +9158,12 @@
       }
     }
 
+    if (BOSCCRequiredOnBlock) {
+      // Before processing the next block, update the current
+      // VPlan block to the BOSCC join block.
+      VPBB = BOSCCPlanner.getVPJoinBlock();
+    }
+
     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
   }
@@ -9949,7 +10149,7 @@
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
+  LoopVectorizationPlanner LVP(L, LI, DT, TLI, TTI, LVL, CM, IAI, PSE, Hints,
+                               ORE);
 
   // Get user vectorization factor.
   ElementCount UserVF = Hints.getWidth();
@@ -10292,7 +10492,7 @@
   CM.collectElementTypesForWidening();
 
   // Use the planner for vectorization.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
+  LoopVectorizationPlanner LVP(L, LI, DT, TLI, TTI, &LVL, CM, IAI, PSE, Hints,
+                               ORE);
 
   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
@@ -10632,7 +10832,7 @@
   // vectorization. Until this is addressed, mark these analyses as preserved
   // only for non-VPlan-native path.
   // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
-  if (!EnableVPlanNativePath) {
+  if (!EnableVPlanNativePath && !EnableBOSCCVectorization) {
     PA.preserve<LoopAnalysis>();
     PA.preserve<DominatorTreeAnalysis>();
   }
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -123,6 +123,10 @@
                                        VFRange &Range, VPBasicBlock *VPBB,
                                        VPlanPtr &Plan);
 
+  /// Reset the recipe created for a given ingredient.
+  void resetRecipe(Instruction *I, VPRecipeBase *R) {
+    Ingredient2Recipe[I] = R;
+  }
+
   /// Set the recipe created for given ingredient. This operation is a no-op for
   /// ingredients that were not marked using a nullptr entry in the map.
   void setRecipe(Instruction *I, VPRecipeBase *R) {
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -765,6 +765,7 @@
            Def->getVPDefID() == VPRecipeBase::VPReplicateSC ||
            Def->getVPDefID() == VPRecipeBase::VPReductionSC ||
            Def->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC ||
+           Def->getVPDefID() == VPRecipeBase::VPBranchOnBOSCCGuardSC ||
            Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC;
   }
 
@@ -1611,6 +1612,58 @@
   }
 };
 
+/// A recipe for generating the conditional branch for the BOSCC
+/// guard block.
+class VPBranchOnBOSCCGuardRecipe : public VPRecipeBase {
+private:
+  /// Pointer to the TTI, needed to create the vector-to-scalar mask cast.
+  const TargetTransformInfo *TTI;
+
+public:
+  VPBranchOnBOSCCGuardRecipe(VPValue *BlockInMask,
+                             const TargetTransformInfo *TTI)
+      : VPRecipeBase(VPBranchOnBOSCCGuardSC, {}), TTI(TTI) {
+    if (BlockInMask) // nullptr means all-one mask.
+      addOperand(BlockInMask);
+  }
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPDef *D) {
+    return D->getVPDefID() == VPRecipeBase::VPBranchOnBOSCCGuardSC;
+  }
+
+  /// Generate the scalar test of the block mask and the conditional branch
+  /// for the guard block.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override {
+    O << Indent << "BRANCH-ON-BOSCC-GUARD ";
+    if (VPValue *Mask = getMask())
+      Mask->printAsOperand(O, SlotTracker);
+    else
+      O << " All-One";
+  }
+#endif
+
+  /// Return the mask used by this recipe. Note that a full mask is represented
+  /// by a nullptr.
+  VPValue *getMask() const {
+    assert(getNumOperands() <= 1 && "should have either 0 or 1 operands");
+    // Mask is optional.
+    return getNumOperands() == 1 ? getOperand(0) : nullptr;
+  }
+
+  /// Returns true if the recipe uses scalars of operand \p Op.
+  bool usesScalars(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return true;
+  }
+};
+
 /// A recipe for generating conditional branches on the bits of a mask.
 class VPBranchOnMaskRecipe : public VPRecipeBase {
 public:
@@ -1657,6 +1710,52 @@
   }
 };
 
+/// VPBOSCCLiveOutRecipe is a recipe for generating the phi nodes needed when
+/// control converges back from a BOSCC guard branch. The phi nodes are needed
+/// in order to merge values that are set under such a branch and feed their
+/// uses. The phi nodes can be scalar or vector depending on the users of the
+/// value. This recipe works in concert with VPBranchOnBOSCCGuardRecipe.
+class VPBOSCCLiveOutRecipe : public VPRecipeBase, public VPValue {
+private:
+  VPBasicBlock *VPGuardBlock;
+  VPBasicBlock *VPVecContinueBlock;
+
+public:
+  /// Construct a VPBOSCCLiveOutRecipe given \p PredV, whose value needs phi
+  /// nodes after merging back from a BOSCC guard branch.
+  VPBOSCCLiveOutRecipe(VPValue *PredV, VPBasicBlock *VPGuardBlock,
+                       VPBasicBlock *VPVecContinueBlock)
+      : VPRecipeBase(VPBOSCCLiveOutSC, PredV),
+        VPValue(VPValue::VPVPredInstPHI, nullptr, this),
+        VPGuardBlock(VPGuardBlock), VPVecContinueBlock(VPVecContinueBlock) {}
+
+  ~VPBOSCCLiveOutRecipe() override = default;
+
+  /// Method to support type inquiry through isa, cast, and dyn_cast.
+  static inline bool classof(const VPDef *D) {
+    return D->getVPDefID() == VPRecipeBase::VPBOSCCLiveOutSC;
+  }
+
+  /// Generates phi nodes for live-outs as needed to retain SSA form.
+  void execute(VPTransformState &State) override;
+
+  VPBasicBlock *getVPGuardBlock() { return VPGuardBlock; }
+  VPBasicBlock *getVPVecContinueBlock() { return VPVecContinueBlock; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe uses scalars of operand \p Op.
+  bool usesScalars(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return true;
+  }
+};
+
 /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
 /// control converges back from a Branch-on-Mask. The phi nodes are needed in
 /// order to merge values that are set under such a branch and feed their uses.
@@ -2014,10 +2113,11 @@
 private:
   /// The VPRecipes held in the order of output instructions to generate.
   RecipeListTy Recipes;
+  /// True if this block is part of a BOSCC guard/vec/continue/join layout.
+  bool IsBOSCCBlock;
 
 public:
   VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
-      : VPBlockBase(VPBasicBlockSC, Name.str()) {
+      : VPBlockBase(VPBasicBlockSC, Name.str()), IsBOSCCBlock(false) {
     if (Recipe)
       appendRecipe(Recipe);
   }
@@ -2116,6 +2216,8 @@
   /// Returns true if the block is exiting its parent region.
   bool isExiting() const;
 
+  void markBOSCCBlock() { IsBOSCCBlock = true; }
+  bool isBOSCCBlock() { return IsBOSCCBlock; }
+
 private:
   /// Create an IR BasicBlock to hold the output instructions generated by this
@@ -2529,6 +2631,9 @@
   /// Values used outside the plan.
   MapVector<PHINode *, VPLiveOut *> LiveOuts;
 
+  /// Indicates whether the VPlan contains BOSCC blocks.
+  bool HasBOSCCBlocks = false;
+
 public:
   VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
     if (Entry)
@@ -2714,6 +2819,12 @@
     return LiveOuts;
   }
 
+  /// Return true if the VPlan contains BOSCC blocks.
+  bool hasBOSCCBlocks() { return HasBOSCCBlocks; }
+
+  /// Mark the VPlan as containing BOSCC blocks.
+  void markPlanWithBOSCC() { HasBOSCCBlocks = true; }
+
 private:
   /// Add to the given dominator tree the header block and every new basic block
   /// that was created between it and the latch block, inclusive.
@@ -2868,6 +2979,8 @@
   if (!VPBB || !PredVPBB || PredVPBB->getNumSuccessors() != 1)
     return nullptr;
 
+  if (VPBB->isBOSCCBlock())
+    return nullptr;
+
   for (VPRecipeBase &R : make_early_inc_range(*VPBB))
     R.moveBefore(*PredVPBB, PredVPBB->end());
   VPBlockUtils::disconnectBlocks(PredVPBB, VPBB);
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -46,6 +46,7 @@
 using namespace llvm;
 
 extern cl::opt<bool> EnableVPlanNativePath;
+extern cl::opt<bool> EnableBOSCCVectorization;
 
 #define DEBUG_TYPE "vplan"
 
@@ -347,11 +348,12 @@
     // block.
     cast<BranchInst>(ExitingBB->getTerminator())->setSuccessor(0, NewBB);
   } else if (PrevVPBB && /* A */
-             !((SingleHPred = getSingleHierarchicalPredecessor()) &&
-               SingleHPred->getExitingBasicBlock() == PrevVPBB &&
-               PrevVPBB->getSingleHierarchicalSuccessor() &&
-               (SingleHPred->getParent() == getEnclosingLoopRegion() &&
-                !IsLoopRegion(SingleHPred))) && /* B */
+             (getPlan()->hasBOSCCBlocks() ||
+              !((SingleHPred = getSingleHierarchicalPredecessor()) &&
+                SingleHPred->getExitingBasicBlock() == PrevVPBB &&
+                PrevVPBB->getSingleHierarchicalSuccessor() &&
+                (SingleHPred->getParent() == getEnclosingLoopRegion() &&
+                 !IsLoopRegion(SingleHPred)))) && /* B */
             !(Replica && getPredecessors().empty())) { /* C */
     // The last IR basic block is reused, as an optimization, in three cases:
     // A. the first VPBB reuses the loop pre-header BB - when PrevVPBB is null;
@@ -443,7 +445,7 @@
   const VPRecipeBase *R = &VPBB->back();
   auto *VPI = dyn_cast<VPInstruction>(R);
   bool IsCondBranch =
-      isa<VPBranchOnMaskRecipe>(R) ||
+      isa<VPBranchOnMaskRecipe>(R) || isa<VPBranchOnBOSCCGuardRecipe>(R) ||
       (VPI && (VPI->getOpcode() == VPInstruction::BranchOnCond ||
                VPI->getOpcode() == VPInstruction::BranchOnCount));
   (void)IsCondBranch;
@@ -742,7 +744,7 @@
   }
 
   // We do not attempt to preserve DT for outer loop vectorization currently.
-  if (!EnableVPlanNativePath) {
+  if (!EnableVPlanNativePath && !EnableBOSCCVectorization) {
     BasicBlock *VectorHeaderBB = State->CFG.VPBB2IRBB[Header];
     State->DT->addNewBlock(VectorHeaderBB, VectorPreHeader);
     updateDominatorTree(State->DT, VectorHeaderBB, VectorLatchBB,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
@@ -49,6 +50,7 @@
     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
         ->mayWriteToMemory();
   case VPBranchOnMaskSC:
+  case VPBranchOnBOSCCGuardSC:
     return false;
   case VPWidenIntOrFpInductionSC:
   case VPWidenCanonicalIVSC:
@@ -80,6 +82,7 @@
     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
        ->mayReadFromMemory();
   case VPBranchOnMaskSC:
+  case VPBranchOnBOSCCGuardSC:
     return false;
   case VPWidenIntOrFpInductionSC:
   case VPWidenCanonicalIVSC:
@@ -934,6 +937,52 @@
 }
 #endif
 
+// Creates the condition inside the boscc-guard block, which ensures that
+// when all lanes of the mask are zero, the corresponding vector block is
+// not executed at runtime, i.e.:
+//
+// if.then.boscc.guard:                       ; preds = %vector.body
+//   %5 = bitcast <8 x i1> %mask to i8
+//   %6 = icmp ne i8 %5, 0
+//   br i1 %6, label %if.then.boscc, label %if.then.boscc.join
+//
+// if.then.boscc:                             ; preds = %if.then.boscc.guard
+//   ;; The vector block.
+//
+// if.then.boscc.join:
+//
+void VPBranchOnBOSCCGuardRecipe::execute(VPTransformState &State) {
+  VPValue *BlockInMask = getMask();
+  Value *GuardMask = nullptr;
+  // Create a common mask by OR-ing the masks of the different unroll
+  // instances representing the same condition.
+  auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
+  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+    Value *PartMask = nullptr;
+    if (BlockInMask)
+      PartMask = State.get(BlockInMask, Part);
+    else
+      PartMask = State.Builder.getTrue();
+    assert(PartMask && "PartMask missing");
+    if (!PartMask->getType()->isVectorTy())
+      PartMask = State.Builder.CreateVectorSplat(State.VF, PartMask, "");
+    // Cast the vector mask to a scalar value, then generate the condition
+    // check inside the boscc-guard block.
+    Value *ScalarCond = createVectorToScalarCast(PartMask, State.Builder, TTI);
+    if (!GuardMask) {
+      GuardMask = ScalarCond;
+      continue;
+    }
+    GuardMask = State.Builder.CreateOr(GuardMask, ScalarCond);
+  }
+
+  Value *Cond = State.Builder.CreateICmpNE(
+      GuardMask, ConstantInt::get(GuardMask->getType(), 0));
+  auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, Cond);
+  CondBr->setSuccessor(0, nullptr);
+  ReplaceInstWithInst(CurrentTerminator, CondBr);
+}
+
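When interleaving (UF > 1), execute() above ORs the per-part scalar mask casts before the final compare; for UF = 2 and VF = 8 the emitted guard would look roughly like this (an illustrative sketch, value names made up, not from the tests below):

  %m0 = bitcast <8 x i1> %mask.part0 to i8
  %m1 = bitcast <8 x i1> %mask.part1 to i8
  %any = or i8 %m0, %m1
  %guard = icmp ne i8 %any, 0
  br i1 %guard, label %if.then.boscc, label %if.then.boscc.join
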
 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
   assert(State.Instance && "Branch on Mask works only on single instance.");
 
@@ -960,6 +1009,54 @@
   ReplaceInstWithInst(CurrentTerminator, CondBr);
 }
 
+// Create the PHI nodes in the join block.
+//
+// Each PHI has two incoming values: the first is the value coming from the
+// vector block, and the second is an undefined value used when the vector
+// block is not executed, i.e.:
+//
+// if.then.boscc.guard:                      ; preds = %vector.body
+//   br i1 %6, label %if.then.boscc, label %if.then.boscc.join
+//
+// if.then.boscc:                            ; preds = %if.then.boscc.guard
+//   ;; The vector block.
+//   br label %if.then.boscc.vec.continue
+//
+// if.then.boscc.vec.continue:               ; preds = %if.then.boscc
+//   br label %if.then.boscc.join
+//
+// if.then.boscc.join: ; preds = %if.then.boscc.vec.continue, %if.then.boscc.guard
+//   %12 = phi <8 x i32> [ %SomeComputedValue, %if.then.boscc.vec.continue ],
+//                       [ undef, %if.then.boscc.guard ]
+void VPBOSCCLiveOutRecipe::execute(VPTransformState &State) {
+  VPValue *ScalarLiveOut = getOperand(0);
+  BasicBlock *GuardBlock = State.CFG.VPBB2IRBB[getVPGuardBlock()];
+  BasicBlock *ContinueBlock = State.CFG.VPBB2IRBB[getVPVecContinueBlock()];
+  for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+    if (State.hasVectorValue(ScalarLiveOut, Part)) {
+      Instruction *VecInst =
+          cast<Instruction>(State.get(ScalarLiveOut, Part));
+      PHINode *ExitPhi = State.Builder.CreatePHI(VecInst->getType(), 2);
+      ExitPhi->addIncoming(VecInst, ContinueBlock);
+      ExitPhi->addIncoming(UndefValue::get(VecInst->getType()), GuardBlock);
+      State.set(this, ExitPhi, Part);
+      State.reset(ScalarLiveOut, ExitPhi, Part);
+    } else if (State.hasScalarValue(ScalarLiveOut, {Part, 0})) {
+      for (unsigned Lane = 0, VF = State.VF.getKnownMinValue(); Lane < VF;
+           ++Lane) {
+        Instruction *ScalarInst =
+            cast<Instruction>(State.get(ScalarLiveOut, {Part, Lane}));
+        PHINode *ExitPhi = State.Builder.CreatePHI(ScalarInst->getType(), 2);
+        ExitPhi->addIncoming(ScalarInst, ContinueBlock);
+        ExitPhi->addIncoming(UndefValue::get(ScalarInst->getType()),
+                             GuardBlock);
+        State.set(this, ExitPhi, {Part, Lane});
+        State.reset(ScalarLiveOut, ExitPhi, {Part, Lane});
+      }
+    }
+  }
+}
+
 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
   assert(State.Instance && "Predicated instruction PHI works per instance.");
   Instruction *ScalarPredInst =
@@ -1007,6 +1104,14 @@
 }
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPBOSCCLiveOutRecipe::print(raw_ostream &O, const Twine &Indent,
+                                 VPSlotTracker &SlotTracker) const {
+  O << Indent << "BOSCC-LIVE-OUT-INSTRUCTION ";
+  printAsOperand(O, SlotTracker);
+  O << " = ";
+  printOperands(O, SlotTracker);
+}
+
 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
                                 VPSlotTracker &SlotTracker) const {
   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -185,10 +185,13 @@
 VPValue *getPredicatedMask(VPRegionBlock *R) {
   auto *EntryBB = dyn_cast<VPBasicBlock>(R->getEntry());
   if (!EntryBB || EntryBB->size() != 1 ||
-      !isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
+      (!isa<VPBranchOnMaskRecipe>(EntryBB->begin()) &&
+       !isa<VPBranchOnBOSCCGuardRecipe>(EntryBB->begin())))
     return nullptr;
-
-  return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
+  if (isa<VPBranchOnMaskRecipe>(EntryBB->begin()))
+    return cast<VPBranchOnMaskRecipe>(&*EntryBB->begin())->getOperand(0);
+  return cast<VPBranchOnBOSCCGuardRecipe>(&*EntryBB->begin())->getOperand(0);
 }
 
 /// If \p R is a triangle region, return the 'then' block of the triangle.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -355,6 +355,7 @@
 /// type identification.
 using VPRecipeTy = enum {
   VPBranchOnMaskSC,
+  VPBranchOnBOSCCGuardSC,
   VPExpandSCEVSC,
   VPInstructionSC,
   VPInterleaveSC,
@@ -378,6 +379,7 @@
   VPWidenPHISC,
   VPWidenIntOrFpInductionSC,
   VPWidenPointerInductionSC,
+  VPBOSCCLiveOutSC,
   VPReductionPHISC,
   VPFirstPHISC = VPBlendSC,
   VPFirstHeaderPHISC = VPCanonicalIVPHISC,
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -147,6 +147,11 @@
     if (isa<VPActiveLaneMaskPHIRecipe>(RecipeI))
       NumActiveLaneMaskPhiRecipes++;
 
+    // BOSCC live-out recipes emit PHIs into the join block; skip them here.
+    if (isa<VPBOSCCLiveOutRecipe>(*RecipeI)) {
+      RecipeI++;
+      continue;
+    }
+
     if (IsHeaderVPBB && !isa<VPHeaderPHIRecipe>(*RecipeI)) {
       errs() << "Found non-header PHI recipe in header VPBB";
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/test/Transforms/LoopVectorize/boscc0.ll b/llvm/test/Transforms/LoopVectorize/boscc0.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/boscc0.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-vectorize -enable-boscc-vectorization -S %s | FileCheck %s
+;
+; for (unsigned i = 0; i < len; i++)
+;   if (X[i])
+;     A[i] = B[i] + C[i];
+;
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable
+define dso_local void @foo(ptr noalias nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %C, ptr noalias nocapture noundef readnone %D, ptr nocapture noundef readnone %E, ptr nocapture noundef readnone %F, ptr nocapture noundef readonly %X, i32 noundef %len) local_unnamed_addr #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP12_NOT:%.*]] = icmp eq i32 [[LEN:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP12_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN]] to i64
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_INC2:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    br label [[IF_THEN_BOSCC_GUARD:%.*]]
+; CHECK:       if.then.boscc.guard:
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i8 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[TMP6]], label [[IF_THEN_BOSCC:%.*]], label [[IF_THEN_BOSCC_JOIN:%.*]]
+; CHECK:       if.then.boscc:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32>
@llvm.masked.load.v8i32.p0(ptr [[TMP8]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[C:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP10]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP11]], ptr [[TMP13]], i32 4, <8 x i1> [[TMP4]]), !tbaa [[TBAA5]] +; CHECK-NEXT: br label [[IF_THEN_BOSCC_VEC_CONTINUE:%.*]] +; CHECK: if.then.boscc.vec.continue: +; CHECK-NEXT: br label [[IF_THEN_BOSCC_JOIN]] +; CHECK: if.then.boscc.join: +; CHECK-NEXT: br label [[FOR_INC2]] +; CHECK: for.inc2: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP15]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP17]], [[TMP16]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; +entry: + %cmp12.not = icmp eq i32 %len, 0 + br i1 %cmp12.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %len to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.inc, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.inc + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ 
%indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %X, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4, !tbaa !5 + %tobool.not = icmp eq i32 %0, 0 + br i1 %tobool.not, label %for.inc, label %if.then + +if.then: ; preds = %for.body + %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4, !tbaa !5 + %arrayidx4 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %2 = load i32, ptr %arrayidx4, align 4, !tbaa !5 + %add = add nsw i32 %2, %1 + %arrayidx6 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %add, ptr %arrayidx6, align 4, !tbaa !5 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !9 +} + +attributes #0 = { argmemonly nofree norecurse nosync nounwind uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="znver3" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+clwb,+clzero,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mwaitx,+pclmul,+pku,+popcnt,+prfchw,+rdpid,+rdpru,+rdrnd,+rdseed,+sahf,+sha,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+vaes,+vpclmulqdq,+wbnoinvd,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" } + +!llvm.module.flags = !{!0, !1, !2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"PIE Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git 1fa2019828caec1172382009d5327c265427af57)"} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = distinct !{!9, !10, !11} +!10 = !{!"llvm.loop.mustprogress"} +!11 = !{!"llvm.loop.unroll.disable"} diff --git a/llvm/test/Transforms/LoopVectorize/boscc1.ll b/llvm/test/Transforms/LoopVectorize/boscc1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/boscc1.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -enable-boscc-vectorization -S %s | FileCheck %s +; +; for (unsigned i = 0; i < len; i++) +; if (X[i]) +; A[i] = B[i] + C[i]; +; else +; A[i] = E[i] * F[i]; +; + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable +define dso_local void @foo(ptr noalias nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %C, ptr noalias nocapture noundef readnone %D, ptr nocapture noundef readonly %E, ptr nocapture noundef readonly %F, ptr nocapture noundef readonly %X, i32 noundef %len) local_unnamed_addr #0 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP22_NOT:%.*]] = icmp eq i32 [[LEN:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP22_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8 +; 
CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_INC4:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    br label [[IF_THEN_BOSCC_GUARD:%.*]]
+; CHECK:       if.then.boscc.guard:
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i8 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[TMP6]], label [[IF_THEN_BOSCC:%.*]], label [[IF_THEN_BOSCC_JOIN:%.*]]
+; CHECK:       if.then.boscc:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP8]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[C:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP10]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    br label [[IF_THEN_BOSCC_VEC_CONTINUE:%.*]]
+; CHECK:       if.then.boscc.vec.continue:
+; CHECK-NEXT:    br label [[IF_THEN_BOSCC_JOIN]]
+; CHECK:       if.then.boscc.join:
+; CHECK-NEXT:    [[TMP12:%.*]] = phi <8 x i32> [ [[TMP11]], [[IF_THEN_BOSCC_VEC_CONTINUE]] ], [ undef, [[IF_THEN_BOSCC_GUARD]] ]
+; CHECK-NEXT:    br label [[IF_ELSE_BOSCC_GUARD:%.*]]
+; CHECK:       if.else.boscc.guard:
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp ne i8 [[TMP13]], 0
+; CHECK-NEXT:    br i1 [[TMP14]], label [[IF_ELSE_BOSCC:%.*]], label [[IF_ELSE_BOSCC_JOIN:%.*]]
+; CHECK:       if.else.boscc:
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i32, ptr [[E:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i32, ptr [[TMP15]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP16]], i32 4, <8 x i1> [[TMP3]], <8 x i32> poison), !tbaa [[TBAA5]]
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i32, ptr [[F:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr i32, ptr [[TMP17]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP18]], i32 4, <8 x i1> [[TMP3]], <8 x i32> poison), !tbaa [[TBAA5]]
+; CHECK-NEXT:    [[TMP19:%.*]] = mul nsw <8 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD2]]
+; CHECK-NEXT:    br label [[IF_ELSE_BOSCC_VEC_CONTINUE:%.*]]
+; CHECK:       if.else.boscc.vec.continue:
+; CHECK-NEXT:    br label [[IF_ELSE_BOSCC_JOIN]]
+; CHECK:       if.else.boscc.join:
+; CHECK-NEXT:    [[TMP20:%.*]] = phi <8 x i32> [ [[TMP19]], [[IF_ELSE_BOSCC_VEC_CONTINUE]] ], [ undef, [[IF_ELSE_BOSCC_GUARD]] ]
+; CHECK-NEXT:    br label [[FOR_INC4]]
+; CHECK:       for.inc4:
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <8 x i1> [[TMP4]], <8 x i32> [[TMP12]], <8 x i32> [[TMP20]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds
i32, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0 +; CHECK-NEXT: store <8 x i32> [[PREDPHI]], ptr [[TMP22]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP24]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP26]], [[TMP25]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: if.else: +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[E]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP28]], [[TMP27]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[ADD_SINK:%.*]] = phi i32 [ [[MUL]], [[IF_ELSE]] ], [ [[ADD]], [[IF_THEN]] ] +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD_SINK]], ptr [[TMP29]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; +entry: + %cmp22.not = icmp eq i32 %len, 0 + br i1 %cmp22.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %len to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.inc, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.inc + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %X, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4, !tbaa !5 + %tobool.not = icmp eq i32 %0, 0 + br i1 %tobool.not, label %if.else, label %if.then + +if.then: ; preds = %for.body + %arrayidx2 = 
getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4, !tbaa !5 + %arrayidx4 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %2 = load i32, ptr %arrayidx4, align 4, !tbaa !5 + %add = add nsw i32 %2, %1 + br label %for.inc + +if.else: ; preds = %for.body + %arrayidx8 = getelementptr inbounds i32, ptr %E, i64 %indvars.iv + %3 = load i32, ptr %arrayidx8, align 4, !tbaa !5 + %arrayidx10 = getelementptr inbounds i32, ptr %F, i64 %indvars.iv + %4 = load i32, ptr %arrayidx10, align 4, !tbaa !5 + %mul = mul nsw i32 %4, %3 + br label %for.inc + +for.inc: ; preds = %if.then, %if.else + %add.sink = phi i32 [ %mul, %if.else ], [ %add, %if.then ] + %5 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %add.sink, ptr %5, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !9 +} + +attributes #0 = { argmemonly nofree norecurse nosync nounwind uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="znver3" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+clwb,+clzero,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mwaitx,+pclmul,+pku,+popcnt,+prfchw,+rdpid,+rdpru,+rdrnd,+rdseed,+sahf,+sha,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+vaes,+vpclmulqdq,+wbnoinvd,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" } + +!llvm.module.flags = !{!0, !1, !2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"PIE Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git 1fa2019828caec1172382009d5327c265427af57)"} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = distinct !{!9, !10, !11} +!10 = !{!"llvm.loop.mustprogress"} +!11 = !{!"llvm.loop.unroll.disable"} diff --git a/llvm/test/Transforms/LoopVectorize/boscc2.ll b/llvm/test/Transforms/LoopVectorize/boscc2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/boscc2.ll @@ -0,0 +1,195 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -enable-boscc-vectorization -S %s | FileCheck %s +; +; for (unsigned i = 0; i < len; i++) { +; if (X[i]) { +; A[i] = B[i] + C[i]; +; if (Y[i]) +; D[i] = E[i] * F[i]; +; } +; } +; + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: argmemonly nofree norecurse nosync nounwind uwtable +define dso_local void @foo(ptr noalias nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, ptr nocapture noundef readonly %C, ptr noalias nocapture noundef writeonly %D, ptr nocapture noundef readonly %E, ptr nocapture noundef readonly %F, ptr nocapture noundef readonly %X, ptr nocapture noundef readonly %Y, i32 noundef %len) local_unnamed_addr #0 { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP27_NOT:%.*]] = icmp eq i32 [[LEN:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP27_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LEN]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 8 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label 
[[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 8
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_INC5:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, ptr [[TMP2]], align 4, !tbaa [[TBAA5:![0-9]+]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq <8 x i32> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    br label [[IF_THEN_BOSCC_GUARD:%.*]]
+; CHECK:       if.then.boscc.guard:
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <8 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast <8 x i1> [[TMP4]] to i8
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i8 [[TMP5]], 0
+; CHECK-NEXT:    br i1 [[TMP6]], label [[IF_THEN_BOSCC:%.*]], label [[IF_THEN_BOSCC_JOIN:%.*]]
+; CHECK:       if.then.boscc:
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP8]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[C:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP9]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP10]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP11]], ptr [[TMP13]], i32 4, <8 x i1> [[TMP4]]), !tbaa [[TBAA5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[Y:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD2:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP15]], i32 4, <8 x i1> [[TMP4]], <8 x i32> poison), !tbaa [[TBAA5]]
+; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq <8 x i32> [[WIDE_MASKED_LOAD2]], zeroinitializer
+; CHECK-NEXT:    br label [[IF_THEN_BOSCC_VEC_CONTINUE:%.*]]
+; CHECK:       if.then.boscc.vec.continue:
+; CHECK-NEXT:    br label [[IF_THEN_BOSCC_JOIN]]
+; CHECK:       if.then.boscc.join:
+; CHECK-NEXT:    [[TMP17:%.*]] = phi <8 x i1> [ [[TMP16]], [[IF_THEN_BOSCC_VEC_CONTINUE]] ], [ undef, [[IF_THEN_BOSCC_GUARD]] ]
+; CHECK-NEXT:    br label [[IF_THEN10_BOSCC_GUARD:%.*]]
+; CHECK:       if.then10.boscc.guard:
+; CHECK-NEXT:    [[TMP18:%.*]] = xor <8 x i1> [[TMP17]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT:    [[TMP19:%.*]] = select <8 x i1> [[TMP4]], <8 x i1> [[TMP18]], <8 x i1> zeroinitializer
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast <8 x i1> [[TMP19]] to i8
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp ne i8 [[TMP20]], 0
+; CHECK-NEXT:    br i1 [[TMP21]], label [[IF_THEN10_BOSCC:%.*]], label [[IF_THEN10_BOSCC_JOIN:%.*]]
+; CHECK:       if.then10.boscc:
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr i32, ptr [[E:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i32, ptr [[TMP22]], i32 0
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD3:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP23]],
i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i32, ptr [[F:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, ptr [[TMP24]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0(ptr [[TMP25]], i32 4, <8 x i1> [[TMP19]], <8 x i32> poison), !tbaa [[TBAA5]] +; CHECK-NEXT: [[TMP26:%.*]] = mul nsw <8 x i32> [[WIDE_MASKED_LOAD4]], [[WIDE_MASKED_LOAD3]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[D:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i32, ptr [[TMP27]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0(<8 x i32> [[TMP26]], ptr [[TMP28]], i32 4, <8 x i1> [[TMP19]]), !tbaa [[TBAA5]] +; CHECK-NEXT: br label [[IF_THEN10_BOSCC_VEC_CONTINUE:%.*]] +; CHECK: if.then10.boscc.vec.continue: +; CHECK-NEXT: br label [[IF_THEN10_BOSCC_JOIN]] +; CHECK: if.then10.boscc.join: +; CHECK-NEXT: br label [[FOR_INC5]] +; CHECK: for.inc5: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP30]], 0 +; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] +; CHECK: if.then: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[TOBOOL9_NOT:%.*]] = icmp eq i32 [[TMP33]], 0 +; CHECK-NEXT: br i1 [[TOBOOL9_NOT]], label [[FOR_INC]], label [[IF_THEN10:%.*]] +; CHECK: if.then10: +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[E]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[F]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP35]], 
[[TMP34]] +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[D]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX16]], align 4, !tbaa [[TBAA5]] +; CHECK-NEXT: br label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; +entry: + %cmp27.not = icmp eq i32 %len, 0 + br i1 %cmp27.not, label %for.cond.cleanup, label %for.body.preheader + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %len to i64 + br label %for.body + +for.cond.cleanup: ; preds = %for.inc, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.inc + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.inc ] + %arrayidx = getelementptr inbounds i32, ptr %X, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4, !tbaa !5 + %tobool.not = icmp eq i32 %0, 0 + br i1 %tobool.not, label %for.inc, label %if.then + +if.then: ; preds = %for.body + %arrayidx2 = getelementptr inbounds i32, ptr %B, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4, !tbaa !5 + %arrayidx4 = getelementptr inbounds i32, ptr %C, i64 %indvars.iv + %2 = load i32, ptr %arrayidx4, align 4, !tbaa !5 + %add = add nsw i32 %2, %1 + %arrayidx6 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv + store i32 %add, ptr %arrayidx6, align 4, !tbaa !5 + %arrayidx8 = getelementptr inbounds i32, ptr %Y, i64 %indvars.iv + %3 = load i32, ptr %arrayidx8, align 4, !tbaa !5 + %tobool9.not = icmp eq i32 %3, 0 + br i1 %tobool9.not, label %for.inc, label %if.then10 + +if.then10: ; preds = %if.then + %arrayidx12 = getelementptr inbounds i32, ptr %E, i64 %indvars.iv + %4 = load i32, ptr %arrayidx12, align 4, !tbaa !5 + %arrayidx14 = getelementptr inbounds i32, ptr %F, i64 %indvars.iv + %5 = load i32, ptr %arrayidx14, align 4, !tbaa !5 + %mul = mul nsw i32 %5, %4 + %arrayidx16 = getelementptr inbounds i32, ptr %D, i64 %indvars.iv + store i32 %mul, ptr %arrayidx16, align 4, !tbaa !5 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then10, %if.then + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !9 +} + +attributes #0 = { argmemonly nofree norecurse nosync nounwind uwtable "frame-pointer"="none" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="znver3" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+clflushopt,+clwb,+clzero,+crc32,+cx16,+cx8,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+mwaitx,+pclmul,+pku,+popcnt,+prfchw,+rdpid,+rdpru,+rdrnd,+rdseed,+sahf,+sha,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+sse4a,+ssse3,+vaes,+vpclmulqdq,+wbnoinvd,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" } + +!llvm.module.flags = !{!0, !1, !2, !3} +!llvm.ident = !{!4} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"PIE Level", i32 2} +!3 = !{i32 7, !"uwtable", i32 2} +!4 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project.git 1fa2019828caec1172382009d5327c265427af57)"} +!5 = !{!6, !6, i64 0} +!6 = !{!"int", !7, i64 0} +!7 = !{!"omnipotent char", !8, i64 0} +!8 = !{!"Simple C/C++ TBAA"} +!9 = distinct !{!9, !10, !11} +!10 = !{!"llvm.loop.mustprogress"} +!11 = !{!"llvm.loop.unroll.disable"}
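The CHECK lines above were autogenerated with utils/update_test_checks.py (per the NOTE lines). Assuming an opt binary built from this patch under build/bin (paths illustrative), the tests can be regenerated and run with:

  llvm/utils/update_test_checks.py --opt-binary=build/bin/opt llvm/test/Transforms/LoopVectorize/boscc0.ll
  build/bin/llvm-lit -v llvm/test/Transforms/LoopVectorize/boscc0.ll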