Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1412,6 +1412,13 @@ /// \returns How the target needs this vector-predicated operation to be /// transformed. VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const; + + /// If the target uses custom instruction to compute + /// active vector length, use an intrinsic in the IR that will be lowered to + /// this instruction. Else, the IR will use instructions for computing Min(VF, + /// TripCount - Induction). + bool useCustomActiveVectorLengthIntrinsic() const; + /// @} /// @} @@ -1721,6 +1728,7 @@ virtual unsigned getGISelRematGlobalCost() const = 0; virtual bool supportsScalableVectors() const = 0; virtual bool hasActiveVectorLength() const = 0; + virtual bool useCustomActiveVectorLengthIntrinsic() const = 0; virtual InstructionCost getInstructionLatency(const Instruction *I) = 0; virtual VPLegalization getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; @@ -2292,6 +2300,10 @@ return Impl.hasActiveVectorLength(); } + bool useCustomActiveVectorLengthIntrinsic() const override { + return Impl.useCustomActiveVectorLengthIntrinsic(); + } + InstructionCost getInstructionLatency(const Instruction *I) override { return Impl.getInstructionLatency(I); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -757,6 +757,8 @@ /* OperatorStrategy */ TargetTransformInfo::VPLegalization::Convert); } + bool useCustomActiveVectorLengthIntrinsic() const { return false; } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. 
Index: llvm/include/llvm/IR/IRBuilder.h =================================================================== --- llvm/include/llvm/IR/IRBuilder.h +++ llvm/include/llvm/IR/IRBuilder.h @@ -43,6 +43,7 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/TypeSize.h" #include #include #include @@ -2552,6 +2553,12 @@ unsigned Index, unsigned FieldIndex, MDNode *DbgInfo); + /// Return an all true boolean vector of size and scalability \p NumElts. + Value *getTrueVector(ElementCount NumElts) { + VectorType *VTy = VectorType::get(Type::getInt1Ty(Context), NumElts); + return Constant::getAllOnesValue(VTy); + } + private: /// Helper function that creates an assume intrinsic call that /// represents an alignment assumption on the provided pointer \p PtrValue Index: llvm/include/llvm/IR/Intrinsics.td =================================================================== --- llvm/include/llvm/IR/Intrinsics.td +++ llvm/include/llvm/IR/Intrinsics.td @@ -1366,6 +1366,26 @@ //===---------------- Vector Predication Intrinsics --------------===// +// Memory Intrinsics +def int_vp_store : DefaultAttrsIntrinsic<[], + [ llvm_anyvector_ty, + LLVMAnyPointerType>, + llvm_i32_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], + [ NoCapture>, IntrNoSync, IntrWriteMem, + IntrArgMemOnly, IntrWillReturn, + ImmArg> ]>; + +def int_vp_load : DefaultAttrsIntrinsic<[ llvm_anyvector_ty], + [ LLVMAnyPointerType>, + llvm_i32_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], + [ NoCapture>, IntrNoSync, IntrReadMem, + IntrWillReturn, IntrArgMemOnly, + ImmArg> ]>; + // Speculatable Binary operators let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in { def int_vp_add : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], @@ -1474,6 +1494,11 @@ [llvm_anyint_ty, LLVMMatchType<1>], [IntrNoMem, IntrNoSync, IntrWillReturn]>; +def int_experimental_set_vector_length: + 
DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrNoSync, IntrWillReturn]>; + //===-------------------------- Masked Intrinsics -------------------------===// // def int_masked_load: Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1048,6 +1048,10 @@ return TTIImpl->hasActiveVectorLength(); } +bool TargetTransformInfo::useCustomActiveVectorLengthIntrinsic() const { + return TTIImpl->useCustomActiveVectorLengthIntrinsic(); +} + InstructionCost TargetTransformInfo::getInstructionLatency(const Instruction *I) const { return TTIImpl->getInstructionLatency(I); Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -60,6 +60,7 @@ #include "VPlanHCFGBuilder.h" #include "VPlanPredicator.h" #include "VPlanTransforms.h" +#include "VPlanValue.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -235,6 +236,54 @@ "prefers tail-folding, don't attempt vectorization if " "tail-folding fails."))); +// Option prefer-predicate-with-vp-intrinsics is an experimental switch to +// indicate that the loop vectorizer should try to generate VP intrinsics if +// tail-folding is enabled (note that this option is dependent on the +// prefer-predicate-over-epilogue option being set to predicate-dont-vectorize). +// This can be particularly useful for targets like RISC-V and SX-Aurora that +// support vector length predication. +// Currently this switch takes four possible values: +// 0. no-predication: Do not generate VP intrinsics. +// 1. if-active-vector-length-support: Only generate VP intrinsics if the +// target supports active vector length based predication.
+// 2. without-active-vector-length-support: Generate VP intrinsics even if +// vector length based predication is not supported. This behaves a bit like +// existing tail-folding by using a mask for predication, except that all +// instructions are widened to VP intrinsics, not just memory instructions. +// This option is discouraged; it is meant for experimental/testing use only. +// 3. force-active-vector-length-support: This is purely an experimental/testing +// option which will be removed in the future. It forces the loop vectorizer +// to assume that the target supports vector length predication. +namespace PreferVPIntrinsicsTy { +enum Option { + NoPredication = 0, + IfAVLSupported, + WithoutAVLSupport, + ForceAVLSupport +}; +} // namespace PreferVPIntrinsicsTy + +static cl::opt PreferPredicateWithVPIntrinsics( + "prefer-predicate-with-vp-intrinsics", + cl::init(PreferVPIntrinsicsTy::NoPredication), cl::Hidden, + cl::desc("When vectorizing with tail-folding, generate vector predication " + "intrinsics."), + cl::values( + clEnumValN(PreferVPIntrinsicsTy::NoPredication, "no-predication", + "Do not generate VP intrinsics."), + clEnumValN(PreferVPIntrinsicsTy::IfAVLSupported, + "if-active-vector-length-support", + "Only generate VP intrinsics if the target supports vector " + "length predication."), + clEnumValN(PreferVPIntrinsicsTy::WithoutAVLSupport, + "without-active-vector-length-support", + "Generate VP intrinsics even if vector length predication " + "is not supported.
This option is discouraged."), + clEnumValN(PreferVPIntrinsicsTy::ForceAVLSupport, + "force-active-vector-length-support", + "Assume that the target supports vector length predication " + "and generate VP intrinsics accordingly."))); + static cl::opt MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, cl::desc("Maximize bandwidth when selecting vectorization factor which " @@ -478,6 +527,11 @@ void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands, VPTransformState &State); + /// Widen a single instruction to a VP intrinsic within the innermost loop. + void widenPredicatedInstruction(Instruction &I, VPValue *Def, + VPUser &Operands, VPTransformState &State, + VPValue *BlockInMask, VPValue *EVL); + /// Widen a single call instruction within the innermost loop. void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands, VPTransformState &State); @@ -545,7 +599,8 @@ /// vectorized loop. void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask); + VPValue *StoredValue, VPValue *BlockInMask, + VPValue *EVL = nullptr); /// Set the debug location in the builder using the debug location in /// the instruction. @@ -565,6 +620,10 @@ /// element. virtual Value *getBroadcastInstrs(Value *V); + /// Create Instructions to compute Explicit Vector Length when using VP + /// intrinsics. + Value *createEVL(VPValue *IV, VPValue *TC, VPTransformState &State); + protected: friend class LoopVectorizationPlanner; @@ -1589,6 +1648,11 @@ return foldTailByMasking() || Legal->blockNeedsPredication(BB); } + /// Returns true if VP intrinsics should be generated in the tail folded loop. + bool preferVPIntrinsics() const { + return foldTailByMasking() && PreferVPIntrinsics; + } + /// A SmallMapVector to store the InLoop reduction op chains, mapping phi /// nodes to the chain of instructions representing the reductions. 
Uses a /// MapVector to ensure deterministic iteration order. @@ -1749,6 +1813,9 @@ /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; + /// Control whether to generate VP intrinsics in vectorized code. + bool PreferVPIntrinsics = false; + /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated @@ -2872,7 +2939,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask) { + VPValue *StoredValue, VPValue *BlockInMask, VPValue *EVL) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); StoreInst *SI = dyn_cast(Instr); @@ -2901,6 +2968,13 @@ bool CreateGatherScatter = (Decision == LoopVectorizationCostModel::CM_GatherScatter); + if (Reverse) + assert(!EVL && + "Vector reverse not supported for predicated vectorization."); + if (CreateGatherScatter) + assert(!EVL && "Gather/Scatter operations not supported for " + "predicated vectorization."); + // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector // gather/scatter. Otherwise Decision should have been to Scalarize. assert((ConsecutiveStride || CreateGatherScatter) && @@ -2956,6 +3030,13 @@ for (unsigned Part = 0; Part < UF; ++Part) { Instruction *NewSI = nullptr; Value *StoredVal = State.get(StoredValue, Part); + + // If EVL is not nullptr, then EVL must be a valid value set during plan + // creation, possibly default value = whole vector register length. EVL is + // created only if TTI prefers predicated vectorization, thus if EVL is + // not nullptr it also implies preference for predicated vectorization. + Value *EVLPart = EVL ? State.get(EVL, Part) : nullptr; + if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(Addr, Part); @@ -2970,11 +3051,24 @@ // another expression. So don't call resetVectorValue(StoredVal). } auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) + // if EVLPart is not null, we can vectorize using predicated + // intrinsic. + if (EVLPart) { + assert(isMaskRequired && + "Mask argument is required for VP intrinsics."); + Value *BlockInMaskPart = BlockInMaskParts[Part]; + Value *EVLPartI32 = + Builder.CreateSExtOrTrunc(EVLPart, Builder.getInt32Ty()); + NewSI = Builder.CreateIntrinsic( + Intrinsic::vp_store, {StoredVal->getType(), VecPtr->getType()}, + {StoredVal, VecPtr, Builder.getInt32(Alignment.value()), + BlockInMaskPart, EVLPartI32}); + } else if (isMaskRequired) { NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, BlockInMaskParts[Part]); - else + } else { NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } } addMetadata(NewSI, SI); } @@ -2986,6 +3080,13 @@ setDebugLocFromInst(Builder, LI); for (unsigned Part = 0; Part < UF; ++Part) { Value *NewLI; + + // If EVL is not nullptr, then EVL must be a valid value set during plan + // creation, possibly default value = whole vector register length. EVL is + // created only if TTI prefers predicated vectorization, thus if EVL is + // not nullptr it also implies preference for predicated vectorization. + Value *EVLPart = EVL ? State.get(EVL, Part) : nullptr; + if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(Addr, Part); @@ -2994,13 +3095,26 @@ addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) + if (EVLPart) { + assert(isMaskRequired && + "Mask argument is required for VP intrinsics."); + Value *BlockInMaskPart = BlockInMaskParts[Part]; + Value *EVLPartI32 = + Builder.CreateSExtOrTrunc(EVLPart, Builder.getInt32Ty()); + NewLI = Builder.CreateIntrinsic( + Intrinsic::vp_load, + {VecPtr->getType()->getPointerElementType(), VecPtr->getType()}, + {VecPtr, Builder.getInt32(Alignment.value()), BlockInMaskPart, + EVLPartI32}, + nullptr, "vp.op.load"); + } else if (isMaskRequired) { NewLI = Builder.CreateMaskedLoad( VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), "wide.masked.load"); - else + } else { NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + } // Add metadata to the load, but setVectorValue to the reverse shuffle. 
addMetadata(NewLI, LI); @@ -4961,6 +5075,72 @@ return !CInt || CInt->isZero(); } +void InnerLoopVectorizer::widenPredicatedInstruction(Instruction &I, + VPValue *Def, VPUser &User, + VPTransformState &State, + VPValue *BlockInMask, + VPValue *EVL) { + auto getVPIntrInstr = [](unsigned Opcode) { + switch (Opcode) { + case Instruction::Add: + return Intrinsic::vp_add; + case Instruction::Sub: + return Intrinsic::vp_sub; + case Instruction::Mul: + return Intrinsic::vp_mul; + case Instruction::SDiv: + return Intrinsic::vp_sdiv; + case Instruction::UDiv: + return Intrinsic::vp_udiv; + case Instruction::SRem: + return Intrinsic::vp_srem; + case Instruction::URem: + return Intrinsic::vp_urem; + case Instruction::AShr: + return Intrinsic::vp_ashr; + case Instruction::LShr: + return Intrinsic::vp_lshr; + case Instruction::Shl: + return Intrinsic::vp_shl; + case Instruction::Or: + return Intrinsic::vp_or; + case Instruction::And: + return Intrinsic::vp_and; + case Instruction::Xor: + return Intrinsic::vp_xor; + } + return Intrinsic::not_intrinsic; + }; + + unsigned Opcode = I.getOpcode(); + assert(getVPIntrInstr(Opcode) != Intrinsic::not_intrinsic && + "Instruction does not have VP intrinsic support."); + + setDebugLocFromInst(Builder, &I); + + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector Ops; + for (unsigned OpIdx = 0; OpIdx < User.getNumOperands() - 2; OpIdx++) + Ops.push_back(State.get(User.getOperand(OpIdx), Part)); + + VectorType *OpTy = cast(Ops[0]->getType()); + Value *MaskOp = State.get(BlockInMask, Part); + Ops.push_back(MaskOp); + + Value *EVLOp = State.get(EVL, Part); + Ops.push_back(EVLOp); + + Value *V = Builder.CreateIntrinsic(getVPIntrInstr(Opcode), OpTy, Ops, + nullptr, "vp.op"); + if (auto *VecOp = dyn_cast(V)) + VecOp->copyIRFlags(&I); + + // Use this vector value for all users of the original instruction. 
+ State.set(Def, V, Part); + addMetadata(V, &I); + } +} + void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, VPUser &User, VPTransformState &State) { @@ -5905,6 +6085,28 @@ // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; + if (!PreferPredicateWithVPIntrinsics) + return MaxFactors; + + if (UserIC > 1) { + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will " + "not generate VP intrinsics since interleave count " + "specified is greater than 1.\n"); + return MaxFactors; + } + + if (PreferPredicateWithVPIntrinsics == + PreferVPIntrinsicsTy::IfAVLSupported) { + PreferVPIntrinsics = TTI.hasActiveVectorLength(); + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will " + "try to generate VP Intrinsics if the target " + "support vector length predication.\n"); + } else { + PreferVPIntrinsics = true; + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will " + "try to generate VP Intrinsics.\n"); + } + return MaxFactors; } @@ -6333,6 +6535,11 @@ if (!isScalarEpilogueAllowed()) return 1; + // Do not interleave if VP intrinsics are preferred and no User IC is + // specified. + if (preferVPIntrinsics()) + return 1; + // We used the distance for the interleave count. 
if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; @@ -8572,6 +8779,22 @@ return EdgeMaskCache[Edge] = EdgeMask; } +VPValue *VPRecipeBuilder::getOrCreateIV(VPBasicBlock *VPBB, VPlanPtr &Plan) { + IVCacheTy::iterator IVEntryIt = IVCache.find(VPBB); + if (IVEntryIt != IVCache.end()) + return IVEntryIt->second; + + VPValue *IV = nullptr; + if (Legal->getPrimaryInduction()) + IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); + else { + auto *IVRecipe = new VPWidenCanonicalIVRecipe(); + Builder.getInsertBlock()->insert(IVRecipe, Builder.getInsertPoint()); + IV = IVRecipe->getVPSingleValue(); + } + return IVCache[VPBB] = IV; +} + VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) { assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); @@ -8588,6 +8811,19 @@ if (!CM.blockNeedsPredication(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. + // if header block needs predication then it is only because tail-folding is + // enabled. If we are using VP intrinsics for a target with vector length + // predication support, this mask (icmp ule %IV %BTC) becomes redundant with + // EVL, which means unless we are using VP intrinsics without vector length + // predication support we can replace this mask with an all-true mask for + // possibly better latency. + if (CM.preferVPIntrinsics() && + PreferPredicateWithVPIntrinsics != + PreferVPIntrinsicsTy::WithoutAVLSupport) { + BlockMask = Builder.createNaryOp(VPInstruction::AllTrueMask, {}); + return BlockMaskCache[BB] = BlockMask; + } + // Create the block in mask as the first non-phi instruction in the block. VPBuilder::InsertPointGuard Guard(Builder); auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); @@ -8596,14 +8832,7 @@ // Introduce the early-exit compare IV <= BTC to form header block mask. // This is used instead of IV < TC because TC may wrap, unlike BTC. // Start by constructing the desired canonical IV. 
- VPValue *IV = nullptr; - if (Legal->getPrimaryInduction()) - IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction()); - else { - auto IVRecipe = new VPWidenCanonicalIVRecipe(); - Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint); - IV = IVRecipe->getVPSingleValue(); - } + VPValue *IV = getOrCreateIV(Builder.getInsertBlock(), Plan); VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); bool TailFolded = !CM.isScalarEpilogueAllowed(); @@ -8636,10 +8865,31 @@ return BlockMaskCache[BB] = BlockMask; } -VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, - ArrayRef Operands, - VFRange &Range, - VPlanPtr &Plan) { +VPValue *VPRecipeBuilder::getOrCreateEVL(VPlanPtr &Plan) { + if (EVL) + return EVL; + + if (PreferPredicateWithVPIntrinsics == + PreferVPIntrinsicsTy::WithoutAVLSupport) { + EVL = Plan->getOrCreateRuntimeVF(); + return EVL; + } + + VPBuilder::InsertPointGuard Guard(Builder); + auto *HeaderBB = Plan->getEntry()->getSingleSuccessor()->getEntryBasicBlock(); + auto NewInsertionPoint = HeaderBB->getFirstNonPhi(); + Builder.setInsertPoint(HeaderBB, NewInsertionPoint); + + VPValue *IV = getOrCreateIV(Builder.getInsertBlock(), Plan); + VPValue *TC = Plan->getOrCreateTripCount(); + auto *EVLRecipe = new VPWidenEVLRecipe(IV, TC); + Builder.getInsertBlock()->insert(EVLRecipe, Builder.getInsertPoint()); + EVL = EVLRecipe->getEVL(); + return EVL; +} + +bool VPRecipeBuilder::validateWidenMemory(Instruction *I, + VFRange &Range) const { assert((isa(I) || isa(I)) && "Must be called with either a load or store"); @@ -8658,7 +8908,14 @@ return Decision != LoopVectorizationCostModel::CM_Scalarize; }; - if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) + return LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range); +} + +VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, + ArrayRef Operands, + VFRange &Range, + VPlanPtr &Plan) { + if (!validateWidenMemory(I, Range)) return nullptr; VPValue *Mask = 
nullptr; @@ -8673,6 +8930,24 @@ Mask); } +VPRecipeBase * +VPRecipeBuilder::tryToPredicatedWidenMemory(Instruction *I, + ArrayRef Operands, + VFRange &Range, VPlanPtr &Plan) { + if (!validateWidenMemory(I, Range)) + return nullptr; + + VPValue *Mask = createBlockInMask(I->getParent(), Plan); + VPValue *EVL = getOrCreateEVL(Plan); + if (LoadInst *Load = dyn_cast(I)) + return new VPPredicatedWidenMemoryInstructionRecipe(*Load, Operands[0], + Mask, EVL); + + StoreInst *Store = cast(I); + return new VPPredicatedWidenMemoryInstructionRecipe(*Store, Operands[1], + Operands[0], Mask, EVL); +} + VPWidenIntOrFpInductionRecipe * VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef Operands) const { @@ -8805,8 +9080,11 @@ Range); } -VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, - ArrayRef Operands) const { +bool VPRecipeBuilder::preferPredicatedWiden() const { + return CM.preferVPIntrinsics(); +} + +bool VPRecipeBuilder::validateWiden(Instruction *I) const { auto IsVectorizableOpcode = [](unsigned Opcode) { switch (Opcode) { case Instruction::Add: @@ -8848,7 +9126,12 @@ return false; }; - if (!IsVectorizableOpcode(I->getOpcode())) + return IsVectorizableOpcode(I->getOpcode()); +} + +VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, + ArrayRef Operands) const { + if (!validateWiden(I)) return nullptr; // Success: widen this instruction. 
@@ -8865,6 +9148,17 @@ } } +VPPredicatedWidenRecipe *VPRecipeBuilder::tryToPredicatedWiden( + Instruction *I, ArrayRef Operands, VPlanPtr &Plan) { + if (!validateWiden(I)) + return nullptr; + + VPValue *Mask = createBlockInMask(I->getParent(), Plan); + VPValue *EVL = getOrCreateEVL(Plan); + return new VPPredicatedWidenRecipe( + *I, make_range(Operands.begin(), Operands.end()), Mask, EVL); +} + VPBasicBlock *VPRecipeBuilder::handleReplication( Instruction *I, VFRange &Range, VPBasicBlock *VPBB, VPlanPtr &Plan) { @@ -8953,8 +9247,13 @@ if (auto *CI = dyn_cast(Instr)) return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); - if (isa(Instr) || isa(Instr)) + if (isa(Instr) || isa(Instr)) { + if (preferPredicatedWiden()) + return toVPRecipeResult( + tryToPredicatedWidenMemory(Instr, Operands, Range, Plan)); + return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); + } VPRecipeBase *Recipe; if (auto Phi = dyn_cast(Instr)) { @@ -9000,6 +9299,9 @@ *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); } + if (preferPredicatedWiden()) + return toVPRecipeResult(tryToPredicatedWiden(Instr, Operands, Plan)); + return toVPRecipeResult(tryToWiden(Instr, Operands)); } @@ -9432,6 +9734,11 @@ State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); } +void VPPredicatedWidenRecipe::execute(VPTransformState &State) { + State.ILV->widenPredicatedInstruction(*getUnderlyingInstr(), this, *this, + State, getMask(), getEVL()); +} + void VPWidenGEPRecipe::execute(VPTransformState &State) { State.ILV->widenGEP(cast(getUnderlyingInstr()), this, *this, State.UF, State.VF, IsPtrLoopInvariant, @@ -9648,6 +9955,63 @@ StoredValue, getMask()); } +void VPPredicatedWidenMemoryInstructionRecipe::execute( + VPTransformState &State) { + VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; + State.ILV->vectorizeMemoryInstruction( + &Ingredient, State, StoredValue ? 
nullptr : getVPSingleValue(), getAddr(), + StoredValue, getMask(), getEVL()); +} + +Value *InnerLoopVectorizer::createEVL(VPValue *IV, VPValue *TC, + VPTransformState &State) { + assert(PreferPredicateWithVPIntrinsics != + PreferVPIntrinsicsTy::NoPredication && + "Predication with VP intrinsics turned off."); + + if (PreferPredicateWithVPIntrinsics == PreferVPIntrinsicsTy::IfAVLSupported) + assert(TTI->hasActiveVectorLength() && + "Target does not support vector length predication."); + + auto *MinVF = Builder.getInt32(VF.getKnownMinValue()); + Value *RuntimeVL = + VF.isScalable() ? Builder.CreateVScale(MinVF, "vscale.x.vf") : MinVF; + + // TODO: Add support for interleaving. + auto *TripCount = State.get(TC, 0); + auto *Induction = State.get(IV, VPIteration(0, 0)); + + Value *Remaining = Builder.CreateSub(TripCount, Induction); + // FIXME: This is a proof-of-concept naive implementation to demonstrate using + // a target dependent intrinsic to compute the vector length. + if (TTI->useCustomActiveVectorLengthIntrinsic()) { + // Set Element width to the widest type used in the loop. + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = Cost->getSmallestAndWidestTypes(); + Constant *ElementWidth = Builder.getInt32(WidestType); + // Set Register width factor to 1. + Constant *RegWidthFactor = Builder.getInt32(1); + return Builder.CreateIntrinsic(Intrinsic::experimental_set_vector_length, + {Remaining->getType()}, + {Remaining, ElementWidth, RegWidthFactor}); + } + + Value *RuntimeVLExt = Builder.CreateZExt(RuntimeVL, Remaining->getType()); + Value *EVL = + Builder.CreateBinaryIntrinsic(Intrinsic::umin, RuntimeVLExt, Remaining); + return Builder.CreateTrunc(EVL, Builder.getInt32Ty()); +} + +void VPWidenEVLRecipe::execute(VPTransformState &State) { + // FIXME: Interleaving with predicated vectorization is not yet supported. 
+ // Since VPlan only provides set methods for per Part or per Instance, we use + // the per Part set method to store the same EVL for each Part (State.UF would + // be 1 for now.) + for (unsigned Part = 0; Part < State.UF; Part++) + State.set(getEVL(), State.ILV->createEVL(getIV(), getTripCount(), State), + Part); +} + // Determine how to lower the scalar epilogue, which depends on 1) optimising // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing // predication, and 4) a TTI hook that analyses whether the loop is suitable Index: llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -11,6 +11,7 @@ #include "LoopVectorizationPlanner.h" #include "VPlan.h" +#include "VPlanValue.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/IR/IRBuilder.h" @@ -41,6 +42,8 @@ VPBuilder &Builder; + VPValue *EVL = nullptr; + /// When we if-convert we need to create edge masks. We have to cache values /// so that we don't end up with exponential recursion/IR. Note that /// if-conversion currently takes place during VPlan-construction, so these @@ -51,6 +54,11 @@ EdgeMaskCacheTy EdgeMaskCache; BlockMaskCacheTy BlockMaskCache; + /// Hold a mapping of Basic block to the canonical vector induction VPValue + /// inserted for that block or the primary induction if it exists. + using IVCacheTy = DenseMap; + IVCacheTy IVCache; + // VPlan-VPlan transformations support: Hold a mapping from ingredients to // their recipe. To save on memory, only do so for selected ingredients, // marked by having a nullptr entry in this map. @@ -71,6 +79,16 @@ VPRecipeBase *tryToWidenMemory(Instruction *I, ArrayRef Operands, VFRange &Range, VPlanPtr &Plan); + /// Similar to tryToWidenMemory, but create a predicated recipe. 
The + /// predicated recipe takes mandatory mask and EVL VPValues. + VPRecipeBase *tryToPredicatedWidenMemory(Instruction *I, + ArrayRef Operands, + VFRange &Range, VPlanPtr &Plan); + + /// Helper method used by tryToWidenMemory and tryToPredicatedWidenMemory to + /// validate whether a memory instruction can be widened. + bool validateWidenMemory(Instruction *I, VFRange &Range) const; + /// Check if an induction recipe should be constructed for \I. If so build and /// return it. If not, return null. VPWidenIntOrFpInductionRecipe * @@ -100,9 +118,24 @@ /// that widening should be performed. VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef Operands) const; + /// Similar to tryToWiden, but widen to VP intrinsics. + VPPredicatedWidenRecipe *tryToPredicatedWiden(Instruction *I, + ArrayRef Operands, + VPlanPtr &Plan); + + /// Helper method used by tryToWiden and tryToPredicatedWiden to validate + /// whether an instruction can be widened. + bool validateWiden(Instruction *I) const; + /// Return a VPRecipeOrValueTy with VPRecipeBase * being set. This can be used to force the use as VPRecipeBase* for recipe sub-types that also inherit from VPValue. VPRecipeOrVPValueTy toVPRecipeResult(VPRecipeBase *R) const { return R; } + /// Return true if recipes that expand to VP intrinsics should be created. + bool preferPredicatedWiden() const; + + /// Return the cached induction variable for \p VPBB, creating it if needed. + VPValue *getOrCreateIV(VPBasicBlock *VPBB, VPlanPtr &Plan); + public: VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI, LoopVectorizationLegality *Legal, @@ -138,6 +171,10 @@ /// and DST. VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); + /// A helper function that returns the Explicit (Active) Vector Length for + /// the current vector iteration, creating it on first use. + VPValue *getOrCreateEVL(VPlanPtr &Plan); + /// Mark given ingredient for recording its recipe once one is created for /// it.
void recordRecipeOf(Instruction *I) { Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -40,6 +40,7 @@ #include "llvm/ADT/ilist_node.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" #include "llvm/Support/InstructionCost.h" #include #include @@ -748,6 +749,7 @@ inline bool VPUser::classof(const VPDef *Def) { return Def->getVPDefID() == VPRecipeBase::VPInstructionSC || + Def->getVPDefID() == VPRecipeBase::VPPredicatedWidenSC || Def->getVPDefID() == VPRecipeBase::VPWidenSC || Def->getVPDefID() == VPRecipeBase::VPWidenCallSC || Def->getVPDefID() == VPRecipeBase::VPWidenSelectSC || @@ -757,6 +759,8 @@ Def->getVPDefID() == VPRecipeBase::VPReplicateSC || Def->getVPDefID() == VPRecipeBase::VPReductionSC || Def->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC || + Def->getVPDefID() == + VPRecipeBase::VPPredicatedWidenMemoryInstructionSC || Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC; } @@ -775,6 +779,7 @@ SLPLoad, SLPStore, ActiveLaneMask, + AllTrueMask, }; private: @@ -867,11 +872,17 @@ /// ingredient. This recipe covers most of the traditional vectorization cases /// where each ingredient transforms into a vectorized version of itself. 
class VPWidenRecipe : public VPRecipeBase, public VPValue { +protected: + template + VPWidenRecipe(Instruction &I, iterator_range Operands, + const unsigned char RecipeSC, const unsigned char ValueSC) + : VPRecipeBase(RecipeSC, Operands), VPValue(ValueSC, &I, this) {} + public: template VPWidenRecipe(Instruction &I, iterator_range Operands) - : VPRecipeBase(VPRecipeBase::VPWidenSC, Operands), - VPValue(VPValue::VPVWidenSC, &I, this) {} + : VPWidenRecipe(I, Operands, VPRecipeBase::VPWidenSC, + VPValue::VPVWidenSC) {} ~VPWidenRecipe() override = default; @@ -893,6 +904,47 @@ #endif }; +/// VPPredicatedWidenRecipe is a recipe for producing a copy of vector type +/// using VP intrinsics for its ingredient. This recipe covers most of the +/// traditional vectorization cases where each ingredient transforms into a +/// vectorized version of itself. +class VPPredicatedWidenRecipe : public VPWidenRecipe { +public: + template + VPPredicatedWidenRecipe(Instruction &I, iterator_range Operands, + VPValue *Mask, VPValue *EVL) + : VPWidenRecipe(I, Operands, VPRecipeBase::VPPredicatedWidenSC, + VPValue::VPVPredicatedWidenSC) { + addOperand(Mask); + addOperand(EVL); + } + + ~VPPredicatedWidenRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPPredicatedWidenSC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVPredicatedWidenSC; + } + + /// Return the mask used by this recipe. + VPValue *getMask() const { return getOperand(getNumOperands() - 2); } + + /// Return the explicit vector length used by this recipe. + VPValue *getEVL() const { return getOperand(getNumOperands() - 1); } + + /// Produce the widened ingredient using VP intrinsics. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. 
+ void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A recipe for widening Call instructions. class VPWidenCallRecipe : public VPRecipeBase, public VPValue { @@ -1373,6 +1425,41 @@ } }; +/// A recipe to generate Explicit Vector Length (EVL) value to be used with +/// VPred intrinsics. +class VPWidenEVLRecipe : public VPRecipeBase, public VPValue { + +public: + VPWidenEVLRecipe(VPValue *IV, VPValue *TC) + : VPRecipeBase(VPRecipeBase::VPWidenEVLSC, {IV, TC}), + VPValue(VPValue::VPVWidenEVLSC, nullptr, this) {} + ~VPWidenEVLRecipe() override = default; + + /// Return the VPValue representing EVL. + const VPValue *getEVL() const { return this; } + VPValue *getEVL() { return this; } + + /// Return VPValue representing Induction Variable. + VPValue *getIV() const { return getOperand(0); } + + /// Return VPValue representing trip count. + VPValue *getTripCount() const { return getOperand(1); } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenEVLSC; + } + + /// Generate the instructions to compute EVL. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when /// control converges back from a Branch-on-Mask. The phi nodes are needed in /// order to merge values that are set under such a branch and feed their uses. @@ -1409,8 +1496,6 @@ /// TODO: We currently execute only per-part unless a specific instance is /// provided. class VPWidenMemoryInstructionRecipe : public VPRecipeBase { - Instruction &Ingredient; - void setMask(VPValue *Mask) { if (!Mask) return; @@ -1421,17 +1506,32 @@ return isStore() ? 
getNumOperands() == 3 : getNumOperands() == 2; } +protected: + Instruction &Ingredient; + + VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, + const unsigned char RecipeSC, + const unsigned char ValueSC) + : VPRecipeBase(RecipeSC, {Addr}), Ingredient(Load) { + new VPValue(ValueSC, &Load, this); + } + + VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, + VPValue *StoredValue, + const unsigned char RecipeSC) + : VPRecipeBase(RecipeSC, {Addr, StoredValue}), Ingredient(Store) {} + public: VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr}), Ingredient(Load) { - new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this); + : VPWidenMemoryInstructionRecipe(Load, Addr, VPWidenMemoryInstructionSC, + VPValue::VPVMemoryInstructionSC) { setMask(Mask); } VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, VPValue *StoredValue, VPValue *Mask) - : VPRecipeBase(VPWidenMemoryInstructionSC, {Addr, StoredValue}), - Ingredient(Store) { + : VPWidenMemoryInstructionRecipe(Store, Addr, StoredValue, + VPWidenMemoryInstructionSC) { setMask(Mask); } @@ -1471,6 +1571,60 @@ #endif }; +/// A Recipe for widening load/store operations to VP intrinsics. 
+/// The recipe uses the following VPValues: +/// - For load: Address, mask, EVL +/// - For store: Address, stored value, mask, EVL +class VPPredicatedWidenMemoryInstructionRecipe + : public VPWidenMemoryInstructionRecipe { + +public: + VPPredicatedWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, + VPValue *Mask, VPValue *EVL) + : VPWidenMemoryInstructionRecipe( + Load, Addr, VPPredicatedWidenMemoryInstructionSC, + VPValue::VPVPredicatedMemoryInstructionSC) { + addOperand(Mask); + addOperand(EVL); + } + + VPPredicatedWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, + VPValue *StoredValue, VPValue *Mask, + VPValue *EVL) + : VPWidenMemoryInstructionRecipe(Store, Addr, StoredValue, + VPPredicatedWidenMemoryInstructionSC) { + addOperand(Mask); + addOperand(EVL); + } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == + VPRecipeBase::VPPredicatedWidenMemoryInstructionSC; + } + + /// Return the mask used by this recipe. + VPValue *getMask() const { + // Mask is the second last, mandatory operand. + return getOperand(getNumOperands() - 2); + } + + /// Return the EVL used by this recipe. + VPValue *getEVL() const { + // EVL is the last, mandatory operand. + return getOperand(getNumOperands() - 1); + } + + /// Generate the wide load/store. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif +}; + /// A Recipe for widening the canonical induction variable of the vector loop. class VPWidenCanonicalIVRecipe : public VPRecipeBase { public: @@ -1986,6 +2140,16 @@ /// the tail. VPValue *BackedgeTakenCount = nullptr; + /// Represents the trip count of the original loop, for computing EVL. + VPValue *TripCount = nullptr; + + /// Represents the runtime VF. 
Some recipes like Vector Predicated recipes may + /// use runtime VF as an operand. At the time of plan construction it is + /// known that this value is a loop invariant, but the corresponding IR value + /// is only available at plan execution once the final VF and corresponding + /// plan are chosen. + VPValue *RuntimeVF = nullptr; + /// Holds a mapping between Values and their corresponding VPValue inside /// VPlan. Value2VPValueTy Value2VPValue; @@ -2015,6 +2179,10 @@ delete VPV; if (BackedgeTakenCount) delete BackedgeTakenCount; + if (TripCount) + delete TripCount; + if (RuntimeVF) + delete RuntimeVF; for (VPValue *Def : VPExternalDefs) delete Def; } @@ -2038,6 +2206,21 @@ return BackedgeTakenCount; } + /// The trip count of the original loop. + VPValue *getOrCreateTripCount() { + if (!TripCount) + TripCount = new VPValue(); + return TripCount; + } + + /// A VPValue representing the loop invariant runtime VF to be expanded at + /// plan execution. + VPValue *getOrCreateRuntimeVF() { + if (!RuntimeVF) + RuntimeVF = new VPValue(); + return RuntimeVF; + } + void addVF(ElementCount VF) { VFs.insert(VF); } bool hasVF(ElementCount VF) { return VFs.count(VF); } Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -580,6 +580,7 @@ case VPWidenPHISC: case VPBlendSC: case VPWidenSC: + case VPPredicatedWidenSC: case VPWidenGEPSC: case VPReductionSC: case VPWidenSelectSC: { @@ -687,6 +688,12 @@ State.set(this, Call, Part); break; } + case VPInstruction::AllTrueMask: { + Value *AllTrueMask = Builder.getTrueVector(State.VF); + State.set(this, AllTrueMask, Part); + break; + } + default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -729,7 +736,9 @@ case VPInstruction::ActiveLaneMask: O << "active lane mask"; break; - + case VPInstruction::AllTrueMask: + O << "all true mask"; + break; default: O << 
Instruction::getOpcodeName(getOpcode()); } @@ -745,10 +754,26 @@ /// LoopVectorBody basic-block was created for this. Introduce additional /// basic-blocks as needed, and fill them all. void VPlan::execute(VPTransformState *State) { + IRBuilder<> Builder(State->CFG.PrevBB->getTerminator()); + + // -3 Check if the trip count is needed, if so build it. + if (TripCount && TripCount->getNumUsers()) { + Value *TC = State->TripCount; + for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) + State->set(TripCount, TC, Part); + } + + // -2 Set the runtime VF if it is needed. + if (RuntimeVF && RuntimeVF->getNumUsers()) { + Value *RuntimeVFVal = + getRuntimeVF(Builder, Builder.getInt32Ty(), State->VF); + for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) + State->set(RuntimeVF, RuntimeVFVal, Part); + } + // -1. Check if the backedge taken count is needed, and if so build it. if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) { Value *TC = State->TripCount; - IRBuilder<> Builder(State->CFG.PrevBB->getTerminator()); auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1), "trip.count.minus.1"); auto VF = State->VF; @@ -917,6 +942,16 @@ Plan.BackedgeTakenCount->print(OS, SlotTracker); OS << " := BackedgeTakenCount"; } + if (Plan.TripCount) { + OS << "\\n"; + Plan.TripCount->print(OS, SlotTracker); + OS << " := TripCount"; + } + if (Plan.RuntimeVF) { + OS << "\\n"; + Plan.RuntimeVF->print(OS, SlotTracker); + OS << " := RuntimeVF"; + } OS << "\"]\n"; OS << "node [shape=rect, fontname=Courier, fontsize=30]\n"; OS << "edge [fontname=Courier, fontsize=30]\n"; @@ -1069,6 +1104,14 @@ printOperands(O, SlotTracker); } +void VPPredicatedWidenRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "PREDICATED-WIDEN "; + printAsOperand(O, SlotTracker); + O << " = " << getUnderlyingInstr()->getOpcodeName() << " "; + printOperands(O, SlotTracker); +} + void VPWidenIntOrFpInductionRecipe::print(raw_ostream 
&O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-INDUCTION"; @@ -1187,6 +1230,19 @@ } + +void VPPredicatedWidenMemoryInstructionRecipe::print( + raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { + O << Indent << "PREDICATED-WIDEN "; + + if (!isStore()) { + getVPValue()->printAsOperand(O, SlotTracker); + O << " = "; + } + O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; + + printOperands(O, SlotTracker); +} #endif void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Value *CanonicalIV = State.CanonicalIV; Type *STy = CanonicalIV->getType(); @@ -1221,6 +1277,13 @@ } + +void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + getEVL()->printAsOperand(O, SlotTracker); + O << " = GENERATE-EXPLICIT-VECTOR-LENGTH"; +} #endif template void DomTreeBuilder::Calculate(VPDominatorTree &DT); void VPValue::replaceAllUsesWith(VPValue *New) { @@ -1322,6 +1385,12 @@ if (Plan.BackedgeTakenCount) assignSlot(Plan.BackedgeTakenCount); + if (Plan.TripCount) + assignSlot(Plan.TripCount); + + if (Plan.RuntimeVF) + assignSlot(Plan.RuntimeVF); + ReversePostOrderTraversal< VPBlockRecursiveTraversalWrapper> RPOT(VPBlockRecursiveTraversalWrapper( Index: llvm/lib/Transforms/Vectorize/VPlanValue.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlanValue.h +++ llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -51,6 +51,7 @@ friend class VPSlotTracker; friend class VPRecipeBase; friend class VPWidenMemoryInstructionRecipe; + friend class VPPredicatedWidenMemoryInstructionRecipe; const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). 
@@ -94,10 +95,13 @@ VPVInstructionSC, VPVMemoryInstructionSC, VPVPredInstPHI, + VPVPredicatedMemoryInstructionSC, + VPVPredicatedWidenSC, VPVReductionSC, VPVReplicateSC, VPVWidenSC, VPVWidenCallSC, + VPVWidenEVLSC, VPVWidenGEPSC, VPVWidenIntOrFpIndcutionSC, VPVWidenPHISC, @@ -319,10 +323,13 @@ VPInstructionSC, VPInterleaveSC, VPPredInstPHISC, + VPPredicatedWidenMemoryInstructionSC, + VPPredicatedWidenSC, VPReductionSC, VPReplicateSC, VPWidenCallSC, VPWidenCanonicalIVSC, + VPWidenEVLSC, VPWidenGEPSC, VPWidenIntOrFpInductionSC, VPWidenMemoryInstructionSC, Index: llvm/test/Transforms/LoopVectorize/vectorize-vp-intrinsics.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/vectorize-vp-intrinsics.ll @@ -0,0 +1,292 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=without-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=WITHOUT-AVL %s + +; RUN: opt -loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=IF-AVL %s + +; RUN: opt -loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=FORCE-AVL %s + +; RUN: opt -loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +target datalayout = 
"e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) local_unnamed_addr { +; WITHOUT-AVL-LABEL: @foo( +; WITHOUT-AVL-NEXT: entry: +; WITHOUT-AVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; WITHOUT-AVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; WITHOUT-AVL: for.body.preheader: +; WITHOUT-AVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; WITHOUT-AVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; WITHOUT-AVL: vector.ph: +; WITHOUT-AVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 3 +; WITHOUT-AVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; WITHOUT-AVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; WITHOUT-AVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; WITHOUT-AVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 +; WITHOUT-AVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; WITHOUT-AVL-NEXT: br label [[VECTOR_BODY:%.*]] +; WITHOUT-AVL: vector.body: +; WITHOUT-AVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; WITHOUT-AVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 +; WITHOUT-AVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; WITHOUT-AVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; WITHOUT-AVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; WITHOUT-AVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]] +; WITHOUT-AVL-NEXT: 
[[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; WITHOUT-AVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 +; WITHOUT-AVL-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; WITHOUT-AVL-NEXT: [[VP_OP_LOAD:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* [[TMP4]], i32 4, <4 x i1> [[TMP1]], i32 4) +; WITHOUT-AVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] +; WITHOUT-AVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; WITHOUT-AVL-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; WITHOUT-AVL-NEXT: [[VP_OP_LOAD3:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* [[TMP7]], i32 4, <4 x i1> [[TMP1]], i32 4) +; WITHOUT-AVL-NEXT: [[VP_OP:%.*]] = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> [[VP_OP_LOAD3]], <4 x i32> [[VP_OP_LOAD]], <4 x i1> [[TMP1]], i32 4) +; WITHOUT-AVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; WITHOUT-AVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 +; WITHOUT-AVL-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; WITHOUT-AVL-NEXT: call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> [[VP_OP]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[TMP1]], i32 4) +; WITHOUT-AVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; WITHOUT-AVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; WITHOUT-AVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; WITHOUT-AVL: middle.block: +; WITHOUT-AVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; WITHOUT-AVL: scalar.ph: +; WITHOUT-AVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; WITHOUT-AVL-NEXT: br label [[FOR_BODY:%.*]] +; WITHOUT-AVL: for.cond.cleanup.loopexit: +; WITHOUT-AVL-NEXT: br label [[FOR_COND_CLEANUP]] +; WITHOUT-AVL: for.cond.cleanup: +; 
WITHOUT-AVL-NEXT: ret void +; WITHOUT-AVL: for.body: +; WITHOUT-AVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; WITHOUT-AVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; WITHOUT-AVL-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; WITHOUT-AVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] +; WITHOUT-AVL-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; WITHOUT-AVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]] +; WITHOUT-AVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; WITHOUT-AVL-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; WITHOUT-AVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; WITHOUT-AVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; WITHOUT-AVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; +; IF-AVL-LABEL: @foo( +; IF-AVL-NEXT: entry: +; IF-AVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; IF-AVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; IF-AVL: for.body.preheader: +; IF-AVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; IF-AVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-AVL: vector.ph: +; IF-AVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 3 +; IF-AVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; IF-AVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-AVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; IF-AVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 +; IF-AVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; 
IF-AVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-AVL: vector.body: +; IF-AVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-AVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 +; IF-AVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; IF-AVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; IF-AVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; IF-AVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]] +; IF-AVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; IF-AVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 +; IF-AVL-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; IF-AVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP4]], i32 4, <4 x i1> [[TMP1]], <4 x i32> poison) +; IF-AVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] +; IF-AVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; IF-AVL-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; IF-AVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP7]], i32 4, <4 x i1> [[TMP1]], <4 x i32> poison) +; IF-AVL-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; IF-AVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; IF-AVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 +; IF-AVL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; IF-AVL-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], i32 4, <4 x i1> [[TMP1]]) +; IF-AVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; IF-AVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-AVL-NEXT: br 
i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-AVL: middle.block: +; IF-AVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; IF-AVL: scalar.ph: +; IF-AVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-AVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-AVL: for.cond.cleanup.loopexit: +; IF-AVL-NEXT: br label [[FOR_COND_CLEANUP]] +; IF-AVL: for.cond.cleanup: +; IF-AVL-NEXT: ret void +; IF-AVL: for.body: +; IF-AVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; IF-AVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; IF-AVL-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; IF-AVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] +; IF-AVL-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; IF-AVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] +; IF-AVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; IF-AVL-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; IF-AVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; IF-AVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; IF-AVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; +; FORCE-AVL-LABEL: @foo( +; FORCE-AVL-NEXT: entry: +; FORCE-AVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; FORCE-AVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; FORCE-AVL: for.body.preheader: +; FORCE-AVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; FORCE-AVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-AVL: vector.ph: +; FORCE-AVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 3 
+; FORCE-AVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; FORCE-AVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; FORCE-AVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-AVL: vector.body: +; FORCE-AVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-AVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 +; FORCE-AVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FORCE-AVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; FORCE-AVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; FORCE-AVL-NEXT: [[TMP1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[TMP0]] +; FORCE-AVL-NEXT: [[TMP2:%.*]] = call i64 @llvm.umin.i64(i64 4, i64 [[TMP1]]) +; FORCE-AVL-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 +; FORCE-AVL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; FORCE-AVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0 +; FORCE-AVL-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; FORCE-AVL-NEXT: [[VP_OP_LOAD:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* [[TMP6]], i32 4, <4 x i1> , i32 [[TMP3]]) +; FORCE-AVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] +; FORCE-AVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 +; FORCE-AVL-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* +; FORCE-AVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* [[TMP9]], i32 4, <4 x i1> , i32 [[TMP3]]) +; FORCE-AVL-NEXT: [[VP_OP:%.*]] = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> [[VP_OP_LOAD1]], <4 x i32> [[VP_OP_LOAD]], <4 x i1> , i32 [[TMP3]]) +; FORCE-AVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; FORCE-AVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 +; 
FORCE-AVL-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* +; FORCE-AVL-NEXT: call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> [[VP_OP]], <4 x i32>* [[TMP12]], i32 4, <4 x i1> , i32 [[TMP3]]) +; FORCE-AVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; FORCE-AVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCE-AVL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FORCE-AVL: middle.block: +; FORCE-AVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; FORCE-AVL: scalar.ph: +; FORCE-AVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; FORCE-AVL-NEXT: br label [[FOR_BODY:%.*]] +; FORCE-AVL: for.cond.cleanup.loopexit: +; FORCE-AVL-NEXT: br label [[FOR_COND_CLEANUP]] +; FORCE-AVL: for.cond.cleanup: +; FORCE-AVL-NEXT: ret void +; FORCE-AVL: for.body: +; FORCE-AVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; FORCE-AVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; FORCE-AVL-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; FORCE-AVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] +; FORCE-AVL-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; FORCE-AVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP14]] +; FORCE-AVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; FORCE-AVL-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; FORCE-AVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; FORCE-AVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; FORCE-AVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; +; NO-VP-LABEL: @foo( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[CMP10:%.*]] 
= icmp sgt i32 [[N:%.*]], 0 +; NO-VP-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; NO-VP: for.body.preheader: +; NO-VP-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 3 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 +; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 +; NO-VP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; NO-VP-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]] +; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 +; NO-VP-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP4]], i32 4, <4 x i1> [[TMP1]], <4 x i32> poison) +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; 
NO-VP-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; NO-VP-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP7]], i32 4, <4 x i1> [[TMP1]], <4 x i32> poison) +; NO-VP-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 +; NO-VP-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; NO-VP-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], i32 4, <4 x i1> [[TMP1]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.cond.cleanup.loopexit: +; NO-VP-NEXT: br label [[FOR_COND_CLEANUP]] +; NO-VP: for.cond.cleanup: +; NO-VP-NEXT: ret void +; NO-VP: for.body: +; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] +; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: store i32 [[ADD]], i32* 
[[ARRAYIDX4]], align 4 +; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} Index: llvm/test/Transforms/LoopVectorize/vplan-vp-intrinsics.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/vplan-vp-intrinsics.ll @@ -0,0 +1,111 @@ +; REQUIRES: asserts, x86-registered-target + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=without-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=WITHOUT-AVL %s + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize
-force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=IF-AVL %s + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=FORCE-AVL %s + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; foo stores the element-wise sum of b and c into a for the first N elements. Each check prefix pins the initial VPlan printed for one -prefer-predicate-with-vp-intrinsics mode; the x86_64 triple and -mattr=+avx512f are why the test requires x86-registered-target. +define dso_local void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) local_unnamed_addr { +; WITHOUT-AVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; WITHOUT-AVL-NEXT: for.body: +; WITHOUT-AVL-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; WITHOUT-AVL-NEXT: EMIT vp<%3> = icmp ule ir<%indvars.iv> vp<%0> +; WITHOUT-AVL-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN ir<%0> = load ir<%arrayidx>, vp<%3>, vp<%1> +; WITHOUT-AVL-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN ir<%1> = load ir<%arrayidx2>, vp<%3>, vp<%1> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN ir<%add> = add ir<%1>, ir<%0>, vp<%3>, vp<%1> +; WITHOUT-AVL-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN store 
ir<%arrayidx4>, ir<%add>, vp<%3>, vp<%1> +; WITHOUT-AVL-NEXT: No successors +; WITHOUT-AVL-NEXT: } + +; IF-AVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; IF-AVL-NEXT: for.body: +; IF-AVL-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; IF-AVL-NEXT: EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0> +; IF-AVL-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; IF-AVL-NEXT: WIDEN ir<%0> = load ir<%arrayidx>, vp<%2> +; IF-AVL-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; IF-AVL-NEXT: WIDEN ir<%1> = load ir<%arrayidx2>, vp<%2> +; IF-AVL-NEXT: WIDEN ir<%add> = add ir<%1>, ir<%0> +; IF-AVL-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; IF-AVL-NEXT: WIDEN store ir<%arrayidx4>, ir<%add>, vp<%2> +; IF-AVL-NEXT: No successors +; IF-AVL-NEXT: } + +; FORCE-AVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; FORCE-AVL-NEXT: for.body: +; FORCE-AVL-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; FORCE-AVL-NEXT: EMIT vp<%2> = GENERATE-EXPLICIT-VECTOR-LENGTH +; FORCE-AVL-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; FORCE-AVL-NEXT: EMIT vp<%4> = all true mask +; FORCE-AVL-NEXT: PREDICATED-WIDEN ir<%0> = load ir<%arrayidx>, vp<%4>, vp<%2> +; FORCE-AVL-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; FORCE-AVL-NEXT: PREDICATED-WIDEN ir<%1> = load ir<%arrayidx2>, vp<%4>, vp<%2> +; FORCE-AVL-NEXT: PREDICATED-WIDEN ir<%add> = add ir<%1>, ir<%0>, vp<%4>, vp<%2> +; FORCE-AVL-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; FORCE-AVL-NEXT: PREDICATED-WIDEN store ir<%arrayidx4>, ir<%add>, vp<%4>, vp<%2> +; FORCE-AVL-NEXT: No successors +; FORCE-AVL-NEXT: } + +; NO-VP: VPlan 'Initial VPlan for VF={4},UF>=1' { +; NO-VP-NEXT: for.body: +; NO-VP-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; NO-VP-NEXT: EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0> +; NO-VP-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; 
NO-VP-NEXT: WIDEN ir<%0> = load ir<%arrayidx>, vp<%2> +; NO-VP-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; NO-VP-NEXT: WIDEN ir<%1> = load ir<%arrayidx2>, vp<%2> +; NO-VP-NEXT: WIDEN ir<%add> = add ir<%1>, ir<%0> +; NO-VP-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; NO-VP-NEXT: WIDEN store ir<%arrayidx4>, ir<%add>, vp<%2> +; NO-VP-NEXT: No successors +; NO-VP-NEXT: } + +entry: + %cmp10 = icmp sgt i32 %N, 0 ; the loop is entered only when N > 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 ; loop bound widened to the i64 induction type + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 ; sum of the elements loaded from c and b + %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 ; written to the matching element of a + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +}