diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1364,6 +1364,12 @@ /// Intrinsics") Use of %evl is discouraged when that is not the case. bool hasActiveVectorLength() const; + /// If the target uses custom instruction to compute + /// active vector length, use an intrinsic in the IR that will be lowered to + /// this instruction. Else, the IR will use instructions for computing Min(VF, + /// TripCount - Induction). + bool useCustomActiveVectorLengthIntrinsic() const; + /// @} /// @} @@ -1663,6 +1669,7 @@ virtual unsigned getGISelRematGlobalCost() const = 0; virtual bool supportsScalableVectors() const = 0; virtual bool hasActiveVectorLength() const = 0; + virtual bool useCustomActiveVectorLengthIntrinsic() const = 0; virtual InstructionCost getInstructionLatency(const Instruction *I) = 0; }; @@ -2216,6 +2223,10 @@ return Impl.hasActiveVectorLength(); } + bool useCustomActiveVectorLengthIntrinsic() const override { + return Impl.useCustomActiveVectorLengthIntrinsic(); + } + InstructionCost getInstructionLatency(const Instruction *I) override { return Impl.getInstructionLatency(I); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -740,6 +740,8 @@ bool hasActiveVectorLength() const { return false; } + bool useCustomActiveVectorLengthIntrinsic() const { return false; } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -43,6 +43,7 @@ #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/TypeSize.h" #include #include #include @@ -2534,6 +2535,18 @@ unsigned Index, unsigned FieldIndex, MDNode *DbgInfo); + /// Return an all true boolean vector of size and scalability \p NumElts. + Value *CreateTrueVector(ElementCount NumElts) { + VectorType *VTy = VectorType::get(Type::getInt1Ty(Context), NumElts); + return Constant::getAllOnesValue(VTy); + } + + /// Return an all false boolean vector of size and scalability \p NumElts. 
+ Value *CreateFalseVector(ElementCount NumElts) { + VectorType *VTy = VectorType::get(Type::getInt1Ty(Context), NumElts); + return ConstantAggregateZero::get(VTy); + } + private: /// Helper function that creates an assume intrinsic call that /// represents an alignment assumption on the provided pointer \p PtrValue diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1342,6 +1342,26 @@ //===---------------- Vector Predication Intrinsics --------------===// +// Memory Intrinsics +def int_vp_store : DefaultAttrsIntrinsic<[], + [ llvm_anyvector_ty, + LLVMAnyPointerType>, + llvm_i32_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], + [ NoCapture>, IntrNoSync, IntrWriteMem, + IntrArgMemOnly, IntrWillReturn, + ImmArg> ]>; + +def int_vp_load : DefaultAttrsIntrinsic<[ llvm_anyvector_ty], + [ LLVMAnyPointerType>, + llvm_i32_ty, + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, + llvm_i32_ty], + [ NoCapture>, IntrNoSync, IntrReadMem, + IntrWillReturn, IntrArgMemOnly, + ImmArg> ]>; + // Speculatable Binary operators let IntrProperties = [IntrSpeculatable, IntrNoMem, IntrNoSync, IntrWillReturn] in { def int_vp_add : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ], @@ -1420,6 +1440,11 @@ [llvm_anyint_ty, LLVMMatchType<1>], [IntrNoMem, IntrNoSync, IntrWillReturn]>; +def int_experimental_set_vector_length: + DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem, IntrNoSync, IntrWillReturn]>; + //===-------------------------- Masked Intrinsics -------------------------===// // def int_masked_load: diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1033,6 +1033,14 @@ return TTIImpl->supportsScalableVectors(); } +bool TargetTransformInfo::hasActiveVectorLength() const { + return TTIImpl->hasActiveVectorLength(); +} + +bool TargetTransformInfo::useCustomActiveVectorLengthIntrinsic() const { + return TTIImpl->useCustomActiveVectorLengthIntrinsic(); +} + InstructionCost TargetTransformInfo::getInstructionLatency(const Instruction *I) const { return TTIImpl->getInstructionLatency(I); diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -60,6 +60,7 @@ #include "VPlanHCFGBuilder.h" #include "VPlanPredicator.h" #include "VPlanTransforms.h" +#include "VPlanValue.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -233,6 +234,54 @@ "prefers tail-folding, don't attempt vectorization if " "tail-folding fails."))); +// Option prefer-predicate-with-vp-intrinsics is an experimental switch to +// indicate that the loop vectorizer should try to generate VP intrinsics if +// tail-folding is enabled (note that this option is dependent on the +// prefer-predicate-over-epilogue option being set to predicate-dont-vectorize). +// This can be particularly useful for targets like RISC-V and SX-Aurora that +// support vector length predication. +// Currently this switch takes four possible values: +// 0. no-predication: Do not generate VP intrinsics. +// 1. if-active-vector-length-supported: Only generate VP intrinsics if the +// target supports active vector length based predication. +// 2. 
without-avl-support: Generate VP intrinsics even if vector length based
+// predication is not supported. This will behave a bit like existing
+// tail-folding by using a mask for predication, except all instructions are
+// widened to VP intrinsics and not just memory instructions. Use of this
+// option is discouraged and is only meant for experimental/testing purposes.
+// 3. force-active-vector-length-support: This is purely an experimental/testing
+// option which will be removed in the future. It forces the loop vectorizer to
+// assume that the target supports vector length predication.
+namespace PreferVPIntrinsicsTy {
+enum Option {
+  NoPredication = 0,
+  IfAVLSupported,
+  WithoutAVLSupport,
+  ForceAVLSupport
+};
+} // namespace PreferVPIntrinsicsTy
+
+static cl::opt<PreferVPIntrinsicsTy::Option> PreferPredicateWithVPIntrinsics(
+    "prefer-predicate-with-vp-intrinsics",
+    cl::init(PreferVPIntrinsicsTy::NoPredication), cl::Hidden,
+    cl::desc("When vectorizing with tail-folding, generate vector predication "
+             "intrinsics."),
+    cl::values(
+        clEnumValN(PreferVPIntrinsicsTy::NoPredication, "no-predication",
+                   "Do not generate VP intrinsics."),
+        clEnumValN(PreferVPIntrinsicsTy::IfAVLSupported,
+                   "if-active-vector-length-support",
+                   "Only generate VP intrinsics if the target supports vector "
+                   "length predication."),
+        clEnumValN(PreferVPIntrinsicsTy::WithoutAVLSupport,
+                   "without-active-vector-length-support",
+                   "Generate VP intrinsics even if vector length predication "
+                   "is not supported. This option is discouraged."),
+        clEnumValN(PreferVPIntrinsicsTy::ForceAVLSupport,
+                   "force-active-vector-length-support",
+                   "Assume that the target supports vector length predication "
+                   "and generate VP intrinsics accordingly.")));
+
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
     cl::desc("Maximize bandwidth when selecting vectorization factor which "
@@ -480,6 +529,11 @@
   void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
                         VPTransformState &State);
 
+  /// Widen a single instruction to a VP intrinsic within the innermost loop.
+  void widenPredicatedInstruction(Instruction &I, VPValue *Def,
+                                  VPUser &Operands, VPTransformState &State,
+                                  VPValue *BlockInMask, VPValue *EVL);
+
   /// Widen a single call instruction within the innermost loop.
   void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
                             VPTransformState &State);
@@ -548,7 +602,8 @@
   /// vectorized loop.
   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
                                   VPValue *Def, VPValue *Addr,
-                                  VPValue *StoredValue, VPValue *BlockInMask);
+                                  VPValue *StoredValue, VPValue *BlockInMask,
+                                  VPValue *EVL = nullptr);
 
   /// Set the debug location in the builder using the debug location in
   /// the instruction.
@@ -564,6 +619,10 @@
   /// element.
   virtual Value *getBroadcastInstrs(Value *V);
 
+  /// Create instructions to compute the Explicit Vector Length when using VP
+  /// intrinsics.
+  Value *createEVL();
+
 protected:
   friend class LoopVectorizationPlanner;
 
@@ -1568,6 +1627,11 @@
     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
   }
 
+  /// Returns true if VP intrinsics should be generated in the tail-folded loop.
+  bool preferVPIntrinsics() const {
+    return foldTailByMasking() && PreferVPIntrinsics;
+  }
+
   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
   /// nodes to the chain of instructions representing the reductions. Uses a
   /// MapVector to ensure deterministic iteration order.
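Note: with tail folding and prefer-predicate-with-vp-intrinsics enabled, the recipes introduced below widen loads, stores and the supported binary operators into llvm.vp.* calls that carry both the block-in mask and the per-part Explicit Vector Length (EVL). As a rough sketch for VF=4, the vector body of a simple a[i] = b[i] + c[i] loop becomes the following (value names are illustrative; the exact sequences are checked in the new tests at the end of this patch):

  %vp.op.load  = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* %b.ptr, i32 4, <4 x i1> %mask, i32 %evl)
  %vp.op.load3 = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* %c.ptr, i32 4, <4 x i1> %mask, i32 %evl)
  %vp.op       = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %vp.op.load3, <4 x i32> %vp.op.load, <4 x i1> %mask, i32 %evl)
  call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> %vp.op, <4 x i32>* %a.ptr, i32 4, <4 x i1> %mask, i32 %evl)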
@@ -1704,6 +1768,9 @@ /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; + /// Control whether to generate VP intrinsics in vectorized code. + bool PreferVPIntrinsics = false; + /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated @@ -2834,7 +2901,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction( Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr, - VPValue *StoredValue, VPValue *BlockInMask) { + VPValue *StoredValue, VPValue *BlockInMask, VPValue *EVL) { // Attempt to issue a wide load. LoadInst *LI = dyn_cast(Instr); StoreInst *SI = dyn_cast(Instr); @@ -2863,6 +2930,13 @@ bool CreateGatherScatter = (Decision == LoopVectorizationCostModel::CM_GatherScatter); + if (Reverse) + assert(!EVL && + "Vector reverse not supported for predicated vectorization."); + if (CreateGatherScatter) + assert(!EVL && "Gather/Scatter operations not supported for " + "predicated vectorization."); + // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector // gather/scatter. Otherwise Decision should have been to Scalarize. assert((ConsecutiveStride || CreateGatherScatter) && @@ -2918,6 +2992,13 @@ for (unsigned Part = 0; Part < UF; ++Part) { Instruction *NewSI = nullptr; Value *StoredVal = State.get(StoredValue, Part); + + // If EVL is not nullptr, then EVL must be a valid value set during plan + // creation, possibly default value = whole vector register length. EVL is + // created only if TTI prefers predicated vectorization, thus if EVL is + // not nullptr it also implies preference for predicated vectorization. + Value *EVLPart = EVL ? State.get(EVL, Part) : nullptr; + if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(Addr, Part); @@ -2932,11 +3013,25 @@ // another expression. So don't call resetVectorValue(StoredVal). } auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) + // if EVLPart is not null, we can vectorize using predicated + // intrinsic. + if (EVLPart) { + assert(isMaskRequired && + "Mask argument is required for VP intrinsics."); + VectorType *StoredValTy = cast(StoredVal->getType()); + Value *BlockInMaskPart = BlockInMaskParts[Part]; + Value *EVLPartI32 = Builder.CreateSExtOrTrunc( + EVLPart, Type::getInt32Ty(Builder.getContext())); + NewSI = Builder.CreateIntrinsic( + Intrinsic::vp_store, {StoredValTy, VecPtr->getType()}, + {StoredVal, VecPtr, Builder.getInt32(Alignment.value()), + BlockInMaskPart, EVLPartI32}); + } else if (isMaskRequired) { NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, BlockInMaskParts[Part]); - else + } else { NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment); + } } addMetadata(NewSI, SI); } @@ -2948,6 +3043,13 @@ setDebugLocFromInst(Builder, LI); for (unsigned Part = 0; Part < UF; ++Part) { Value *NewLI; + + // If EVL is not nullptr, then EVL must be a valid value set during plan + // creation, possibly default value = whole vector register length. EVL is + // created only if TTI prefers predicated vectorization, thus if EVL is + // not nullptr it also implies preference for predicated vectorization. + Value *EVLPart = EVL ? State.get(EVL, Part) : nullptr; + if (CreateGatherScatter) { Value *MaskPart = isMaskRequired ? 
BlockInMaskParts[Part] : nullptr; Value *VectorGep = State.get(Addr, Part); @@ -2956,13 +3058,26 @@ addMetadata(NewLI, LI); } else { auto *VecPtr = CreateVecPtr(Part, State.get(Addr, VPIteration(0, 0))); - if (isMaskRequired) + if (EVLPart) { + assert(isMaskRequired && + "Mask argument is required for VP intrinsics."); + Value *BlockInMaskPart = BlockInMaskParts[Part]; + Value *EVLPartI32 = Builder.CreateSExtOrTrunc( + EVLPart, Type::getInt32Ty(Builder.getContext())); + NewLI = Builder.CreateIntrinsic( + Intrinsic::vp_load, + {VecPtr->getType()->getPointerElementType(), VecPtr->getType()}, + {VecPtr, Builder.getInt32(Alignment.value()), BlockInMaskPart, + EVLPartI32}, + nullptr, "vp.op.load"); + } else if (isMaskRequired) { NewLI = Builder.CreateMaskedLoad( VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy), "wide.masked.load"); - else + } else { NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load"); + } // Add metadata to the load, but setVectorValue to the reverse shuffle. addMetadata(NewLI, LI); @@ -4840,6 +4955,73 @@ return !CInt || CInt->isZero(); } +void InnerLoopVectorizer::widenPredicatedInstruction(Instruction &I, + VPValue *Def, VPUser &User, + VPTransformState &State, + VPValue *BlockInMask, + VPValue *EVL) { + auto getVPIntrInstr = [](unsigned Opcode) { + switch (Opcode) { + case Instruction::Add: + return Intrinsic::vp_add; + case Instruction::Sub: + return Intrinsic::vp_sub; + case Instruction::Mul: + return Intrinsic::vp_mul; + case Instruction::SDiv: + return Intrinsic::vp_sdiv; + case Instruction::UDiv: + return Intrinsic::vp_udiv; + case Instruction::SRem: + return Intrinsic::vp_srem; + case Instruction::URem: + return Intrinsic::vp_urem; + case Instruction::AShr: + return Intrinsic::vp_ashr; + case Instruction::LShr: + return Intrinsic::vp_lshr; + case Instruction::Shl: + return Intrinsic::vp_shl; + case Instruction::Or: + return Intrinsic::vp_or; + case Instruction::And: + return Intrinsic::vp_and; + case Instruction::Xor: + return Intrinsic::vp_xor; + } + return Intrinsic::not_intrinsic; + }; + + unsigned Opcode = I.getOpcode(); + assert(getVPIntrInstr(Opcode) != Intrinsic::not_intrinsic && + "Instruction does not have VP intrinsic support."); + + // Just widen unops and binops. + setDebugLocFromInst(Builder, &I); + + for (unsigned Part = 0; Part < UF; ++Part) { + SmallVector Ops; + for (unsigned OpIdx = 0; OpIdx < User.getNumOperands() - 2; OpIdx++) + Ops.push_back(State.get(User.getOperand(OpIdx), Part)); + + VectorType *OpTy = cast(Ops[0]->getType()); + Value *MaskOp = State.get(BlockInMask, Part); + Ops.push_back(MaskOp); + + Value *EVLOp = State.get(EVL, Part); + Ops.push_back(EVLOp); + + Value *V = Builder.CreateIntrinsic(getVPIntrInstr(Opcode), OpTy, Ops, + nullptr, "vp.op"); + if (auto *VecOp = dyn_cast(V)) + VecOp->copyIRFlags(&I); + + // Use this vector value for all users of the original instruction. + State.set(Def, V, Part); + addMetadata(V, &I); + } +} + void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def, VPUser &User, VPTransformState &State) { @@ -5655,6 +5837,28 @@ // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; + if (!PreferPredicateWithVPIntrinsics) + return MaxVF; + + if (UserIC > 1) { + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. 
Will " + "not generate VP intrinsics since interleave count " + "specified is greater than 1.\n"); + return MaxVF; + } + + if (PreferPredicateWithVPIntrinsics == + PreferVPIntrinsicsTy::IfAVLSupported) { + PreferVPIntrinsics = TTI.hasActiveVectorLength(); + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will " + "try to generate VP Intrinsics if the target " + "support vector length predication.\n"); + } else { + PreferVPIntrinsics = true; + LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. Will " + "try to generate VP Intrinsics.\n"); + } + return MaxVF; } @@ -6132,6 +6336,11 @@ if (!isScalarEpilogueAllowed()) return 1; + // Do not interleave if VP intrinsics are preferred and no User IC is + // specified. + if (preferVPIntrinsics()) + return 1; + // We used the distance for the interleave count. if (Legal->getMaxSafeDepDistBytes() != -1U) return 1; @@ -8350,6 +8559,19 @@ if (!CM.blockNeedsPredication(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. + // if header block needs predication then it is only because tail-folding is + // enabled. If we are using VP intrinsics for a target with vector length + // predication support, this mask (icmp ule %IV %BTC) becomes redundant with + // EVL, which means unless we are using VP intrinsics without vector length + // predication support we can replace this mask with an all-true mask for + // possibly better latency. + if (CM.preferVPIntrinsics() && + PreferPredicateWithVPIntrinsics != + PreferVPIntrinsicsTy::WithoutAVLSupport) { + BlockMask = Builder.createNaryOp(VPInstruction::AllTrueMask, {}); + return BlockMaskCache[BB] = BlockMask; + } + // Create the block in mask as the first non-phi instruction in the block. VPBuilder::InsertPointGuard Guard(Builder); auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); @@ -8398,8 +8620,17 @@ return BlockMaskCache[BB] = BlockMask; } -VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, - VPlanPtr &Plan) { +VPValue *VPRecipeBuilder::getOrCreateEVL(VPlanPtr &Plan) { + if (!EVL) { + auto *EVLRecipe = new VPWidenEVLRecipe(); + Builder.getInsertBlock()->appendRecipe(EVLRecipe); + EVL = EVLRecipe->getEVL(); + } + return EVL; +} + +bool VPRecipeBuilder::validateWidenMemory(Instruction *I, + VFRange &Range) const { assert((isa(I) || isa(I)) && "Must be called with either a load or store"); @@ -8418,7 +8649,12 @@ return Decision != LoopVectorizationCostModel::CM_Scalarize; }; - if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) + return (LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)); +} + +VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan) { + if (!validateWidenMemory(I, Range)) return nullptr; VPValue *Mask = nullptr; @@ -8434,6 +8670,24 @@ return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask); } +VPRecipeBase *VPRecipeBuilder::tryToPredicatedWidenMemory(Instruction *I, + VFRange &Range, + VPlanPtr &Plan) { + if (!validateWidenMemory(I, Range)) + return nullptr; + + VPValue *Mask = createBlockInMask(I->getParent(), Plan); + VPValue *EVL = getOrCreateEVL(Plan); + VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I)); + if (LoadInst *Load = dyn_cast(I)) + return new VPPredicatedWidenMemoryInstructionRecipe(*Load, Addr, Mask, EVL); + + StoreInst *Store = cast(I); + VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand()); + return new 
VPPredicatedWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, + Mask, EVL); +} + VPWidenIntOrFpInductionRecipe * VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const { // Check if this is an integer or fp induction. If so, build the recipe that @@ -8563,7 +8817,11 @@ Range); } -VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { +bool VPRecipeBuilder::preferPredicatedWiden() const { + return CM.preferVPIntrinsics(); +} + +bool VPRecipeBuilder::validateWiden(Instruction *I) const { auto IsVectorizableOpcode = [](unsigned Opcode) { switch (Opcode) { case Instruction::Add: @@ -8605,13 +8863,28 @@ return false; }; - if (!IsVectorizableOpcode(I->getOpcode())) + return IsVectorizableOpcode(I->getOpcode()); +} + +VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const { + if (!validateWiden(I)) return nullptr; // Success: widen this instruction. return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands())); } +VPPredicatedWidenRecipe *VPRecipeBuilder::tryToPredicatedWiden(Instruction *I, + VPlanPtr &Plan) { + if (!validateWiden(I)) + return nullptr; + + VPValue *Mask = createBlockInMask(I->getParent(), Plan); + VPValue *EVL = getOrCreateEVL(Plan); + return new VPPredicatedWidenRecipe(*I, Plan->mapToVPValues(I->operands()), + Mask, EVL); +} + VPBasicBlock *VPRecipeBuilder::handleReplication( Instruction *I, VFRange &Range, VPBasicBlock *VPBB, VPlanPtr &Plan) { @@ -8700,8 +8973,12 @@ if (auto *CI = dyn_cast(Instr)) return toVPRecipeResult(tryToWidenCall(CI, Range, *Plan)); - if (isa(Instr) || isa(Instr)) + if (isa(Instr) || isa(Instr)) { + if (preferPredicatedWiden()) { + return toVPRecipeResult(tryToPredicatedWidenMemory(Instr, Range, Plan)); + } return toVPRecipeResult(tryToWidenMemory(Instr, Range, Plan)); + } VPRecipeBase *Recipe; if (auto Phi = dyn_cast(Instr)) { @@ -8738,6 +9015,9 @@ *SI, Plan->mapToVPValues(SI->operands()), InvariantCond)); } + if (preferPredicatedWiden()) { + return toVPRecipeResult(tryToPredicatedWiden(Instr, Plan)); + } return toVPRecipeResult(tryToWiden(Instr, *Plan)); } @@ -9115,6 +9395,11 @@ State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); } +void VPPredicatedWidenRecipe::execute(VPTransformState &State) { + State.ILV->widenPredicatedInstruction(*getUnderlyingInstr(), this, *this, + State, getMask(), getEVL()); +} + void VPWidenGEPRecipe::execute(VPTransformState &State) { State.ILV->widenGEP(cast(getUnderlyingInstr()), this, *this, State.UF, State.VF, IsPtrLoopInvariant, @@ -9321,6 +9606,63 @@ getAddr(), StoredValue, getMask()); } +void VPPredicatedWidenMemoryInstructionRecipe::execute( + VPTransformState &State) { + VPValue *StoredValue = isStore() ? getStoredValue() : nullptr; + State.ILV->vectorizeMemoryInstruction( + &Ingredient, State, StoredValue ? nullptr : getVPValue(), getAddr(), + StoredValue, getMask(), getEVL()); +} + +Value *InnerLoopVectorizer::createEVL() { + assert(PreferPredicateWithVPIntrinsics != + PreferVPIntrinsicsTy::NoPredication && + "Predication with VP intrinsics turned off."); + + if (PreferPredicateWithVPIntrinsics == PreferVPIntrinsicsTy::IfAVLSupported) + assert(TTI->hasActiveVectorLength() && + "Target does not support vector length predication."); + + auto *MinVF = Builder.getInt32(VF.getKnownMinValue()); + Value *RuntimeVL = + VF.isScalable() ? 
Builder.CreateVScale(MinVF, "vscale.x.vf") : MinVF; + + if (PreferPredicateWithVPIntrinsics == + PreferVPIntrinsicsTy::WithoutAVLSupport && + !TTI->hasActiveVectorLength()) { + return RuntimeVL; + } + + Value *Remaining = Builder.CreateSub(TripCount, Induction); + // FIXME: This is a proof-of-concept naive implementation to demonstrate using + // a target dependent intrinisc to compute the vector length. + if (TTI->useCustomActiveVectorLengthIntrinsic()) { + // Set Element width to the widest type used in the loop. + unsigned SmallestType, WidestType; + std::tie(SmallestType, WidestType) = Cost->getSmallestAndWidestTypes(); + Constant *ElementWidth = Builder.getInt32(WidestType); + // Set Register width factor to 1. + Constant *RegWidthFactor = Builder.getInt32(1); + return Builder.CreateIntrinsic(Intrinsic::experimental_set_vector_length, + {Remaining->getType()}, + {Remaining, ElementWidth, RegWidthFactor}); + } + + Value *RuntimeVLExt = Builder.CreateZExt(RuntimeVL, Remaining->getType()); + Value *EVL = + Builder.CreateBinaryIntrinsic(Intrinsic::umin, RuntimeVLExt, Remaining); + return Builder.CreateTrunc(EVL, Builder.getInt32Ty()); +} + +void VPWidenEVLRecipe::execute(VPTransformState &State) { + // FIXME: Interleaving with predicated vectorization is not yet supported. + // Since VPlan only provides set methods for per Part or per Instance, we use + // the per Part set method to store the same EVL for each Part (State.UF would + // be 1 for now.) + for (unsigned Part = 0; Part < State.UF; Part++) + State.set(getEVL(), State.ILV->createEVL(), Part); +} + // Determine how to lower the scalar epilogue, which depends on 1) optimising // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing // predication, and 4) a TTI hook that analyses whether the loop is suitable diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -41,6 +41,8 @@ VPBuilder &Builder; + VPValue *EVL = nullptr; + /// When we if-convert we need to create edge masks. We have to cache values /// so that we don't end up with exponential recursion/IR. Note that /// if-conversion currently takes place during VPlan-construction, so these @@ -67,6 +69,15 @@ VPRecipeBase *tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan); + /// Similar to tryToWidenMemory, but create a predicated recipe. The + /// predicated recipe takes mandatory mask and EVL VPInstructions. + VPRecipeBase *tryToPredicatedWidenMemory(Instruction *I, VFRange &Range, + VPlanPtr &Plan); + + /// Helper method used by tryToWidenMemory and tryToPredicatedWidenMemory to + /// validate if a memory instructions can be widened. + bool validateWidenMemory(Instruction *I, VFRange &Range) const; + /// Check if an induction recipe should be constructed for \I. If so build and /// return it. If not, return null. VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi, @@ -95,9 +106,19 @@ /// that widening should be performed. VPWidenRecipe *tryToWiden(Instruction *I, VPlan &Plan) const; + /// Similar to tryToWiden, but widen to VP intrinsics. + VPPredicatedWidenRecipe *tryToPredicatedWiden(Instruction *I, VPlanPtr &Plan); + + /// Helper method used by tryToWiden and tryToPredicatedWiden to validate if + /// an instruction can be widened. + bool validateWiden(Instruction *I) const; + /// Return a VPRecipeOrValueTy with VPRecipeBase * being set. 
This can be used to force the use as VPRecipeBase* for recipe sub-types that also inherit from VPValue. VPRecipeOrVPValueTy toVPRecipeResult(VPRecipeBase *R) const { return R; } + /// Create recipes that will expand to VP intrinsics. + bool preferPredicatedWiden() const; + public: VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI, LoopVectorizationLegality *Legal, @@ -132,6 +153,10 @@ /// and DST. VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); + /// A helper function that computes the Explicit(Active) Vector Length for the + /// current vector iteration. + VPValue *getOrCreateEVL(VPlanPtr &Plan); + /// Mark given ingredient for recording its recipe once one is created for /// it. void recordRecipeOf(Instruction *I) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -685,6 +685,7 @@ inline bool VPUser::classof(const VPDef *Def) { return Def->getVPDefID() == VPRecipeBase::VPInstructionSC || + Def->getVPDefID() == VPRecipeBase::VPPredicatedWidenSC || Def->getVPDefID() == VPRecipeBase::VPWidenSC || Def->getVPDefID() == VPRecipeBase::VPWidenCallSC || Def->getVPDefID() == VPRecipeBase::VPWidenSelectSC || @@ -694,6 +695,8 @@ Def->getVPDefID() == VPRecipeBase::VPReplicateSC || Def->getVPDefID() == VPRecipeBase::VPReductionSC || Def->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC || + Def->getVPDefID() == + VPRecipeBase::VPPredicatedWidenMemoryInstructionSC || Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC; } @@ -712,6 +715,7 @@ SLPLoad, SLPStore, ActiveLaneMask, + AllTrueMask, }; private: @@ -830,6 +834,45 @@ #endif }; +/// VPPredicatedWidenRecipe is a recipe for producing a copy of vector type +/// using VP intrinsics for its ingredient. This recipe covers most of the +/// traditional vectorization cases where each ingredient transforms into a +/// vectorized version of itself. +class VPPredicatedWidenRecipe : public VPRecipeBase, public VPValue { +public: + template + VPPredicatedWidenRecipe(Instruction &I, iterator_range Operands, + VPValue *Mask, VPValue *EVL) + : VPRecipeBase(VPRecipeBase::VPPredicatedWidenSC, Operands), + VPValue(VPValue::VPVPredicatedWidenSC, &I, this) { + addOperand(Mask); + addOperand(EVL); + } + + ~VPPredicatedWidenRecipe() override = default; + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPPredicatedWidenSC; + } + static inline bool classof(const VPValue *V) { + return V->getVPValueID() == VPValue::VPVPredicatedWidenSC; + } + + /// Return the mask used by this recipe. + VPValue *getMask() const { return getOperand(getNumOperands() - 2); } + + /// Return the explicit vector length used by this recipe. + VPValue *getEVL() const { return getOperand(getNumOperands() - 1); } + + /// Generate the wide load/store. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +}; + /// A recipe for widening Call instructions. class VPWidenCallRecipe : public VPRecipeBase, public VPValue { @@ -1300,6 +1343,33 @@ } }; +/// A recipe to generate Explicit Vector Length (EVL) value to be used with +/// VPred intrinsics. 
+class VPWidenEVLRecipe : public VPRecipeBase, public VPValue { + +public: + VPWidenEVLRecipe() + : VPRecipeBase(VPRecipeBase::VPWidenEVLSC, {}), + VPValue(VPValue::VPVWidenEVLSC, nullptr, this) {} + ~VPWidenEVLRecipe() override = default; + + /// Return the VPValue representing EVL. + const VPValue *getEVL() const { return this; } + VPValue *getEVL() { return this; } + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == VPRecipeBase::VPWidenEVLSC; + } + + /// Generate the instructions to compute EVL. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +}; + /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when /// control converges back from a Branch-on-Mask. The phi nodes are needed in /// order to merge values that are set under such a branch and feed their uses. @@ -1398,6 +1468,68 @@ #endif }; +/// A Recipe for widening load/store operations to VP intrinsics. +/// The recipe uses the following VPValues: +/// - For load: Address, mask, EVL +/// - For store: Address, stored value, mask, EVL +class VPPredicatedWidenMemoryInstructionRecipe : public VPRecipeBase { + Instruction &Ingredient; + +public: + VPPredicatedWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, + VPValue *Mask, VPValue *EVL) + : VPRecipeBase(VPPredicatedWidenMemoryInstructionSC, {Addr, Mask, EVL}), + Ingredient(Load) { + new VPValue(VPValue::VPVPredicatedMemoryInstructionSC, &Load, this); + } + + VPPredicatedWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr, + VPValue *StoredValue, VPValue *Mask, + VPValue *EVL) + : VPRecipeBase(VPPredicatedWidenMemoryInstructionSC, + {Addr, StoredValue, Mask, EVL}), + Ingredient(Store) {} + + /// Method to support type inquiry through isa, cast, and dyn_cast. + static inline bool classof(const VPDef *D) { + return D->getVPDefID() == + VPRecipeBase::VPPredicatedWidenMemoryInstructionSC; + } + + /// Return the address accessed by this recipe. + VPValue *getAddr() const { + return getOperand(0); // Address is the 1st, mandatory operand. + } + + /// Return the mask used by this recipe. + VPValue *getMask() const { + // Mask is the second last, mandatory operand. + return getOperand(getNumOperands() - 2); + } + + /// Return the EVL used by this recipe. + VPValue *getEVL() const { + // EVL is the last, mandatory operand. + return getOperand(getNumOperands() - 1); + } + + /// Returns true if this recipe is a store. + bool isStore() const { return isa(Ingredient); } + + /// Return the address accessed by this recipe. + VPValue *getStoredValue() const { + assert(isStore() && "Stored value only available for store instructions"); + return getOperand(1); // Stored value is the 2nd, mandatory operand. + } + + /// Generate the wide load/store. + void execute(VPTransformState &State) override; + + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +}; + /// A Recipe for widening the canonical induction variable of the vector loop. 
class VPWidenCanonicalIVRecipe : public VPRecipeBase { public: diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -598,6 +598,12 @@ State.set(this, Call, Part); break; } + case VPInstruction::AllTrueMask: { + Value *AllTrueMask = Builder.CreateTrueVector(State.VF); + State.set(this, AllTrueMask, Part); + break; + } + default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -640,7 +646,9 @@ case VPInstruction::ActiveLaneMask: O << "active lane mask"; break; - + case VPInstruction::AllTrueMask: + O << "all true mask"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } @@ -980,6 +988,14 @@ printOperands(O, SlotTracker); } +void VPPredicatedWidenRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "PREDICATED-WIDEN "; + printAsOperand(O, SlotTracker); + O << " = " << getUnderlyingInstr()->getOpcodeName() << " "; + printOperands(O, SlotTracker); +} + void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { O << Indent << "WIDEN-INDUCTION"; @@ -1084,6 +1100,19 @@ } #endif +void VPPredicatedWidenMemoryInstructionRecipe::print( + raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { + O << Indent << "PREDICATED-WIDEN "; + + if (!isStore()) { + getVPValue()->printAsOperand(O, SlotTracker); + O << " = "; + } + O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " "; + + printOperands(O, SlotTracker); +} + void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) { Value *CanonicalIV = State.CanonicalIV; Type *STy = CanonicalIV->getType(); @@ -1118,6 +1147,13 @@ } #endif +void VPWidenEVLRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "EMIT "; + getEVL()->printAsOperand(O, SlotTracker); + O << " = GENERATE-EXPLICIT-VECTOR-LENGTH"; +} + template void DomTreeBuilder::Calculate(VPDominatorTree &DT); void VPValue::replaceAllUsesWith(VPValue *New) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -51,6 +51,7 @@ friend class VPSlotTracker; friend class VPRecipeBase; friend class VPWidenMemoryInstructionRecipe; + friend class VPPredicatedWidenMemoryInstructionRecipe; const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast). 
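The VPWidenEVLRecipe added above stores one EVL value per part by calling InnerLoopVectorizer::createEVL(). When the target does not provide a custom active-vector-length intrinsic, that computes min(VF, TripCount - Induction); for a fixed VF of 4 this amounts to the following sketch (names illustrative, matching the force-active-vector-length-support test below):

  %remaining = sub i64 %wide.trip.count, %index
  %evl.umin  = call i64 @llvm.umin.i64(i64 4, i64 %remaining)
  %evl       = trunc i64 %evl.umin to i32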
@@ -94,10 +95,13 @@ VPVInstructionSC, VPVMemoryInstructionSC, VPVPredInstPHI, + VPVPredicatedMemoryInstructionSC, + VPVPredicatedWidenSC, VPVReductionSC, VPVReplicateSC, VPVWidenSC, VPVWidenCallSC, + VPVWidenEVLSC, VPVWidenGEPSC, VPVWidenIntOrFpIndcutionSC, VPVWidenPHISC, @@ -303,10 +307,13 @@ VPInstructionSC, VPInterleaveSC, VPPredInstPHISC, + VPPredicatedWidenMemoryInstructionSC, + VPPredicatedWidenSC, VPReductionSC, VPReplicateSC, VPWidenCallSC, VPWidenCanonicalIVSC, + VPWidenEVLSC, VPWidenGEPSC, VPWidenIntOrFpInductionSC, VPWidenMemoryInstructionSC, diff --git a/llvm/test/Transforms/LoopVectorize/vectorize-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/vectorize-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vectorize-vp-intrinsics.ll @@ -0,0 +1,292 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=without-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=WITHOUT-AVL %s + +; RUN: opt -loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=IF-AVL %s + +; RUN: opt -loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=FORCE-AVL %s + +; RUN: opt -loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -S %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) local_unnamed_addr { +; WITHOUT-AVL-LABEL: @foo( +; WITHOUT-AVL-NEXT: entry: +; WITHOUT-AVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; WITHOUT-AVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; WITHOUT-AVL: for.body.preheader: +; WITHOUT-AVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; WITHOUT-AVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; WITHOUT-AVL: vector.ph: +; WITHOUT-AVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 3 +; WITHOUT-AVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; WITHOUT-AVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; WITHOUT-AVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; WITHOUT-AVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 +; WITHOUT-AVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; WITHOUT-AVL-NEXT: br label [[VECTOR_BODY:%.*]] +; WITHOUT-AVL: vector.body: +; WITHOUT-AVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; WITHOUT-AVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] 
= insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 +; WITHOUT-AVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; WITHOUT-AVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; WITHOUT-AVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; WITHOUT-AVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]] +; WITHOUT-AVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; WITHOUT-AVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 +; WITHOUT-AVL-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; WITHOUT-AVL-NEXT: [[VP_OP_LOAD:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* [[TMP4]], i32 4, <4 x i1> [[TMP1]], i32 4) +; WITHOUT-AVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] +; WITHOUT-AVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; WITHOUT-AVL-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; WITHOUT-AVL-NEXT: [[VP_OP_LOAD3:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* [[TMP7]], i32 4, <4 x i1> [[TMP1]], i32 4) +; WITHOUT-AVL-NEXT: [[VP_OP:%.*]] = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> [[VP_OP_LOAD3]], <4 x i32> [[VP_OP_LOAD]], <4 x i1> [[TMP1]], i32 4) +; WITHOUT-AVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; WITHOUT-AVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0 +; WITHOUT-AVL-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>* +; WITHOUT-AVL-NEXT: call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> [[VP_OP]], <4 x i32>* [[TMP10]], i32 4, <4 x i1> [[TMP1]], i32 4) +; WITHOUT-AVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; WITHOUT-AVL-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; WITHOUT-AVL-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; WITHOUT-AVL: middle.block: +; WITHOUT-AVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; WITHOUT-AVL: scalar.ph: +; WITHOUT-AVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; WITHOUT-AVL-NEXT: br label [[FOR_BODY:%.*]] +; WITHOUT-AVL: for.cond.cleanup.loopexit: +; WITHOUT-AVL-NEXT: br label [[FOR_COND_CLEANUP]] +; WITHOUT-AVL: for.cond.cleanup: +; WITHOUT-AVL-NEXT: ret void +; WITHOUT-AVL: for.body: +; WITHOUT-AVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; WITHOUT-AVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; WITHOUT-AVL-NEXT: [[TMP12:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; WITHOUT-AVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] +; WITHOUT-AVL-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; WITHOUT-AVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], [[TMP12]] +; WITHOUT-AVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; WITHOUT-AVL-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; WITHOUT-AVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; WITHOUT-AVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; WITHOUT-AVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; +; IF-AVL-LABEL: @foo( +; IF-AVL-NEXT: 
entry: +; IF-AVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; IF-AVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; IF-AVL: for.body.preheader: +; IF-AVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; IF-AVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; IF-AVL: vector.ph: +; IF-AVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 3 +; IF-AVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; IF-AVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; IF-AVL-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; IF-AVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 +; IF-AVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; IF-AVL-NEXT: br label [[VECTOR_BODY:%.*]] +; IF-AVL: vector.body: +; IF-AVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; IF-AVL-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 +; IF-AVL-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; IF-AVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; IF-AVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; IF-AVL-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]] +; IF-AVL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; IF-AVL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 +; IF-AVL-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; IF-AVL-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP4]], i32 4, <4 x i1> [[TMP1]], <4 x i32> poison) +; IF-AVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] +; IF-AVL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; IF-AVL-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; IF-AVL-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP7]], i32 4, <4 x i1> [[TMP1]], <4 x i32> poison) +; IF-AVL-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; IF-AVL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; IF-AVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 +; IF-AVL-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; IF-AVL-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], i32 4, <4 x i1> [[TMP1]]) +; IF-AVL-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; IF-AVL-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; IF-AVL-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; IF-AVL: middle.block: +; IF-AVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; IF-AVL: scalar.ph: +; IF-AVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; IF-AVL-NEXT: br label [[FOR_BODY:%.*]] +; IF-AVL: for.cond.cleanup.loopexit: +; IF-AVL-NEXT: br label [[FOR_COND_CLEANUP]] +; IF-AVL: for.cond.cleanup: +; IF-AVL-NEXT: ret void +; IF-AVL: for.body: +; IF-AVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; 
IF-AVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; IF-AVL-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; IF-AVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] +; IF-AVL-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; IF-AVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] +; IF-AVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; IF-AVL-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; IF-AVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; IF-AVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; IF-AVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; +; FORCE-AVL-LABEL: @foo( +; FORCE-AVL-NEXT: entry: +; FORCE-AVL-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; FORCE-AVL-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; FORCE-AVL: for.body.preheader: +; FORCE-AVL-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; FORCE-AVL-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FORCE-AVL: vector.ph: +; FORCE-AVL-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 3 +; FORCE-AVL-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; FORCE-AVL-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; FORCE-AVL-NEXT: br label [[VECTOR_BODY:%.*]] +; FORCE-AVL: vector.body: +; FORCE-AVL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FORCE-AVL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 +; FORCE-AVL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; FORCE-AVL-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], +; FORCE-AVL-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; FORCE-AVL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; FORCE-AVL-NEXT: [[TMP2:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[INDEX]] +; FORCE-AVL-NEXT: [[TMP3:%.*]] = call i64 @llvm.umin.i64(i64 4, i64 [[TMP2]]) +; FORCE-AVL-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; FORCE-AVL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0 +; FORCE-AVL-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; FORCE-AVL-NEXT: [[VP_OP_LOAD:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* [[TMP6]], i32 4, <4 x i1> , i32 [[TMP4]]) +; FORCE-AVL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] +; FORCE-AVL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0 +; FORCE-AVL-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>* +; FORCE-AVL-NEXT: [[VP_OP_LOAD1:%.*]] = call <4 x i32> @llvm.vp.load.v4i32.p0v4i32(<4 x i32>* [[TMP9]], i32 4, <4 x i1> , i32 [[TMP4]]) +; FORCE-AVL-NEXT: [[VP_OP:%.*]] = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> [[VP_OP_LOAD1]], <4 x i32> [[VP_OP_LOAD]], <4 x i1> , i32 [[TMP4]]) +; FORCE-AVL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; FORCE-AVL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i32 0 +; FORCE-AVL-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>* +; FORCE-AVL-NEXT: call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> [[VP_OP]], <4 x i32>* [[TMP12]], i32 4, <4 x i1> , i32 [[TMP4]]) +; FORCE-AVL-NEXT: [[INDEX_NEXT]] = 
add i64 [[INDEX]], 4 +; FORCE-AVL-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; FORCE-AVL-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FORCE-AVL: middle.block: +; FORCE-AVL-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; FORCE-AVL: scalar.ph: +; FORCE-AVL-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; FORCE-AVL-NEXT: br label [[FOR_BODY:%.*]] +; FORCE-AVL: for.cond.cleanup.loopexit: +; FORCE-AVL-NEXT: br label [[FOR_COND_CLEANUP]] +; FORCE-AVL: for.cond.cleanup: +; FORCE-AVL-NEXT: ret void +; FORCE-AVL: for.body: +; FORCE-AVL-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; FORCE-AVL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; FORCE-AVL-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; FORCE-AVL-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] +; FORCE-AVL-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; FORCE-AVL-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP15]], [[TMP14]] +; FORCE-AVL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; FORCE-AVL-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; FORCE-AVL-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; FORCE-AVL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; FORCE-AVL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; +; NO-VP-LABEL: @foo( +; NO-VP-NEXT: entry: +; NO-VP-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; NO-VP-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; NO-VP: for.body.preheader: +; NO-VP-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; NO-VP-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; NO-VP: vector.ph: +; NO-VP-NEXT: [[N_RND_UP:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 3 +; NO-VP-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 4 +; NO-VP-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; NO-VP-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 +; NO-VP-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; NO-VP-NEXT: br label [[VECTOR_BODY:%.*]] +; NO-VP: vector.body: +; NO-VP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; NO-VP-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i32 0 +; NO-VP-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT1]], <4 x i64> poison, <4 x i32> zeroinitializer +; NO-VP-NEXT: [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT2]], +; NO-VP-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; NO-VP-NEXT: [[TMP1:%.*]] = icmp ule <4 x i64> [[INDUCTION]], [[BROADCAST_SPLAT]] +; NO-VP-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP2]], i32 0 +; NO-VP-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>* +; NO-VP-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP4]], i32 4, 
<4 x i1> [[TMP1]], <4 x i32> poison) +; NO-VP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0 +; NO-VP-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>* +; NO-VP-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP7]], i32 4, <4 x i1> [[TMP1]], <4 x i32> poison) +; NO-VP-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD3]], [[WIDE_MASKED_LOAD]] +; NO-VP-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]] +; NO-VP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0 +; NO-VP-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>* +; NO-VP-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], i32 4, <4 x i1> [[TMP1]]) +; NO-VP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 +; NO-VP-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NO-VP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NO-VP: middle.block: +; NO-VP-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; NO-VP: scalar.ph: +; NO-VP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; NO-VP-NEXT: br label [[FOR_BODY:%.*]] +; NO-VP: for.cond.cleanup.loopexit: +; NO-VP-NEXT: br label [[FOR_COND_CLEANUP]] +; NO-VP: for.cond.cleanup: +; NO-VP-NEXT: ret void +; NO-VP: for.body: +; NO-VP-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; NO-VP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; NO-VP-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4 +; NO-VP-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] +; NO-VP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]] +; NO-VP-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4 +; NO-VP-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; NO-VP-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; NO-VP-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +} 
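In the runs above, the FORCE-AVL configuration computes the EVL with the generic umin sequence, while the WITHOUT-AVL configuration falls back to EVL = VF and lets the compare-based mask do the predication. A target whose TTI returns true from useCustomActiveVectorLengthIntrinsic() would instead get a call to the new llvm.experimental.set.vector.length intrinsic from createEVL(); no in-tree target enables that hook in this patch, so the following is only a sketch of the expected shape, assuming a widest loop type of 32 bits:

  %remaining = sub i64 %wide.trip.count, %index
  %evl = call i32 @llvm.experimental.set.vector.length.i64(i64 %remaining, i32 32, i32 1)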
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/vplan-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vplan-vp-intrinsics.ll @@ -0,0 +1,112 @@ +; REQUIRES: asserts + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=without-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=WITHOUT-AVL %s + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=IF-AVL %s + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=FORCE-AVL %s + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32 %N) local_unnamed_addr { +; WITHOUT-AVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; WITHOUT-AVL-NEXT: for.body: +; WITHOUT-AVL-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; WITHOUT-AVL-NEXT: EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0> +; WITHOUT-AVL-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; WITHOUT-AVL-NEXT: EMIT vp<%4> = GENERATE-EXPLICIT-VECTOR-LENGTH +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN ir<%0> = load ir<%arrayidx>, vp<%2>, vp<%4> +; WITHOUT-AVL-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN ir<%1> = load ir<%arrayidx2>, vp<%2>, vp<%4> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN ir<%add> = add ir<%1>, ir<%0>, vp<%2>, vp<%4> +; WITHOUT-AVL-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN store ir<%arrayidx4>, ir<%add>, vp<%2>, vp<%4> +; WITHOUT-AVL-NEXT: No successors +; WITHOUT-AVL-NEXT: } + +; IF-AVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; IF-AVL-NEXT: for.body: +; IF-AVL-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; IF-AVL-NEXT: EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0> +; IF-AVL-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; IF-AVL-NEXT: WIDEN ir<%0> = load ir<%arrayidx>, vp<%2> +; IF-AVL-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; IF-AVL-NEXT: WIDEN ir<%1> = load ir<%arrayidx2>, vp<%2> +; IF-AVL-NEXT: WIDEN ir<%add> = add ir<%1>, ir<%0> +; IF-AVL-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; IF-AVL-NEXT: WIDEN store ir<%arrayidx4>, ir<%add>, vp<%2> +; IF-AVL-NEXT: No successors +; IF-AVL-NEXT: } + +; FORCE-AVL: VPlan 'Initial 
VPlan for VF={4},UF>=1' { +; FORCE-AVL-NEXT: for.body: +; FORCE-AVL-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; FORCE-AVL-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; FORCE-AVL-NEXT: EMIT vp<%2> = all true mask +; FORCE-AVL-NEXT: EMIT vp<%3> = GENERATE-EXPLICIT-VECTOR-LENGTH +; FORCE-AVL-NEXT: PREDICATED-WIDEN ir<%0> = load ir<%arrayidx>, vp<%2>, vp<%3> +; FORCE-AVL-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; FORCE-AVL-NEXT: PREDICATED-WIDEN ir<%1> = load ir<%arrayidx2>, vp<%2>, vp<%3> +; FORCE-AVL-NEXT: PREDICATED-WIDEN ir<%add> = add ir<%1>, ir<%0>, vp<%2>, vp<%3> +; FORCE-AVL-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; FORCE-AVL-NEXT: PREDICATED-WIDEN store ir<%arrayidx4>, ir<%add>, vp<%2>, vp<%3> +; FORCE-AVL-NEXT: No successors +; FORCE-AVL-NEXT: } + +; NO-VP: VPlan 'Initial VPlan for VF={4},UF>=1' { +; NO-VP-NEXT: for.body: +; NO-VP-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; NO-VP-NEXT: EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0> +; NO-VP-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; NO-VP-NEXT: WIDEN ir<%0> = load ir<%arrayidx>, vp<%2> +; NO-VP-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; NO-VP-NEXT: WIDEN ir<%1> = load ir<%arrayidx2>, vp<%2> +; NO-VP-NEXT: WIDEN ir<%add> = add ir<%1>, ir<%0> +; NO-VP-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; NO-VP-NEXT: WIDEN store ir<%arrayidx4>, ir<%add>, vp<%2> +; NO-VP-NEXT: No successors +; NO-VP-NEXT: } + +entry: + %cmp10 = icmp sgt i32 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %N to i64 + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +}
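The VPlan dumps above show the masking difference directly: with without-active-vector-length-support the header mask remains the icmp ule of the widened induction against the backedge-taken count, while with force-active-vector-length-support createBlockInMask() replaces it with the new AllTrueMask VPInstruction because the EVL already limits the active lanes. In the generated IR (a sketch for VF=4, names illustrative) this corresponds to:

  ; without-active-vector-length-support: compare-based header mask
  %mask = icmp ule <4 x i64> %induction, %btc.splat
  ; force-active-vector-length-support: constant all-true mask, EVL limits the lanes
  call void @llvm.vp.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %ptr, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %evl)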