diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -162,7 +162,7 @@
   bool skipScalarizationCost() const { return ScalarizationCost.isValid(); }
 };
 
-enum class PredicationStyle { None, Data, DataAndControlFlow };
+enum class PredicationStyle { None, Data, DataAndControlFlow, ImplicitData };
 
 class TargetTransformInfo;
 typedef TargetTransformInfo TTI;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -72,7 +72,7 @@
   bool supportsScalableVectors() const { return ST->hasVInstructions(); }
   bool enableScalableVectorization() const { return ST->hasVInstructions(); }
   PredicationStyle emitGetActiveLaneMask() const {
-    return ST->hasVInstructions() ? PredicationStyle::Data
+    return ST->hasVInstructions() ? PredicationStyle::ImplicitData
                                   : PredicationStyle::None;
   }
   Optional<unsigned> getMaxVScale() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -140,6 +140,7 @@
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Utils/SizeOpts.h"
 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -211,6 +212,7 @@
 enum Option {
   ScalarEpilogue = 0,
   PredicateElseScalarEpilogue,
+  VlElseScalarEpilogue,
   PredicateOrDontVectorize
 };
 } // namespace PreferPredicateTy
@@ -228,6 +230,10 @@
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
+              clEnumValN(PreferPredicateTy::VlElseScalarEpilogue,
+                         "vlen-else-scalar-epilogue",
+                         "prefer tail-folding via dynamic VL (RISC-V RVV only), "
+                         "create scalar epilogue if tail folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefers tail-folding, don't attempt vectorization if "
@@ -1122,6 +1128,8 @@
   // Loop hint predicate indicating an epilogue is undesired.
   CM_ScalarEpilogueNotNeededUsePredicate,
 
+  CM_ScalarEpilogueNotNeededUseVl,
+
   // Directive indicating we must either tail fold or not vectorize
   CM_ScalarEpilogueNotAllowedUsePredicate
 };
@@ -1524,6 +1532,8 @@
   /// Returns true if all loop blocks should be masked to fold tail loop.
   bool foldTailByMasking() const { return FoldTailByMasking; }
 
+  bool foldTailByImplicitMasking() const { return FoldTailByImplicitMasking; }
+
   /// Returns true if were tail-folding and want to use the active lane mask
   /// for vector loop control flow.
   bool useActiveLaneMaskForControlFlow() const {
@@ -1700,6 +1710,9 @@
   /// All blocks of loop are to be masked to fold tail of scalar iterations.
   bool FoldTailByMasking = false;
 
+  /// All blocks of the loop are masked to fold the tail by dynamic VL.
+  bool FoldTailByImplicitMasking = false;
+
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
@@ -5061,6 +5074,11 @@
                       << "LV: Not allowing scalar epilogue, creating predicated "
                       << "vector loop.\n");
     break;
+  case CM_ScalarEpilogueNotNeededUseVl:
+    LLVM_DEBUG(dbgs() << "LV: vector VL hint/switch found.\n"
+                      << "LV: Not allowing scalar epilogue, creating VL-based "
+                      << "vector loop.\n");
+    break;
   case CM_ScalarEpilogueNotAllowedLowTripLoop:
     // fallthrough as a special case of OptForSize
   case CM_ScalarEpilogueNotAllowedOptSize:
@@ -5086,8 +5104,8 @@
   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
     // If there was a tail-folding hint/switch, but we can't fold the tail by
    // masking, fallback to a vectorization with a scalar epilogue.
-    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
-      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
+    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate || ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUseVl) {
+      LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking or VL: vectorize with a "
                         "scalar epilogue instead.\n");
       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
       return computeFeasibleMaxVF(TC, UserVF, false);
@@ -5138,12 +5156,15 @@
   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
   if (Legal->prepareToFoldTailByMasking()) {
     FoldTailByMasking = true;
+    if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUseVl)
+      FoldTailByImplicitMasking = true;
     return MaxFactors;
   }
 
   // If there was a tail-folding hint/switch, but we can't fold the tail by
   // masking, fallback to a vectorization with a scalar epilogue.
-  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate ||
+      ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUseVl) {
     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
                       "scalar epilogue instead.\n");
     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
@@ -7613,7 +7634,10 @@
 
   // 1. Set up the skeleton for vectorization, including vector pre-header and
   // middle block. The vector loop is created during VPlan execution.
-  VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
+  unsigned WidestType;
+  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
+  VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan,
+                         WidestType};
   Value *CanonicalIVStartValue;
   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
       ILV.createVectorizedLoopSkeleton();
@@ -8091,8 +8115,20 @@
     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
     if (EmitGetActiveLaneMask != PredicationStyle::None) {
       VPValue *TC = Plan->getOrCreateTripCount();
-      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
+      if (EmitGetActiveLaneMask == PredicationStyle::Data) {
+        assert(!CM.foldTailByImplicitMasking() &&
+               "target does not support dynamic VL");
+        BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                         {IV, TC}, nullptr, "active.lane.mask");
+      } else {
+        // RISC-V vsetvli predicates implicitly (PredicationStyle::ImplicitData).
+        if (CM.foldTailByImplicitMasking())
+          BlockMask = Builder.createNaryOp(VPInstruction::GetDynamicVl,
+                                           {IV, TC}, nullptr, "get.dynamic.vl");
+        else
+          BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                           {IV, TC}, nullptr, "active.lane.mask");
+      }
     } else {
       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
@@ -9746,6 +9782,7 @@
         Value *StoredVal = State.get(StoredValue, Part);
         if (CreateGatherScatter) {
           Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+          assert((MaskPart == nullptr || !MaskPart->getType()->isIntegerTy()) && "-prefer-predicate-over-epilogue=vlen-else-scalar-epilogue does not support gather/scatter; use -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue instead");
           Value *VectorGep = State.get(getAddr(), Part);
           NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                               MaskPart);
@@ -9759,9 +9796,15 @@
         }
         auto *VecPtr =
             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
-        if (isMaskRequired)
-          NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
-                                            BlockInMaskParts[Part]);
+        if (isMaskRequired) {
+          Value *Mask = BlockInMaskParts[Part];
+          Type *MaskTy = Mask->getType();
+          if (MaskTy->isIntegerTy()) // An integer mask carries the dynamic VL.
+            NewSI = Builder.CreateIntrinsic(Intrinsic::riscv_vse,
+                {StoredVal->getType(), MaskTy}, {StoredVal, VecPtr, Mask}, nullptr);
+          else
+            NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask);
+        }
         else
           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
       }
@@ -9777,6 +9820,7 @@
     Value *NewLI;
     if (CreateGatherScatter) {
       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+      assert((MaskPart == nullptr || !MaskPart->getType()->isIntegerTy()) && "-prefer-predicate-over-epilogue=vlen-else-scalar-epilogue does not support gather/scatter; use -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue instead");
      Value *VectorGep = State.get(getAddr(), Part);
       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
                                          nullptr, "wide.masked.gather");
@@ -9784,10 +9828,18 @@
     } else {
       auto *VecPtr =
           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
-      if (isMaskRequired)
-        NewLI = Builder.CreateMaskedLoad(
-            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
-            PoisonValue::get(DataTy), "wide.masked.load");
+      if (isMaskRequired) {
+        Value *Mask = BlockInMaskParts[Part];
+        Type *MaskTy = Mask->getType();
+        if (MaskTy->isIntegerTy()) // An integer mask carries the dynamic VL.
+          NewLI = Builder.CreateIntrinsic(
+              Intrinsic::riscv_vle, {DataTy, MaskTy},
+              {UndefValue::get(DataTy), VecPtr, Mask}, nullptr);
+        else
+          NewLI = Builder.CreateMaskedLoad(
+              DataTy, VecPtr, Alignment, Mask,
+              PoisonValue::get(DataTy), "wide.masked.load");
+      }
       else
         NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment,
                                           "wide.load");
@@ -9829,6 +9881,8 @@
   switch (PreferPredicateOverEpilogue) {
   case PreferPredicateTy::ScalarEpilogue:
     return CM_ScalarEpilogueAllowed;
+  case PreferPredicateTy::VlElseScalarEpilogue:
+    return CM_ScalarEpilogueNotNeededUseVl;
   case PreferPredicateTy::PredicateElseScalarEpilogue:
     return CM_ScalarEpilogueNotNeededUsePredicate;
   case PreferPredicateTy::PredicateOrDontVectorize:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -205,13 +205,14 @@
 struct VPTransformState {
   VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
                    DominatorTree *DT, IRBuilderBase &Builder,
-                   InnerLoopVectorizer *ILV, VPlan *Plan)
-      : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
+                   InnerLoopVectorizer *ILV, VPlan *Plan, unsigned WidestTy)
+      : VF(VF), UF(UF), WidestTy(WidestTy), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
         LVer(nullptr) {}
 
   /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
   ElementCount VF;
   unsigned UF;
+  unsigned WidestTy; // widest scalar type width (bits) for vsetvli SEW/LMUL
 
   /// Hold the indices to generate specific scalar instructions. Null indicates
   /// that all instances are to be generated, using either scalar or vector
@@ -786,6 +787,7 @@
     SLPLoad,
     SLPStore,
     ActiveLaneMask,
+    GetDynamicVl,
     CanonicalIVIncrement,
     CanonicalIVIncrementNUW,
     // The next two are similar to the above, but instead increment the
@@ -907,6 +909,7 @@
     default:
       return false;
     case VPInstruction::ActiveLaneMask:
+    case VPInstruction::GetDynamicVl:
    case VPInstruction::CanonicalIVIncrement:
     case VPInstruction::CanonicalIVIncrementNUW:
     case VPInstruction::CanonicalIVIncrementForPart:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -17,14 +17,17 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
@@ -222,6 +225,21 @@
     State.set(this, V, Part);
     break;
   }
+  case VPInstruction::GetDynamicVl: {
+    // First lane of the canonical IV = number of elements already processed.
+    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
+    // AVL = original trip count minus elements already processed.
+    Value *ScalarTC = State.get(getOperand(1), Part);
+    ScalarTC = Builder.CreateIntCast(ScalarTC, VIVElem0->getType(), true);
+    Value *AVL = Builder.CreateSub(ScalarTC, VIVElem0, "avl_length");
+    auto *IntTy = VIVElem0->getType();
+    Instruction *Call = Builder.CreateIntrinsic(
+        Intrinsic::riscv_vsetvli, IntTy,
+        {AVL, ConstantInt::get(IntTy, Log2_32(State.WidestTy / 8)),
+         ConstantInt::get(IntTy, Log2_32(State.VF.getKnownMinValue() * State.WidestTy / 64))}, nullptr, Name);
+    State.set(this, Call, Part);
+    break;
+  }
   case VPInstruction::ActiveLaneMask: {
     // Get first lane of vector induction variable.
     Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
@@ -385,6 +403,9 @@
   case VPInstruction::ActiveLaneMask:
     O << "active lane mask";
     break;
+  case VPInstruction::GetDynamicVl:
+    O << "get dynamic vl";
+    break;
   case VPInstruction::FirstOrderRecurrenceSplice:
     O << "first-order splice";
     break;