diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -162,7 +162,7 @@
   bool skipScalarizationCost() const { return ScalarizationCost.isValid(); }
 };
 
-enum class PredicationStyle { None, Data, DataAndControlFlow };
+enum class PredicationStyle { None, Data, DataAndControlFlow, ImplicitData };
 
 class TargetTransformInfo;
 typedef TargetTransformInfo TTI;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -72,7 +72,7 @@
   bool supportsScalableVectors() const { return ST->hasVInstructions(); }
   bool enableScalableVectorization() const { return ST->hasVInstructions(); }
   PredicationStyle emitGetActiveLaneMask() const {
-    return ST->hasVInstructions() ? PredicationStyle::Data
+    return ST->hasVInstructions() ? PredicationStyle::ImplicitData
                                   : PredicationStyle::None;
   }
   Optional<unsigned> getMaxVScale() const;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -112,6 +112,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
@@ -208,30 +209,35 @@
 // and predicate the instructions accordingly. If tail-folding fails, there are
 // different fallback strategies depending on these values:
 namespace PreferPredicateTy {
-  enum Option {
-    ScalarEpilogue = 0,
-    PredicateElseScalarEpilogue,
-    PredicateOrDontVectorize
-  };
+enum Option {
+  ScalarEpilogue = 0,
+  PredicateElseScalarEpilogue,
+  VlElseScalarEpilogue,
+  PredicateOrDontVectorize
+};
 } // namespace PreferPredicateTy
 
 static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
     "prefer-predicate-over-epilogue",
-    cl::init(PreferPredicateTy::ScalarEpilogue),
-    cl::Hidden,
+    cl::init(PreferPredicateTy::ScalarEpilogue), cl::Hidden,
     cl::desc("Tail-folding and predication preferences over creating a scalar "
              "epilogue loop."),
-    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
-                          "scalar-epilogue",
-                          "Don't tail-predicate loops, create scalar epilogue"),
-               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
-                          "predicate-else-scalar-epilogue",
-                          "prefer tail-folding, create scalar epilogue if tail "
-                          "folding fails."),
-               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
-                          "predicate-dont-vectorize",
-                          "prefers tail-folding, don't attempt vectorization if "
-                          "tail-folding fails.")));
+    cl::values(
+        clEnumValN(PreferPredicateTy::ScalarEpilogue, "scalar-epilogue",
+                   "Don't tail-predicate loops, create scalar epilogue"),
+        clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
+                   "predicate-else-scalar-epilogue",
+                   "prefer tail-folding, create scalar epilogue if tail "
+                   "folding fails."),
+        clEnumValN(PreferPredicateTy::VlElseScalarEpilogue,
+                   "vlen-else-scalar-epilogue",
+                   "prefer tail-folding using the dynamic vector length "
" + "Only works for rvv, create scalar epilogue if tail " + "folding fails."), + clEnumValN(PreferPredicateTy::PredicateOrDontVectorize, + "predicate-dont-vectorize", + "prefers tail-folding, don't attempt vectorization if " + "tail-folding fails."))); static cl::opt MaximizeBandwidth( "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden, @@ -1122,6 +1128,8 @@ // Loop hint predicate indicating an epilogue is undesired. CM_ScalarEpilogueNotNeededUsePredicate, + CM_ScalarEpilogueNotNeededUseVl, + // Directive indicating we must either tail fold or not vectorize CM_ScalarEpilogueNotAllowedUsePredicate }; @@ -1524,6 +1532,8 @@ /// Returns true if all loop blocks should be masked to fold tail loop. bool foldTailByMasking() const { return FoldTailByMasking; } + bool foldTailByImplictMasking() const { return FoldTailByImplictMasking; } + /// Returns true if were tail-folding and want to use the active lane mask /// for vector loop control flow. bool useActiveLaneMaskForControlFlow() const { @@ -1700,6 +1710,10 @@ /// All blocks of loop are to be masked to fold tail of scalar iterations. bool FoldTailByMasking = false; + /// All blocks of loop are to be masked to fold tail of scalar iterations by + /// vl. + bool FoldTailByImplictMasking = false; + /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated @@ -5061,6 +5075,11 @@ << "LV: Not allowing scalar epilogue, creating predicated " << "vector loop.\n"); break; + case CM_ScalarEpilogueNotNeededUseVl: + LLVM_DEBUG(errs() << "LV: vector Vl hint/switch found.\n" + << "LV: Not allowing scalar epilogue, creating vlenset " + << "vector loop.\n";); + break; case CM_ScalarEpilogueNotAllowedLowTripLoop: // fallthrough as a special case of OptForSize case CM_ScalarEpilogueNotAllowedOptSize: @@ -5086,9 +5105,11 @@ if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { // If there was a tail-folding hint/switch, but we can't fold the tail by // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " - "scalar epilogue instead.\n"); + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate || + ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUseVl) { + LLVM_DEBUG( + dbgs() << "LV: Cannot fold tail by masking or vl: vectorize with a " + "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; return computeFeasibleMaxVF(TC, UserVF, false); } @@ -5138,12 +5159,15 @@ // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { FoldTailByMasking = true; + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUseVl) + FoldTailByImplictMasking = true; return MaxFactors; } // If there was a tail-folding hint/switch, but we can't fold the tail by // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate || + ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUseVl) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " "scalar epilogue instead.\n"); ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; @@ -7613,7 +7637,11 @@ // 1. 
   // 1. Set up the skeleton for vectorization, including vector pre-header and
   // middle block. The vector loop is created during VPlan execution.
-  VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
+  unsigned WidestType;
+
+  std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
+  VPTransformState State{BestVF, BestUF, LI, DT,
+                         ILV.Builder, &ILV, &BestVPlan, WidestType};
   Value *CanonicalIVStartValue;
   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
       ILV.createVectorizedLoopSkeleton();
@@ -8091,8 +8119,22 @@
   Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
   if (EmitGetActiveLaneMask != PredicationStyle::None) {
     VPValue *TC = Plan->getOrCreateTripCount();
-    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
-                                     nullptr, "active.lane.mask");
+    if (EmitGetActiveLaneMask == PredicationStyle::Data) {
+      assert(!CM.foldTailByImplicitMasking() &&
+             "target does not support a dynamic VL");
+      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                       {IV, TC}, nullptr, "active.lane.mask");
+    }
+    // RISC-V's vsetvli predicates implicitly (PredicationStyle::ImplicitData).
+    else {
+      if (CM.foldTailByImplicitMasking())
+        BlockMask = Builder.createNaryOp(VPInstruction::GetDynamicVl,
+                                         {IV, TC}, nullptr, "get.dynamic.vl");
+      else
+        BlockMask =
+            Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
+                                 nullptr, "active.lane.mask");
+    }
   } else {
     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
@@ -9746,6 +9788,11 @@
       Value *StoredVal = State.get(StoredValue, Part);
       if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+        assert(
+            (MaskPart == nullptr || !MaskPart->getType()->isIntegerTy()) &&
+            "-prefer-predicate-over-epilogue=vlen-else-scalar-epilogue does "
+            "not support gathers/scatters; use "
+            "-prefer-predicate-over-epilogue=predicate-else-scalar-epilogue");
        Value *VectorGep = State.get(getAddr(), Part);
        NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
                                            MaskPart);
@@ -9759,10 +9806,17 @@
       }
       auto *VecPtr =
           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
-      if (isMaskRequired)
-        NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
-                                          BlockInMaskParts[Part]);
-      else
+      if (isMaskRequired) {
+        Value *Mask = BlockInMaskParts[Part];
+        Type *MaskTy = Mask->getType();
+        if (MaskTy->isIntegerTy())
+          NewSI = Builder.CreateIntrinsic(Intrinsic::riscv_vse,
+                                          {StoredVal->getType(), MaskTy},
+                                          {StoredVal, VecPtr, Mask}, nullptr);
+        else
+          NewSI =
+              Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment, Mask);
+      } else
         NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
     }
     State.addMetadata(NewSI, SI);
@@ -9777,6 +9831,10 @@
       Value *NewLI;
      if (CreateGatherScatter) {
        Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+        assert((MaskPart == nullptr || !MaskPart->getType()->isIntegerTy()) &&
+               "-prefer-predicate-over-epilogue=vlen-else-scalar-epilogue "
+               "does not support gathers/scatters; use "
+               "-prefer-predicate-over-epilogue=predicate-else-scalar-epilogue");
        Value *VectorGep = State.get(getAddr(), Part);
        NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
                                           nullptr, "wide.masked.gather");
@@ -9784,11 +9842,18 @@
     } else {
       auto *VecPtr =
           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
-      if (isMaskRequired)
-        NewLI = Builder.CreateMaskedLoad(
-            DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
-            PoisonValue::get(DataTy), "wide.masked.load");
-      else
+      if (isMaskRequired) {
+        Value *Mask = BlockInMaskParts[Part];
+        Type *MaskTy = Mask->getType();
+        if (MaskTy->isIntegerTy())
+          NewLI = Builder.CreateIntrinsic(
+              Intrinsic::riscv_vle, {DataTy, MaskTy},
+              {UndefValue::get(DataTy), VecPtr, Mask}, nullptr);
+        else
+          NewLI = Builder.CreateMaskedLoad(
+              DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
+              PoisonValue::get(DataTy), "wide.masked.load");
+      } else
        NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment,
                                          "wide.load");
 
@@ -9829,6 +9894,8 @@
   switch (PreferPredicateOverEpilogue) {
   case PreferPredicateTy::ScalarEpilogue:
     return CM_ScalarEpilogueAllowed;
+  case PreferPredicateTy::VlElseScalarEpilogue:
+    return CM_ScalarEpilogueNotNeededUseVl;
   case PreferPredicateTy::PredicateElseScalarEpilogue:
     return CM_ScalarEpilogueNotNeededUsePredicate;
   case PreferPredicateTy::PredicateOrDontVectorize:
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -205,13 +205,14 @@
 struct VPTransformState {
   VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
                    DominatorTree *DT, IRBuilderBase &Builder,
-                   InnerLoopVectorizer *ILV, VPlan *Plan)
-      : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
-        LVer(nullptr) {}
+                   InnerLoopVectorizer *ILV, VPlan *Plan, unsigned WidestTy)
+      : VF(VF), UF(UF), WidestTy(WidestTy), LI(LI), DT(DT), Builder(Builder),
+        ILV(ILV), Plan(Plan), LVer(nullptr) {}
 
   /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
   ElementCount VF;
   unsigned UF;
+  unsigned WidestTy; // Width in bits of the widest loop type (for SEW/LMUL).
 
   /// Hold the indices to generate specific scalar instructions. Null indicates
   /// that all instances are to be generated, using either scalar or vector
@@ -786,6 +787,7 @@
     SLPLoad,
     SLPStore,
     ActiveLaneMask,
+    GetDynamicVl,
     CanonicalIVIncrement,
     CanonicalIVIncrementNUW,
     // The next two are similar to the above, but instead increment the
@@ -907,6 +909,7 @@
     default:
       return false;
     case VPInstruction::ActiveLaneMask:
+    case VPInstruction::GetDynamicVl:
     case VPInstruction::CanonicalIVIncrement:
     case VPInstruction::CanonicalIVIncrementNUW:
    case VPInstruction::CanonicalIVIncrementForPart:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -17,14 +17,17 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
@@ -222,6 +225,24 @@
     State.set(this, V, Part);
     break;
   }
+  case VPInstruction::GetDynamicVl: {
+    // Get the first lane of the vector induction variable.
+    Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
+    // Get the original loop trip count.
+    Value *ScalarTC = State.get(getOperand(1), Part);
+    ScalarTC = Builder.CreateIntCast(ScalarTC, VIVElem0->getType(), true);
+    Value *AvLen = Builder.CreateSub(ScalarTC, VIVElem0, "avl_length");
+    auto *IntTy = VIVElem0->getType();
+    // vsetvli operands: AVL, SEW encoded as log2(bytes), LMUL encoded as log2.
+    Instruction *Call = Builder.CreateIntrinsic(
+        Intrinsic::riscv_vsetvli, IntTy,
+        {AvLen, ConstantInt::get(IntTy, Log2_32(State.WidestTy / 8)),
+         ConstantInt::get(IntTy, Log2_32(State.VF.getKnownMinValue() *
+                                         State.WidestTy / 64))},
+        nullptr, Name);
+    State.set(this, Call, Part);
+    break;
+  }
   case VPInstruction::ActiveLaneMask: {
     // Get first lane of vector induction variable.
     Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
@@ -385,6 +406,9 @@
   case VPInstruction::ActiveLaneMask:
     O << "active lane mask";
     break;
+  case VPInstruction::GetDynamicVl:
+    O << "get dynamic vl";
+    break;
   case VPInstruction::FirstOrderRecurrenceSplice:
     O << "first-order splice";
     break;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/prefer_tail_folding_with_vsetvl.ll b/llvm/test/Transforms/LoopVectorize/RISCV/prefer_tail_folding_with_vsetvl.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/prefer_tail_folding_with_vsetvl.ll
@@ -0,0 +1,310 @@
+; RUN: opt -mtriple riscv64-linux-gnu -mattr=+v,+f \
+; RUN:   -loop-vectorize -scalable-vectorization=on \
+; RUN:   -prefer-predicate-over-epilogue=vlen-else-scalar-epilogue -S < %s | \
+; RUN:   FileCheck %s -check-prefixes=CHECK,PREFER-FOLDING
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: prefer_folding(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 0)
+; PREFER-FOLDING: call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.i32({{.*}}, {{.*}}, i32 %get.dynamic.vl)
+; PREFER-FOLDING: call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.i32({{.*}}, {{.*}}, i32 %get.dynamic.vl)
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2i32.i32({{.*}}, {{.*}}, i32 %get.dynamic.vl)
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+;
+; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; NO-FOLDING-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
+; NO-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %for.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
+  store i32 %add, i32* %arrayidx2, align 4
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+
+define void @mixed_types(i16* noalias nocapture %A, i16* noalias nocapture readonly %B, i16* noalias nocapture readonly %C, i32* noalias nocapture %D, i32* noalias nocapture readonly %E, i32* noalias nocapture readonly %F) #0 {
+; CHECK-LABEL: mixed_types(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 0)
+; PREFER-FOLDING: call <vscale x 2 x i16> @llvm.riscv.vle.nxv2i16.i32
+; PREFER-FOLDING: call <vscale x 2 x i16> @llvm.riscv.vle.nxv2i16.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2i16.i32
+; PREFER-FOLDING: call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.i32
+; PREFER-FOLDING: call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2i32.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.018 = phi i32 [ 0, %entry ], [ %add9, %for.body ]
+  %arrayidx = getelementptr inbounds i16, i16* %B, i32 %i.018
+  %0 = load i16, i16* %arrayidx, align 2
+  %arrayidx1 = getelementptr inbounds i16, i16* %C, i32 %i.018
+  %1 = load i16, i16* %arrayidx1, align 2
+  %add = add i16 %1, %0
+  %arrayidx4 = getelementptr inbounds i16, i16* %A, i32 %i.018
+  store i16 %add, i16* %arrayidx4, align 2
+  %arrayidx5 = getelementptr inbounds i32, i32* %E, i32 %i.018
+  %2 = load i32, i32* %arrayidx5, align 4
+  %arrayidx6 = getelementptr inbounds i32, i32* %F, i32 %i.018
+  %3 = load i32, i32* %arrayidx6, align 4
+  %add7 = add nsw i32 %3, %2
+  %arrayidx8 = getelementptr inbounds i32, i32* %D, i32 %i.018
+  store i32 %add7, i32* %arrayidx8, align 4
+  %add9 = add nuw nsw i32 %i.018, 1
+  %exitcond = icmp eq i32 %add9, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @zero_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: zero_extending_load_allowed(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 0)
+; PREFER-FOLDING: call <vscale x 2 x i8> @llvm.riscv.vle.nxv2i8.i32
+; PREFER-FOLDING: call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2i32.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = zext i8 %0 to i32
+  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %conv
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
+  store i32 %add, i32* %arrayidx2, align 4
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @sign_extending_load_allowed(i32* noalias nocapture %A, i8* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: sign_extending_load_allowed(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 0)
+; PREFER-FOLDING: call <vscale x 2 x i8> @llvm.riscv.vle.nxv2i8.i32
+; PREFER-FOLDING: call <vscale x 2 x i32> @llvm.riscv.vle.nxv2i32.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2i32.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds i8, i8* %B, i32 %i.09
+  %0 = load i8, i8* %arrayidx, align 1
+  %conv = sext i8 %0 to i32
+  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %conv
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
+  store i32 %add, i32* %arrayidx2, align 4
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @narrowing_store_allowed(i8* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: narrowing_store_allowed(
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2i8.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
+  %1 = load i32, i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %0
+  %conv = trunc i32 %add to i8
+  %arrayidx2 = getelementptr inbounds i8, i8* %A, i32 %i.09
+  store i8 %conv, i8* %arrayidx2, align 1
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define dso_local void @half(half* noalias nocapture %A, half* noalias nocapture readonly %B, half* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: half(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 1, i32 0)
+; PREFER-FOLDING: call <vscale x 4 x half> @llvm.riscv.vle.nxv4f16.i32
+; PREFER-FOLDING: call <vscale x 4 x half> @llvm.riscv.vle.nxv4f16.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv4f16.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
+  %0 = load half, half* %arrayidx, align 2
+  %arrayidx1 = getelementptr inbounds half, half* %C, i32 %i.09
+  %1 = load half, half* %arrayidx1, align 2
+  %add = fadd fast half %1, %0
+  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
+  store half %add, half* %arrayidx2, align 2
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: float(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 1)
+; PREFER-FOLDING: call <vscale x 4 x float> @llvm.riscv.vle.nxv4f32.i32
+; PREFER-FOLDING: call <vscale x 4 x float> @llvm.riscv.vle.nxv4f32.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv4f32.i32
+; PREFER-FOLDING: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; PREFER-FOLDING: %[[STEP:.*]] = mul i32 %[[VSCALE]], 4
+; PREFER-FOLDING: %index.next = add i32 %index, %[[STEP]]
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
+  %1 = load float, float* %arrayidx1, align 4
+  %add = fadd fast float %1, %0
+  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
+  store float %add, float* %arrayidx2, align 4
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
+}
+
+define void @fpext_allowed(float* noalias nocapture %A, half* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: fpext_allowed(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 0)
+; PREFER-FOLDING: call <vscale x 2 x half> @llvm.riscv.vle.nxv2f16.i32
+; PREFER-FOLDING: call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2f32.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds half, half* %B, i32 %i.09
+  %0 = load half, half* %arrayidx, align 2
+  %conv = fpext half %0 to float
+  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
+  %1 = load float, float* %arrayidx1, align 4
+  %add = fadd fast float %1, %conv
+  %arrayidx2 = getelementptr inbounds float, float* %A, i32 %i.09
+  store float %add, float* %arrayidx2, align 4
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define void @fptrunc_allowed(half* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
+; CHECK-LABEL: fptrunc_allowed(
+; PREFER-FOLDING: vector.body:
+; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
+; PREFER-FOLDING: %avl_length = sub i32 431, %[[VIVELEM0]]
+; PREFER-FOLDING: %get.dynamic.vl = call i32 @llvm.riscv.vsetvli.i32(i32 %avl_length, i32 2, i32 0)
+; PREFER-FOLDING: call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.i32
+; PREFER-FOLDING: call <vscale x 2 x float> @llvm.riscv.vle.nxv2f32.i32
+; PREFER-FOLDING: call void @llvm.riscv.vse.nxv2f16.i32
+; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
+entry:
+  br label %for.body
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %i.09 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %B, i32 %i.09
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds float, float* %C, i32 %i.09
+  %1 = load float, float* %arrayidx1, align 4
+  %add = fadd fast float %1, %0
+  %conv = fptrunc float %add to half
+  %arrayidx2 = getelementptr inbounds half, half* %A, i32 %i.09
+  store half %conv, half* %arrayidx2, align 2
+  %add3 = add nuw nsw i32 %i.09, 1
+  %exitcond = icmp eq i32 %add3, 431
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { "target-features"="+v,+f,+experimental-zvfh,+zfh" }
+!5 = distinct !{!5, !6}
!{!"llvm.loop.vectorize.enable", i1 true} + +!7 = distinct !{!7, !8} +!8 = !{!"llvm.loop.vectorize.predicate.enable", i1 false} + +!10 = distinct !{!10, !11} +!11 = !{!"llvm.loop.vectorize.width", i32 4}