diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8687,6 +8687,19 @@ if (!CM.blockNeedsPredication(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. + // If the header block needs predication then it is only because tail-folding is + // enabled. If we are using VP intrinsics for a target with vector length + // predication support, this mask (icmp ule %IV %BTC) becomes redundant with + // EVL, which means unless we are using VP intrinsics without vector length + // predication support we can replace this mask with an all-true mask for + // possibly better latency. + if (CM.preferVPIntrinsics() && + PreferPredicateWithVPIntrinsics != + PreferVPIntrinsicsTy::WithoutAVLSupport) { + BlockMask = Builder.createNaryOp(VPInstruction::AllTrueMask, {}); + return BlockMaskCache[BB] = BlockMask; + } + // Create the block in mask as the first non-phi instruction in the block. 
VPBuilder::InsertPointGuard Guard(Builder); auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi(); @@ -8728,10 +8741,31 @@ return BlockMaskCache[BB] = BlockMask; } -VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, - ArrayRef Operands, - VFRange &Range, - VPlanPtr &Plan) { +VPValue *VPRecipeBuilder::getOrCreateEVL(VPlanPtr &Plan) { + if (EVL) + return EVL; + + if (PreferPredicateWithVPIntrinsics == + PreferVPIntrinsicsTy::WithoutAVLSupport) { + EVL = Plan->getOrCreateRuntimeVF(); + return EVL; + } + + VPBuilder::InsertPointGuard Guard(Builder); + auto *HeaderBB = Plan->getEntry()->getSingleSuccessor()->getEntryBasicBlock(); + auto NewInsertionPoint = HeaderBB->getFirstNonPhi(); + Builder.setInsertPoint(HeaderBB, NewInsertionPoint); + + VPValue *IV = getOrCreateIV(Builder.getInsertBlock(), Plan); + VPValue *TC = Plan->getOrCreateTripCount(); + auto *EVLRecipe = new VPWidenEVLRecipe(IV, TC); + Builder.getInsertBlock()->insert(EVLRecipe, Builder.getInsertPoint()); + EVL = EVLRecipe->getEVL(); + return EVL; +} + +bool VPRecipeBuilder::validateWidenMemory(Instruction *I, + VFRange &Range) const { assert((isa(I) || isa(I)) && "Must be called with either a load or store"); @@ -8750,7 +8784,14 @@ return Decision != LoopVectorizationCostModel::CM_Scalarize; }; - if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range)) + return LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range); +} + +VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, + ArrayRef Operands, + VFRange &Range, + VPlanPtr &Plan) { + if (!validateWidenMemory(I, Range)) return nullptr; VPValue *Mask = nullptr; @@ -8765,6 +8806,24 @@ Mask); } +VPRecipeBase * +VPRecipeBuilder::tryToPredicatedWidenMemory(Instruction *I, + ArrayRef Operands, + VFRange &Range, VPlanPtr &Plan) { + if (!validateWidenMemory(I, Range)) + return nullptr; + + VPValue *Mask = createBlockInMask(I->getParent(), Plan); + VPValue *EVL = getOrCreateEVL(Plan); + if 
(LoadInst *Load = dyn_cast(I)) + return new VPPredicatedWidenMemoryInstructionRecipe(*Load, Operands[0], + Mask, EVL); + + StoreInst *Store = cast(I); + return new VPPredicatedWidenMemoryInstructionRecipe(*Store, Operands[1], + Operands[0], Mask, EVL); +} + VPWidenIntOrFpInductionRecipe * VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, ArrayRef Operands) const { @@ -8897,8 +8956,11 @@ Range); } -VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, - ArrayRef Operands) const { +bool VPRecipeBuilder::preferPredicatedWiden() const { + return CM.preferVPIntrinsics(); +} + +bool VPRecipeBuilder::validateWiden(Instruction *I) const { auto IsVectorizableOpcode = [](unsigned Opcode) { switch (Opcode) { case Instruction::Add: @@ -8940,7 +9002,12 @@ return false; }; - if (!IsVectorizableOpcode(I->getOpcode())) + return IsVectorizableOpcode(I->getOpcode()); +} + +VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, + ArrayRef Operands) const { + if (!validateWiden(I)) return nullptr; // Success: widen this instruction. 
@@ -8957,6 +9024,17 @@ } } +VPPredicatedWidenRecipe *VPRecipeBuilder::tryToPredicatedWiden( + Instruction *I, ArrayRef Operands, VPlanPtr &Plan) { + if (!validateWiden(I)) + return nullptr; + + VPValue *Mask = createBlockInMask(I->getParent(), Plan); + VPValue *EVL = getOrCreateEVL(Plan); + return new VPPredicatedWidenRecipe( + *I, make_range(Operands.begin(), Operands.end()), Mask, EVL); +} + VPBasicBlock *VPRecipeBuilder::handleReplication( Instruction *I, VFRange &Range, VPBasicBlock *VPBB, VPlanPtr &Plan) { @@ -9045,8 +9123,13 @@ if (auto *CI = dyn_cast(Instr)) return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); - if (isa(Instr) || isa(Instr)) + if (isa(Instr) || isa(Instr)) { + if (preferPredicatedWiden()) + return toVPRecipeResult( + tryToPredicatedWidenMemory(Instr, Operands, Range, Plan)); + return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); + } VPRecipeBase *Recipe; if (auto Phi = dyn_cast(Instr)) { @@ -9092,6 +9175,9 @@ *SI, make_range(Operands.begin(), Operands.end()), InvariantCond)); } + if (preferPredicatedWiden()) + return toVPRecipeResult(tryToPredicatedWiden(Instr, Operands, Plan)); + return toVPRecipeResult(tryToWiden(Instr, Operands)); } @@ -9524,6 +9610,10 @@ State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State); } +void VPPredicatedWidenRecipe::execute(VPTransformState &State) { + // TODO: Implement widening +} + void VPWidenGEPRecipe::execute(VPTransformState &State) { State.ILV->widenGEP(cast(getUnderlyingInstr()), this, *this, State.UF, State.VF, IsPtrLoopInvariant, @@ -9740,6 +9830,15 @@ StoredValue, getMask()); } +void VPPredicatedWidenMemoryInstructionRecipe::execute( + VPTransformState &State) { + // TODO: Implement widening +} + +void VPWidenEVLRecipe::execute(VPTransformState &State) { + // TODO: Implement widening +} + // Determine how to lower the scalar epilogue, which depends on 1) optimising // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing // 
predication, and 4) a TTI hook that analyses whether the loop is suitable diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -41,6 +41,8 @@ VPBuilder &Builder; + VPValue *EVL = nullptr; + /// When we if-convert we need to create edge masks. We have to cache values /// so that we don't end up with exponential recursion/IR. Note that /// if-conversion currently takes place during VPlan-construction, so these @@ -76,6 +78,16 @@ VPRecipeBase *tryToWidenMemory(Instruction *I, ArrayRef Operands, VFRange &Range, VPlanPtr &Plan); + /// Similar to tryToWidenMemory, but creates a predicated recipe. The + /// predicated recipe takes mandatory mask and EVL VPInstructions. + VPRecipeBase *tryToPredicatedWidenMemory(Instruction *I, + ArrayRef Operands, + VFRange &Range, VPlanPtr &Plan); + + /// Helper method used by tryToWidenMemory and tryToPredicatedWidenMemory to + /// validate whether a memory instruction can be widened. + bool validateWidenMemory(Instruction *I, VFRange &Range) const; + /// Check if an induction recipe should be constructed for \I. If so build and /// return it. If not, return null. VPWidenIntOrFpInductionRecipe * @@ -105,9 +117,21 @@ /// that widening should be performed. VPWidenRecipe *tryToWiden(Instruction *I, ArrayRef Operands) const; + /// Similar to tryToWiden, but widen to VP intrinsics. + VPPredicatedWidenRecipe *tryToPredicatedWiden(Instruction *I, + ArrayRef Operands, + VPlanPtr &Plan); + + /// Helper method used by tryToWiden and tryToPredicatedWiden to validate if + /// an instruction can be widened. + bool validateWiden(Instruction *I) const; + /// Return a VPRecipeOrValueTy with VPRecipeBase * being set. This can be used to force the use as VPRecipeBase* for recipe sub-types that also inherit from VPValue. 
VPRecipeOrVPValueTy toVPRecipeResult(VPRecipeBase *R) const { return R; } + /// Return true if recipes that expand to VP intrinsics should be created. + bool preferPredicatedWiden() const; + /// Insert and Cache Induction Variable VPValue *getOrCreateIV(VPBasicBlock *VPBB, VPlanPtr &Plan); @@ -146,6 +170,10 @@ /// and DST. VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan); + /// A helper function that computes the Explicit(Active) Vector Length for the + /// current vector iteration. + VPValue *getOrCreateEVL(VPlanPtr &Plan); + /// Mark given ingredient for recording its recipe once one is created for /// it. void recordRecipeOf(Instruction *I) { diff --git a/llvm/test/Transforms/LoopVectorize/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/vplan-vp-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/vplan-vp-intrinsics.ll @@ -0,0 +1,110 @@ +; REQUIRES: asserts + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=without-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=WITHOUT-AVL %s + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=if-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=IF-AVL %s + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: -prefer-predicate-with-vp-intrinsics=force-active-vector-length-support \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=FORCE-AVL %s + +; RUN: opt -loop-vectorize -debug-only=loop-vectorize -force-vector-width=4 \ +; RUN: 
-prefer-predicate-with-vp-intrinsics=no-predication \ +; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ +; RUN: -mattr=+avx512f -disable-output %s 2>&1 | FileCheck --check-prefix=NO-VP %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nofree norecurse nounwind uwtable +define dso_local void @foo(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i64 %N) local_unnamed_addr { +; WITHOUT-AVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; WITHOUT-AVL-NEXT: for.body: +; WITHOUT-AVL-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; WITHOUT-AVL-NEXT: EMIT vp<%3> = icmp ule ir<%indvars.iv> vp<%0> +; WITHOUT-AVL-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN ir<%0> = load ir<%arrayidx>, vp<%3>, vp<%1> +; WITHOUT-AVL-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN ir<%1> = load ir<%arrayidx2>, vp<%3>, vp<%1> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN ir<%add> = add ir<%1>, ir<%0>, vp<%3>, vp<%1> +; WITHOUT-AVL-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; WITHOUT-AVL-NEXT: PREDICATED-WIDEN store ir<%arrayidx4>, ir<%add>, vp<%3>, vp<%1> +; WITHOUT-AVL-NEXT: No successors +; WITHOUT-AVL-NEXT: } + +; IF-AVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; IF-AVL-NEXT: for.body: +; IF-AVL-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; IF-AVL-NEXT: EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0> +; IF-AVL-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; IF-AVL-NEXT: WIDEN ir<%0> = load ir<%arrayidx>, vp<%2> +; IF-AVL-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; IF-AVL-NEXT: WIDEN ir<%1> = load ir<%arrayidx2>, vp<%2> +; IF-AVL-NEXT: WIDEN ir<%add> = add ir<%1>, ir<%0> +; IF-AVL-NEXT: CLONE ir<%arrayidx4> = 
getelementptr ir<%a>, ir<%indvars.iv> +; IF-AVL-NEXT: WIDEN store ir<%arrayidx4>, ir<%add>, vp<%2> +; IF-AVL-NEXT: No successors +; IF-AVL-NEXT: } + +; FORCE-AVL: VPlan 'Initial VPlan for VF={4},UF>=1' { +; FORCE-AVL-NEXT: for.body: +; FORCE-AVL-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; FORCE-AVL-NEXT: EMIT vp<%2> = GENERATE-EXPLICIT-VECTOR-LENGTH +; FORCE-AVL-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; FORCE-AVL-NEXT: EMIT vp<%4> = all true mask +; FORCE-AVL-NEXT: PREDICATED-WIDEN ir<%0> = load ir<%arrayidx>, vp<%4>, vp<%2> +; FORCE-AVL-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; FORCE-AVL-NEXT: PREDICATED-WIDEN ir<%1> = load ir<%arrayidx2>, vp<%4>, vp<%2> +; FORCE-AVL-NEXT: PREDICATED-WIDEN ir<%add> = add ir<%1>, ir<%0>, vp<%4>, vp<%2> +; FORCE-AVL-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; FORCE-AVL-NEXT: PREDICATED-WIDEN store ir<%arrayidx4>, ir<%add>, vp<%4>, vp<%2> +; FORCE-AVL-NEXT: No successors +; FORCE-AVL-NEXT: } + +; NO-VP: VPlan 'Initial VPlan for VF={4},UF>=1' { +; NO-VP-NEXT: for.body: +; NO-VP-NEXT: WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next +; NO-VP-NEXT: EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0> +; NO-VP-NEXT: CLONE ir<%arrayidx> = getelementptr ir<%b>, ir<%indvars.iv> +; NO-VP-NEXT: WIDEN ir<%0> = load ir<%arrayidx>, vp<%2> +; NO-VP-NEXT: CLONE ir<%arrayidx2> = getelementptr ir<%c>, ir<%indvars.iv> +; NO-VP-NEXT: WIDEN ir<%1> = load ir<%arrayidx2>, vp<%2> +; NO-VP-NEXT: WIDEN ir<%add> = add ir<%1>, ir<%0> +; NO-VP-NEXT: CLONE ir<%arrayidx4> = getelementptr ir<%a>, ir<%indvars.iv> +; NO-VP-NEXT: WIDEN store ir<%arrayidx4>, ir<%add>, vp<%2> +; NO-VP-NEXT: No successors +; NO-VP-NEXT: } + +entry: + %cmp10 = icmp sgt i64 %N, 0 + br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + 
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret void + +for.body: ; preds = %for.body.preheader, %for.body + %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %N + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body +}