diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8951,7 +8951,7 @@ SmallPtrSet DeadInstructions; VPlanTransforms::VPInstructionsToVPRecipes( - OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); + PSE, OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions); return Plan; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -23,7 +23,7 @@ /// Replaces the VPInstructions in \p Plan with corresponding /// widen recipes. static void VPInstructionsToVPRecipes( - Loop *OrigLoop, VPlanPtr &Plan, + PredicatedScalarEvolution &PSE, Loop *OrigLoop, VPlanPtr &Plan, LoopVectorizationLegality::InductionList &Inductions, SmallPtrSetImpl &DeadInstructions); }; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -17,7 +17,7 @@ using namespace llvm; void VPlanTransforms::VPInstructionsToVPRecipes( - Loop *OrigLoop, VPlanPtr &Plan, + PredicatedScalarEvolution &PSE, Loop *OrigLoop, VPlanPtr &Plan, LoopVectorizationLegality::InductionList &Inductions, SmallPtrSetImpl &DeadInstructions) { @@ -59,11 +59,20 @@ II.getKind() == InductionDescriptor::IK_FpInduction) { VPValue *Start = Plan->getOrAddVPValue(II.getStartValue()); NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, nullptr); + } else NewRecipe = new VPWidenPHIRecipe(Phi); } else if (GetElementPtrInst *GEP = dyn_cast(Inst)) { NewRecipe = new VPWidenGEPRecipe( GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop); + } else if (CallInst *CI = dyn_cast(Inst)) { + NewRecipe = + new 
VPWidenCallRecipe(*CI, Plan->mapToVPValues(CI->arg_operands())); + } else if (SelectInst *SI = dyn_cast(Inst)) { + bool InvariantCond = PSE.getSE()->isLoopInvariant( + PSE.getSCEV(SI->getOperand(0)), OrigLoop); + NewRecipe = new VPWidenSelectRecipe( + *SI, Plan->mapToVPValues(SI->operands()), InvariantCond); } else NewRecipe = new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands())); diff --git a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll --- a/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-vectorize-inner-loop-reduction.ll @@ -1,20 +1,10 @@ ; RUN: opt -loop-vectorize -force-vector-width=4 -enable-vplan-native-path -S %s | FileCheck %s -; Vectorize explict marked outer loop using vplan native path. Inner loop -; contains simple double add reduction. IR is compiled and modified by hand -; from following C code: -; void inner_loop_reduction(const double* restrict in_a, const double* restrict in_b, double* restrict out) -; { -; #pragma clang loop vectorize(enable) -; for (int i = 0; i < 1000; ++i) { -; double a = in_a[i]; -; double b = in_b[i]; -; for (int j = 0; j < 10000; ++j) { -; a = a + b; -; } -; out[i] = a; -; } -; } +; Test that VPlan native path is able to vectorize explicitly marked outer loop. +; Inner loop contains simple double add reduction and outer loop contains +; an intrinsic call. 
+ +declare double @llvm.sqrt.f64(double %0) define void @inner_loop_reduction(double* noalias nocapture readonly %a.in, double* noalias nocapture readonly %b.in, double* noalias nocapture %c.out) { ; CHECK-LABEL: @inner_loop_reduction( @@ -25,12 +15,13 @@ ; CHECK-NEXT: %[[MASKED_GATHER1:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %[[A_PTR]], i32 8, <4 x i1> , <4 x double> undef) ; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, double* %b.in, <4 x i64> %[[VEC_INDEX]] ; CHECK-NEXT: %[[MASKED_GATHER2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %[[B_PTR]], i32 8, <4 x i1> , <4 x double> undef) +; CHECK-NEXT: %[[B_SQRT:.*]] = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %[[MASKED_GATHER2]]) ; CHECK-NEXT: br label %[[FOR2_HEADER:.*]] ; CHECK: [[FOR2_HEADER]]: ; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ], [ zeroinitializer, %vector.body ] ; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ], [ %[[MASKED_GATHER1]], %vector.body ] -; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[MASKED_GATHER2]], %[[REDUCTION]] +; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[B_SQRT]], %[[REDUCTION]] ; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw <4 x i32> %[[FOR2_INDEX]], ; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i32> %[[FOR2_INDEX_NEXT]], ; CHECK-NEXT: %[[EXIT_COND:.*]] = extractelement <4 x i1> %[[VEC_PTR]], i32 0 @@ -51,30 +42,31 @@ entry: br label %for1.header -for1.header: ; preds = %entry +for1.header: %indvar1 = phi i64 [ 0, %entry ], [ %indvar11, %for1.latch ] %a.ptr = getelementptr inbounds double, double* %a.in, i64 %indvar1 %a = load double, double* %a.ptr, align 8 %b.ptr = getelementptr inbounds double, double* %b.in, i64 %indvar1 %b = load double, double* %b.ptr, align 8 + %b.sqrt = call double @llvm.sqrt.f64(double %b) br label %for2.header -for2.header: ; preds = %for1.header, %for2.header 
+for2.header: %indvar2 = phi i32 [ 0, %for1.header ], [ %indvar21, %for2.header ] %a.reduction = phi double [ %a, %for1.header ], [ %a.reduction1, %for2.header ] - %a.reduction1 = fadd double %b, %a.reduction + %a.reduction1 = fadd double %b.sqrt, %a.reduction %indvar21 = add nuw nsw i32 %indvar2, 1 %for2.cond = icmp eq i32 %indvar21, 10000 br i1 %for2.cond, label %for1.latch, label %for2.header -for1.latch: ; preds = %for2.header +for1.latch: %c.ptr = getelementptr inbounds double, double* %c.out, i64 %indvar1 store double %a.reduction1, double* %c.ptr, align 8 %indvar11 = add nuw nsw i64 %indvar1, 1 %for1.cond = icmp eq i64 %indvar11, 1000 br i1 %for1.cond, label %exit, label %for1.header, !llvm.loop !0 -exit: ; preds = %for1.latch +exit: ret void } diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -130,8 +130,8 @@ LoopVectorizationLegality::InductionList Inductions; SmallPtrSet DeadInstructions; - VPlanTransforms::VPInstructionsToVPRecipes(LI->getLoopFor(LoopHeader), Plan, - Inductions, DeadInstructions); + VPlanTransforms::VPInstructionsToVPRecipes( + *PSE, LI->getLoopFor(LoopHeader), Plan, Inductions, DeadInstructions); } TEST_F(VPlanHCFGTest, testVPInstructionToVPRecipesInner) { @@ -160,8 +160,8 @@ LoopVectorizationLegality::InductionList Inductions; SmallPtrSet DeadInstructions; - VPlanTransforms::VPInstructionsToVPRecipes(LI->getLoopFor(LoopHeader), Plan, - Inductions, DeadInstructions); + VPlanTransforms::VPInstructionsToVPRecipes( + *PSE, LI->getLoopFor(LoopHeader), Plan, Inductions, DeadInstructions); VPBlockBase *Entry = Plan->getEntry()->getEntryBasicBlock(); EXPECT_NE(nullptr, Entry->getSingleSuccessor()); diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h --- 
a/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h +++ b/llvm/unittests/Transforms/Vectorize/VPlanTestBase.h @@ -17,6 +17,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Dominators.h" #include "llvm/Support/SourceMgr.h" @@ -28,12 +29,24 @@ /// given loop entry block. class VPlanTestBase : public testing::Test { protected: + TargetLibraryInfoImpl TLII; + TargetLibraryInfo TLI; + DataLayout DL; + std::unique_ptr Ctx; std::unique_ptr M; std::unique_ptr LI; std::unique_ptr DT; + std::unique_ptr AC; + std::unique_ptr SE; + std::unique_ptr PSE; - VPlanTestBase() : Ctx(new LLVMContext) {} + VPlanTestBase() + : TLII(), TLI(TLII), + DL("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-" + "f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:" + "16:32:64-S128"), + Ctx(new LLVMContext) {} Module &parseModule(const char *ModuleString) { SMDiagnostic Err; @@ -42,13 +55,16 @@ return *M; } - void doAnalysis(Function &F) { + void doAnalysis(Function &F, BasicBlock *LoopHeader) { DT.reset(new DominatorTree(F)); LI.reset(new LoopInfo(*DT)); + AC.reset(new AssumptionCache(F)); + SE.reset(new ScalarEvolution(F, TLI, *AC, *DT, *LI)); + PSE.reset(new PredicatedScalarEvolution(*SE, *LI->getLoopFor(LoopHeader))); } VPlanPtr buildHCFG(BasicBlock *LoopHeader) { - doAnalysis(*LoopHeader->getParent()); + doAnalysis(*LoopHeader->getParent(), LoopHeader); auto Plan = std::make_unique(); VPlanHCFGBuilder HCFGBuilder(LI->getLoopFor(LoopHeader), LI.get(), *Plan); @@ -58,7 +74,7 @@ /// Build the VPlan plain CFG for the loop starting from \p LoopHeader. 
VPlanPtr buildPlainCFG(BasicBlock *LoopHeader) { - doAnalysis(*LoopHeader->getParent()); + doAnalysis(*LoopHeader->getParent(), LoopHeader); auto Plan = std::make_unique(); VPlanHCFGBuilder HCFGBuilder(LI->getLoopFor(LoopHeader), LI.get(), *Plan);