diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
--- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
+++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorize.h
@@ -62,6 +62,7 @@
 
 namespace llvm {
 
+class AAResults;
 class AssumptionCache;
 class BlockFrequencyInfo;
 class DemandedBits;
@@ -177,6 +178,7 @@
   BlockFrequencyInfo *BFI;
   TargetLibraryInfo *TLI;
   DemandedBits *DB;
+  AAResults *AA;
   AssumptionCache *AC;
   LoopAccessInfoManager *LAIs;
   OptimizationRemarkEmitter *ORE;
@@ -187,13 +189,12 @@
                      function_ref<StringRef(StringRef)> MapClassName2PassName);
 
   // Shim for old PM.
-  LoopVectorizeResult runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_,
-                              TargetTransformInfo &TTI_, DominatorTree &DT_,
-                              BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
-                              DemandedBits &DB_, AssumptionCache &AC_,
-                              LoopAccessInfoManager &LAIs_,
-                              OptimizationRemarkEmitter &ORE_,
-                              ProfileSummaryInfo *PSI_);
+  LoopVectorizeResult
+  runImpl(Function &F, ScalarEvolution &SE_, LoopInfo &LI_,
+          TargetTransformInfo &TTI_, DominatorTree &DT_,
+          BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_, DemandedBits &DB_,
+          AAResults &AA_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
+          OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_);
 
   bool processLoop(Loop *L);
 };
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -29,6 +29,7 @@
 
 namespace llvm {
 
+class AAResults;
 class LoopInfo;
 class LoopVectorizationLegality;
 class LoopVectorizationCostModel;
@@ -274,6 +275,8 @@
 
   PredicatedScalarEvolution &PSE;
 
+  AAResults &AA;
+
   const LoopVectorizeHints &Hints;
 
   OptimizationRemarkEmitter *ORE;
@@ -289,11 +292,11 @@
                            LoopVectorizationLegality *Legal,
                            LoopVectorizationCostModel &CM,
                            InterleavedAccessInfo &IAI,
-                           PredicatedScalarEvolution &PSE,
+                           PredicatedScalarEvolution &PSE, AAResults &AA,
                            const LoopVectorizeHints &Hints,
                            OptimizationRemarkEmitter *ORE)
       : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
-        PSE(PSE), Hints(Hints), ORE(ORE) {}
+        PSE(PSE), AA(AA), Hints(Hints), ORE(ORE) {}
 
   /// Plan how to best vectorize, return the best VF and its cost, or
   /// std::nullopt if vectorization and interleaving should be avoided up front.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -678,6 +678,9 @@
   /// Dominator Tree.
   DominatorTree *DT;
 
+  /// Alias Analysis.
+  AAResults *AA;
+
   /// Target Library Info.
   const TargetLibraryInfo *TLI;
 
@@ -9056,7 +9059,7 @@
   // Sink users of fixed-order recurrence past the recipe defining the previous
   // value and introduce FirstOrderRecurrenceSplice VPInstructions.
   if (!VPlanTransforms::adjustFixedOrderRecurrences(
-          *Plan, Builder, OrigLoop, PSE,
+          *Plan, Builder, OrigLoop, PSE, AA,
           [this](PHINode *PN, const InductionDescriptor &ID)
               -> const InductionDescriptor & {
             SmallPtrSet<Value *, 4> AllowedExit;
@@ -9898,7 +9901,7 @@
     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
-    OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
+    AAResults &AA, OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
     LoopVectorizationRequirements &Requirements) {
 
@@ -9918,7 +9921,8 @@
   // Use the planner for outer loop vectorization.
   // TODO: CM is not used at this point inside the planner. Turn CM into an
   // optional argument if we don't need it in the future.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, AA, Hints,
+                               ORE);
 
   // Get user vectorization factor.
   ElementCount UserVF = Hints.getWidth();
@@ -10162,7 +10166,8 @@
   // pipeline.
   if (!L->isInnermost())
     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
-                                        ORE, BFI, PSI, Hints, Requirements);
+                                        *AA, ORE, BFI, PSI, Hints,
+                                        Requirements);
 
   assert(L->isInnermost() && "Inner loop expected.");
 
@@ -10261,7 +10266,8 @@
   CM.collectElementTypesForWidening();
 
   // Use the planner for vectorization.
-  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
+  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, *AA, Hints,
+                               ORE);
 
   // Get user vectorization factor and interleave count.
   ElementCount UserVF = Hints.getWidth();
@@ -10530,14 +10536,16 @@
 LoopVectorizeResult LoopVectorizePass::runImpl(
     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
     DominatorTree &DT_, BlockFrequencyInfo *BFI_, TargetLibraryInfo *TLI_,
-    DemandedBits &DB_, AssumptionCache &AC_, LoopAccessInfoManager &LAIs_,
-    OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
+    DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
+    LoopAccessInfoManager &LAIs_, OptimizationRemarkEmitter &ORE_,
+    ProfileSummaryInfo *PSI_) {
   SE = &SE_;
   LI = &LI_;
   TTI = &TTI_;
   DT = &DT_;
   BFI = BFI_;
   TLI = TLI_;
+  AA = &AA_;
   AC = &AC_;
   LAIs = &LAIs_;
   DB = &DB_;
@@ -10605,6 +10613,7 @@
   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  auto &AA = AM.getResult<AAManager>(F);
   auto &AC = AM.getResult<AssumptionAnalysis>(F);
   auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
   auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
@@ -10617,7 +10626,7 @@
   if (PSI && PSI->hasProfileSummary())
     BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
   LoopVectorizeResult Result =
-      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AC, LAIs, ORE, PSI);
+      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, LAIs, ORE, PSI);
   if (!Result.MadeAnyChange)
     return PreservedAnalyses::all();
   PreservedAnalyses PA;
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -82,6 +82,7 @@
   /// not valid.
   static bool adjustFixedOrderRecurrences(
       VPlan &Plan, VPBuilder &Builder, Loop *L, PredicatedScalarEvolution &PSE,
+      AAResults &AA,
       function_ref<const InductionDescriptor &(
           PHINode *, const InductionDescriptor &)>
           AddInduction);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -12,12 +12,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "VPlanTransforms.h"
-#include "VPlanDominatorTree.h"
 #include "VPRecipeBuilder.h"
 #include "VPlanCFG.h"
+#include "VPlanDominatorTree.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/Intrinsics.h"
 
@@ -657,12 +659,25 @@
   return VPDT.properlyDominates(ParentA, ParentB);
 }
 
+static Value *getUnderlyingObject(VPValue *V) {
+  if (auto *R = dyn_cast<VPReplicateRecipe>(V)) {
+    auto *GEP = dyn_cast_or_null<GetElementPtrInst>(R->getUnderlyingValue());
+    if (GEP)
+      return getUnderlyingObject(R->getOperand(0));
+  }
+  if (auto *GEP = dyn_cast<VPWidenGEPRecipe>(V))
+    return getUnderlyingObject(GEP->getOperand(0));
+  if (!V->getDefiningRecipe())
+    return getUnderlyingObject(V->getLiveInIRValue());
+  return nullptr;
+}
+
 /// Sink users of \p FOR after the recipe defining the previous value \p
 /// Previous of the recurrence. \returns true if all users of \p FOR could be
 /// re-arranged as needed or false if it is not possible.
 static bool
 sinkRecurrenceUsersAfterPrevious(VPFirstOrderRecurrencePHIRecipe *FOR,
-                                 VPRecipeBase *Previous,
+                                 VPRecipeBase *Previous, AAResults &AA,
                                  VPDominatorTree &VPDT) {
   // Collect recipes that need sinking.
   SmallVector<VPRecipeBase *> WorkList;
@@ -680,11 +695,24 @@
       return true;
 
     if (SinkCandidate->mayHaveSideEffects()) {
-      if (!isa<VPWidenMemoryInstructionRecipe>(SinkCandidate))
+      auto *WriteR = dyn_cast<VPWidenMemoryInstructionRecipe>(SinkCandidate);
+      if (!WriteR)
        return false;
      if (any_of(make_range(std::next(SinkCandidate->getIterator()),
                            std::next(Previous->getIterator())),
-                [&](VPRecipeBase &R) { return R.mayReadOrWriteMemory(); }))
+                [&](VPRecipeBase &R) {
+                  if (WriteR) {
+                    if (auto *MemR =
+                            dyn_cast<VPWidenMemoryInstructionRecipe>(&R)) {
+                      Value *ObjA = getUnderlyingObject(WriteR->getAddr());
+                      Value *ObjB = getUnderlyingObject(MemR->getAddr());
+                      if (!ObjA || !ObjB || !AA.isNoAlias(ObjA, ObjB))
+                        return true;
+                      return false;
+                    }
+                  }
+                  return R.mayHaveSideEffects();
+                }))
        return false;
    }
@@ -725,6 +753,7 @@
 
 bool VPlanTransforms::adjustFixedOrderRecurrences(
     VPlan &Plan, VPBuilder &Builder, Loop *L, PredicatedScalarEvolution &PSE,
+    AAResults &AA,
     function_ref<const InductionDescriptor &(
         PHINode *, const InductionDescriptor &)>
         AddInduction) {
@@ -752,7 +781,7 @@
       Previous = PrevPhi->getBackedgeValue()->getDefiningRecipe();
     }
 
-    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, VPDT)) {
+    if (!sinkRecurrenceUsersAfterPrevious(FOR, Previous, AA, VPDT)) {
       IllegalFORs.push_back(FOR);
       continue;
     }
diff --git a/llvm/test/Transforms/LoopVectorize/fixed-order-recurrences-memory-instructions.ll b/llvm/test/Transforms/LoopVectorize/fixed-order-recurrences-memory-instructions.ll
--- a/llvm/test/Transforms/LoopVectorize/fixed-order-recurrences-memory-instructions.ll
+++ b/llvm/test/Transforms/LoopVectorize/fixed-order-recurrences-memory-instructions.ll
@@ -142,17 +142,41 @@
 define void @sink_store_past_non_aliasing_load(ptr noalias %A, ptr noalias %B) {
 ; CHECK-LABEL: @sink_store_past_non_aliasing_load(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[WIDE_LOAD:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    [[WIDE_LOAD]] = load <4 x i32>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1000, 1000
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[FOR:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[IV]]
-; CHECK-NEXT:    store i32 [[FOR]], ptr [[GEP_A]], align 4
-; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    store i32 [[SCALAR_RECUR]], ptr [[GEP_A]], align 4
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
 ; CHECK-NEXT:    [[FOR_NEXT]] = load i32, ptr [[GEP_B]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -212,18 +236,45 @@
 define void @sink_store_past_non_aliasing_store(ptr noalias %A, ptr noalias %B) {
 ; CHECK-LABEL: @sink_store_past_non_aliasing_store(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ <i32 poison, i32 poison, i32 poison, i32 0>, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT:    store <4 x i32> <i32 123, i32 123, i32 123, i32 123>, ptr [[TMP3]], align 4
+; CHECK-NEXT:    [[TMP4]] = add <4 x i32> [[VEC_IND]], <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 0
+; CHECK-NEXT:    store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1000, 1000
+; CHECK-NEXT:    [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[FOR:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[IV]]
-; CHECK-NEXT:    store i32 [[FOR]], ptr [[GEP_A]], align 4
-; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i32 [[IV]]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[SCALAR_RECUR:%.*]] = phi i32 [ [[SCALAR_RECUR_INIT]], [[SCALAR_PH]] ], [ [[FOR_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[IV]]
+; CHECK-NEXT:    store i32 [[SCALAR_RECUR]], ptr [[GEP_A]], align 4
+; CHECK-NEXT:    [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[IV]]
 ; CHECK-NEXT:    store i32 123, ptr [[GEP_B]], align 4
 ; CHECK-NEXT:    [[FOR_NEXT]] = add i32 [[IV]], 2
 ; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;