Index: llvm/include/llvm/Analysis/LoopAccessAnalysis.h
===================================================================
--- llvm/include/llvm/Analysis/LoopAccessAnalysis.h
+++ llvm/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -516,7 +516,8 @@ class LoopAccessInfo {
 public:
   LoopAccessInfo(Loop *L, ScalarEvolution *SE, const TargetLibraryInfo *TLI,
-                 AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI);
+                 const TargetTransformInfo *TTI, AliasAnalysis *AA,
+                 DominatorTree *DT, LoopInfo *LI);

   /// Return true we can analyze the memory accesses in the loop and there are
   /// no memory dependence cycles.
@@ -608,7 +609,8 @@ private:
   /// Analyze the loop.
   void analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
-                   const TargetLibraryInfo *TLI, DominatorTree *DT);
+                   const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI,
+                   DominatorTree *DT);

   /// Check if the structure of the loop allows it to be analyzed by this
   /// pass.
@@ -626,7 +628,8 @@
   ///
   /// Looks for accesses like "a[i * StrideA]" where "StrideA" is loop
   /// invariant.
-  void collectStridedAccess(Value *LoadOrStoreInst);
+  void collectStridedAccess(Value *LoadOrStoreInst,
+                            const TargetTransformInfo *TTI);

   std::unique_ptr<PredicatedScalarEvolution> PSE;
@@ -750,6 +753,7 @@
   // The used analysis passes.
   ScalarEvolution *SE = nullptr;
   const TargetLibraryInfo *TLI = nullptr;
+  const TargetTransformInfo *TTI = nullptr;
   AliasAnalysis *AA = nullptr;
   DominatorTree *DT = nullptr;
   LoopInfo *LI = nullptr;
Index: llvm/lib/Analysis/LoopAccessAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -1789,6 +1789,7 @@
 void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
                                  const TargetLibraryInfo *TLI,
+                                 const TargetTransformInfo *TTI,
                                  DominatorTree *DT) {
   typedef SmallPtrSet<Value *, 16> ValueSet;
@@ -1866,7 +1867,7 @@
         Loads.push_back(Ld);
         DepChecker->addAccess(Ld);
         if (EnableMemAccessVersioning)
-          collectStridedAccess(Ld);
+          collectStridedAccess(Ld, TTI);
         continue;
       }
@@ -1890,7 +1891,7 @@
       Stores.push_back(St);
      DepChecker->addAccess(St);
       if (EnableMemAccessVersioning)
-        collectStridedAccess(St);
+        collectStridedAccess(St, TTI);
     }
   } // Next instr.
 } // Next block.
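The functional core of the patch is the early return added to collectStridedAccess in the hunk below: when the target can legally emit a masked gather (for a load) or masked scatter (for a store) of the accessed type at its alignment, LAA skips symbolic-stride speculation for that access, so no stride-versioning runtime check (vector.scevcheck) is generated and the vectorizer is free to use the gather/scatter directly. Distilled into a standalone sketch for clarity; the helper name is illustrative, not part of the patch:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Sketch of the new bail-out condition: returns true when versioning on
  // the symbolic stride is unnecessary because the access could instead be
  // vectorized as a masked gather/scatter.
  static bool preferGatherScatter(Value *MemAccess,
                                  const TargetTransformInfo *TTI) {
    if (!TTI)
      return false; // Null TTI: fall back to the old versioning behaviour.
    if (auto *Ld = dyn_cast<LoadInst>(MemAccess))
      return TTI->isLegalMaskedGather(Ld->getType(),
                                      getLoadStoreAlignment(MemAccess));
    if (auto *St = dyn_cast<StoreInst>(MemAccess))
      return TTI->isLegalMaskedScatter(St->getValueOperand()->getType(),
                                       getLoadStoreAlignment(MemAccess));
    return false;
  }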
@@ -2279,13 +2280,20 @@
   return addRuntimeChecks(Loc, PtrRtChecking->getChecks());
 }

-void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
+void LoopAccessInfo::collectStridedAccess(Value *MemAccess,
+                                          const TargetTransformInfo *TTI) {
   Value *Ptr = nullptr;
-  if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
+  if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess)) {
     Ptr = LI->getPointerOperand();
-  else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess))
+    if (TTI && TTI->isLegalMaskedGather(LI->getType(),
+                                        getLoadStoreAlignment(MemAccess)))
+      return;
+  } else if (StoreInst *SI = dyn_cast<StoreInst>(MemAccess)) {
     Ptr = SI->getPointerOperand();
-  else
+    if (TTI && TTI->isLegalMaskedScatter(SI->getValueOperand()->getType(),
+                                         getLoadStoreAlignment(MemAccess)))
+      return;
+  } else
     return;

   Value *Stride = getStrideFromPointer(Ptr, PSE->getSE(), TheLoop);
@@ -2343,8 +2351,10 @@
 }

 LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
-                               const TargetLibraryInfo *TLI, AliasAnalysis *AA,
-                               DominatorTree *DT, LoopInfo *LI)
+                               const TargetLibraryInfo *TLI,
+                               const TargetTransformInfo *TTI,
+                               AliasAnalysis *AA, DominatorTree *DT,
+                               LoopInfo *LI)
     : PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
       PtrRtChecking(std::make_unique<RuntimePointerChecking>(SE)),
       DepChecker(std::make_unique<MemoryDepChecker>(*PSE, L)), TheLoop(L),
@@ -2352,7 +2362,7 @@
       HasConvergentOp(false),
       HasDependenceInvolvingLoopInvariantAddress(false) {
   if (canAnalyzeLoop())
-    analyzeLoop(AA, LI, TLI, DT);
+    analyzeLoop(AA, LI, TLI, TTI, DT);
 }

 void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
@@ -2406,7 +2416,7 @@
   auto &LAI = LoopAccessInfoMap[L];

   if (!LAI)
-    LAI = std::make_unique<LoopAccessInfo>(L, SE, TLI, AA, DT, LI);
+    LAI = std::make_unique<LoopAccessInfo>(L, SE, TLI, TTI, AA, DT, LI);

   return *LAI.get();
 }
@@ -2426,6 +2436,8 @@
   SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
   auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
   TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+  auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+  TTI = TTIP ? &TTIP->getTTI(F) : nullptr;
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
@@ -2457,7 +2469,7 @@
 LoopAccessInfo LoopAccessAnalysis::run(Loop &L, LoopAnalysisManager &AM,
                                        LoopStandardAnalysisResults &AR) {
-  return LoopAccessInfo(&L, &AR.SE, &AR.TLI, &AR.AA, &AR.DT, &AR.LI);
+  return LoopAccessInfo(&L, &AR.SE, &AR.TLI, &AR.TTI, &AR.AA, &AR.DT, &AR.LI);
 }

 namespace llvm {
Index: llvm/test/Transforms/LoopVectorize/ARM/mve-mat-mul.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/mve-mat-mul.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/mve-mat-mul.ll
@@ -27,36 +27,36 @@
 ; CHECK: for.cond8.preheader.us.us:
 ; CHECK-NEXT: [[J_051_US_US:%.*]] = phi i32 [ [[INC21_US_US:%.*]], [[FOR_COND8_FOR_COND_CLEANUP10_CRIT_EDGE_US_US:%.*]] ], [ 0, [[FOR_COND8_PREHEADER_US_US_PREHEADER]] ]
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[L]], 4
-; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
-; CHECK: vector.scevcheck:
-; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[M]], 1
-; CHECK-NEXT: [[TMP0:%.*]] = or i1 false, [[IDENT_CHECK]]
-; CHECK-NEXT: br i1 [[TMP0]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[L]], 4
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[L]], [[N_MOD_VF]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[M]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> undef, i32 [[J_051_US_US]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
-; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 0
-; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], [[MUL_US]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 0
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: [[TMP6:%.*]] = mul i32 [[TMP1]], [[M]]
-; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], [[J_051_US_US]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP7]]
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP10]], align 4
-; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[TMP0]], [[MUL_US]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP8]], [[BROADCAST_SPLAT2]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], <4 x i32> [[TMP9]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP10]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]]
 ; CHECK-NEXT: [[TMP12]] = add nsw <4 x i32> [[TMP11]], [[VEC_PHI]]
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
 ; CHECK: middle.block:
@@ -64,8 +64,8 @@
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[L]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND8_FOR_COND_CLEANUP10_CRIT_EDGE_US_US]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_COND8_PREHEADER_US_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ]
-; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_COND8_PREHEADER_US_US]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_COND8_PREHEADER_US_US]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_COND8_PREHEADER_US_US]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT: br label [[FOR_BODY11_US_US:%.*]]
 ; CHECK: for.cond8.for.cond.cleanup10_crit_edge.us.us:
 ; CHECK-NEXT: [[ADD16_US_US_LCSSA:%.*]] = phi i32 [ [[ADD16_US_US:%.*]], [[FOR_BODY11_US_US]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
Index: llvm/test/Transforms/LoopVectorize/X86/optsize.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/optsize.ll
+++ llvm/test/Transforms/LoopVectorize/X86/optsize.ll
@@ -148,17 +148,17 @@
 ; AUTOVF-NOT: vector.scevcheck
 ; AUTOVF-NOT: vector.body:
 ; AUTOVF-LABEL: for.body:
-define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 {
+define void @scev4stride1(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i32 %k) #2 {
 for.body.preheader:
   br label %for.body

 for.body:                                         ; preds = %for.body.preheader, %for.body
   %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
   %mul = mul nsw i32 %i.07, %k
-  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %mul
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %a, i32 %i.07
-  store i32 %0, i32* %arrayidx1, align 4
+  %arrayidx = getelementptr inbounds i16, i16* %b, i32 %mul
+  %0 = load i16, i16* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i16, i16* %a, i32 %i.07
+  store i16 %0, i16* %arrayidx1, align 4
   %inc = add nuw nsw i32 %i.07, 1
   %exitcond = icmp eq i32 %inc, 256
   br i1 %exitcond, label %for.end.loopexit, label %for.body
Index: llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
===================================================================
--- llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
+++ llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp
@@ -43,7 +43,7 @@
     AARes.reset(new AAResults(TLI));
     AARes->addAAResult(*BasicAA);
     PSE.reset(new PredicatedScalarEvolution(*SE, *L));
-    LAI.reset(new LoopAccessInfo(L, &*SE, &TLI, &*AARes, &*DT, &*LI));
+    LAI.reset(new LoopAccessInfo(L, &*SE, &TLI, nullptr, &*AARes, &*DT, &*LI));
     IAI.reset(new InterleavedAccessInfo(*PSE, L, &*DT, &*LI, &*LAI));
     IAI->analyzeInterleaving(false);
     return {Plan, *IAI};
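Two notes on the updated call sites and tests. The new checks are guarded by "if (TTI && ...)", so callers without target information, such as VPlanSlpTest above, pass a null TTI and keep the previous versioning behaviour unchanged:

  // Illustrative only; mirrors the VPlanSlpTest update above.
  LoopAccessInfo LAI(L, SE, TLI, /*TTI=*/nullptr, AA, DT, LI);

In mve-mat-mul.ll the vector.scevcheck block (the runtime M == 1 stride check) disappears because MVE can legally gather <4 x i32>, so the strided access to B is now emitted as an @llvm.masked.gather instead of a versioned unit-stride load. In optsize.ll the element type changes from i32 to i16, presumably so the access stays gather-illegal on the target and the test keeps verifying that no stride versioning is performed under optsize.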