diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -47,6 +47,7 @@
 class GlobalValue;
 class InstCombiner;
 class OptimizationRemarkEmitter;
+class InterleavedAccessInfo;
 class IntrinsicInst;
 class LoadInst;
 class Loop;
@@ -531,7 +532,8 @@
   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
                                    DominatorTree *DT,
-                                   LoopVectorizationLegality *LVL) const;
+                                   LoopVectorizationLegality *LVL,
+                                   InterleavedAccessInfo *IAI) const;
 
   /// Query the target whether lowering of the llvm.get.active.lane.mask
   /// intrinsic is supported and how the mask should be used. A return value
@@ -1567,12 +1569,11 @@
                                       AssumptionCache &AC,
                                       TargetLibraryInfo *LibInfo,
                                       HardwareLoopInfo &HWLoopInfo) = 0;
-  virtual bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
-                                           ScalarEvolution &SE,
-                                           AssumptionCache &AC,
-                                           TargetLibraryInfo *TLI,
-                                           DominatorTree *DT,
-                                           LoopVectorizationLegality *LVL) = 0;
+  virtual bool
+  preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                              AssumptionCache &AC, TargetLibraryInfo *TLI,
+                              DominatorTree *DT, LoopVectorizationLegality *LVL,
+                              InterleavedAccessInfo *IAI) = 0;
   virtual PredicationStyle emitGetActiveLaneMask() = 0;
   virtual Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                        IntrinsicInst &II) = 0;
@@ -1956,8 +1957,9 @@
   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
                                    DominatorTree *DT,
-                                   LoopVectorizationLegality *LVL) override {
-    return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL);
+                                   LoopVectorizationLegality *LVL,
+                                   InterleavedAccessInfo *IAI) override {
+    return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI);
   }
   PredicationStyle emitGetActiveLaneMask() override {
     return Impl.emitGetActiveLaneMask();
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -163,7 +163,8 @@
   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
                                    DominatorTree *DT,
-                                   LoopVectorizationLegality *LVL) const {
+                                   LoopVectorizationLegality *LVL,
+                                   InterleavedAccessInfo *IAI) const {
     return false;
   }
 
diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h
--- a/llvm/include/llvm/Analysis/VectorUtils.h
+++ b/llvm/include/llvm/Analysis/VectorUtils.h
@@ -811,6 +811,9 @@
   /// cannot be filtered by masking the load/store.
   void invalidateGroupsRequiringScalarEpilogue();
 
+  /// Returns true if we have any interleave groups.
+  bool hasGroups() const { return !InterleaveGroups.empty(); }
+
 private:
   /// A wrapper around ScalarEvolution, used to add runtime SCEV checks.
   /// Simplifies SCEV expressions in the context of existing SCEV assumptions.
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -603,8 +603,9 @@
   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
                                    DominatorTree *DT,
-                                   LoopVectorizationLegality *LVL) {
-    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL);
+                                   LoopVectorizationLegality *LVL,
+                                   InterleavedAccessInfo *IAI) {
+    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI);
   }
 
   PredicationStyle emitGetActiveLaneMask() {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -295,9 +295,9 @@
 
 bool TargetTransformInfo::preferPredicateOverEpilogue(
     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
-    TargetLibraryInfo *TLI, DominatorTree *DT,
-    LoopVectorizationLegality *LVL) const {
-  return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL);
+    TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
+    InterleavedAccessInfo *IAI) const {
+  return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI);
 }
 
 PredicationStyle TargetTransformInfo::emitGetActiveLaneMask() const {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -343,7 +343,8 @@
   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
                                    DominatorTree *DT,
-                                   LoopVectorizationLegality *LVL);
+                                   LoopVectorizationLegality *LVL,
+                                   InterleavedAccessInfo *IAI);
 
   bool supportsScalableVectors() const { return ST->hasSVE(); }
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3031,10 +3031,17 @@
 
 bool AArch64TTIImpl::preferPredicateOverEpilogue(
     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
-    TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) {
+    TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
+    InterleavedAccessInfo *IAI) {
   if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
     return false;
 
+  // We don't currently support vectorisation with interleaving for SVE - with
+  // such loops we're better off not using tail-folding. This gives us a chance
+  // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
+  if (IAI->hasGroups())
+    return false;
+
   TailFoldingKind Required; // Defaults to 0.
   if (LVL->getReductionVars().size())
     Required.add(TailFoldingKind::TFReductions);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -295,7 +295,8 @@
   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
                                    DominatorTree *DT,
-                                   LoopVectorizationLegality *LVL);
+                                   LoopVectorizationLegality *LVL,
+                                   InterleavedAccessInfo *IAI);
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP,
                                OptimizationRemarkEmitter *ORE);
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -2232,7 +2232,8 @@
 
 bool ARMTTIImpl::preferPredicateOverEpilogue(
     Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
-    TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) {
+    TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL,
+    InterleavedAccessInfo *IAI) {
   if (!EnableTailPredication) {
     LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
     return false;
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9712,7 +9712,7 @@
     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
    AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
-    LoopVectorizationLegality &LVL) {
+    LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
   // 1) OptSize takes precedence over all other options, i.e. if this is set,
   // don't look at hints or options, and don't request a scalar epilogue.
   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
@@ -9747,7 +9747,7 @@
   };
 
   // 4) if the TTI hook indicates this is profitable, request predication.
-  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL))
+  if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL, IAI))
     return CM_ScalarEpilogueNotNeededUsePredicate;
 
   return CM_ScalarEpilogueAllowed;
@@ -9842,7 +9842,7 @@
   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
 
   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
-      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
+      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI);
 
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
                                 &Hints, IAI);
@@ -10086,11 +10086,6 @@
     return false;
   }
 
-  // Check the function attributes and profiles to find out if this function
-  // should be optimized for size.
-  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
-      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
-
   // Entrance to the VPlan-native vectorization path. Outer loops are processed
   // here. They may require CFG and instruction level transformations before
   // even evaluating whether vectorization is profitable. Since we cannot modify
@@ -10102,6 +10097,22 @@
 
   assert(L->isInnermost() && "Inner loop expected.");
 
+  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
+  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
+
+  // If an override option has been passed in for interleaved accesses, use it.
+  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
+    UseInterleaved = EnableInterleavedMemAccesses;
+
+  // Analyze interleaved memory accesses.
+  if (UseInterleaved)
+    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
+
+  // Check the function attributes and profiles to find out if this function
+  // should be optimized for size.
+  ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+      F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI);
+
   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
   // count by optimizing for size, to minimize overheads.
   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
@@ -10165,18 +10176,6 @@
     return false;
   }
 
-  bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
-  InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
-
-  // If an override option has been passed in for interleaved accesses, use it.
-  if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
-    UseInterleaved = EnableInterleavedMemAccesses;
-
-  // Analyze interleaved memory accesses.
-  if (UseInterleaved) {
-    IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
-  }
-
   // Use the cost model.
   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
                                 F, &Hints, IAI);
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-option.ll
@@ -213,6 +213,59 @@
   ret void
 }
 
+define void @interleave(float* noalias %dst, float* noalias %src, i64 %n) #0 {
+; CHECK-NOTF-LABEL: @interleave(
+; CHECK-NOTF: vector.body:
+; CHECK-NOTF: %[[LOAD:.*]] = load <8 x float>, <8 x float>
+; CHECK-NOTF: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+; CHECK-NOTF: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+
+; CHECK-TF-LABEL: @interleave(
+; CHECK-TF: vector.body:
+; CHECK-TF: %[[LOAD:.*]] = load <8 x float>, <8 x float>
+; CHECK-TF: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+; CHECK-TF: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+
+; CHECK-TF-NORED-LABEL: @interleave(
+; CHECK-TF-NORED: vector.body:
+; CHECK-TF-NORED: %[[LOAD:.*]] = load <8 x float>, <8 x float>
+; CHECK-TF-NORED: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+; CHECK-TF-NORED: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+
+; CHECK-TF-NOREC-LABEL: @interleave(
+; CHECK-TF-NOREC: vector.body:
+; CHECK-TF-NOREC: %[[LOAD:.*]] = load <8 x float>, <8 x float>
+; CHECK-TF-NOREC: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+; CHECK-TF-NOREC: %{{.*}} = shufflevector <8 x float> %[[LOAD]], <8 x float> poison, <4 x i32>
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.021 = phi i64 [ %inc, %for.body ], [ 0, %entry ]
+  %mul = shl nuw nsw i64 %i.021, 1
+  %arrayidx = getelementptr inbounds float, float* %src, i64 %mul
+  %0 = load float, float* %arrayidx, align 4
+  %mul1 = mul nuw nsw i64 %i.021, 3
+  %arrayidx2 = getelementptr inbounds float, float* %dst, i64 %mul1
+  store float %0, float* %arrayidx2, align 4
+  %add = or i64 %mul, 1
+  %arrayidx4 = getelementptr inbounds float, float* %src, i64 %add
+  %1 = load float, float* %arrayidx4, align 4
+  %add6 = add nuw nsw i64 %mul1, 1
+  %arrayidx7 = getelementptr inbounds float, float* %dst, i64 %add6
+  store float %1, float* %arrayidx7, align 4
+  %add9 = add nuw nsw i64 %mul1, 2
+  %arrayidx10 = getelementptr inbounds float, float* %dst, i64 %add9
+  store float 3.000000e+00, float* %arrayidx10, align 4
+  %inc = add nuw nsw i64 %i.021, 1
+  %exitcond.not = icmp eq i64 %inc, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
 attributes #0 = { "target-features"="+sve" }
 
 !0 = distinct !{!0, !1, !2, !3, !4}