Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -934,7 +934,7 @@ /// creating vectors that span multiple vector registers. /// If false, the vectorization factor will be chosen based on the /// size of the widest element type. - bool shouldMaximizeVectorBandwidth() const; + bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const; /// \return The minimum vectorization factor for types of given element /// bit width, or 0 if there is no minimum VF. The returned value only @@ -1618,7 +1618,8 @@ virtual unsigned getMinVectorRegisterBitWidth() const = 0; virtual Optional<unsigned> getMaxVScale() const = 0; virtual Optional<unsigned> getVScaleForTuning() const = 0; - virtual bool shouldMaximizeVectorBandwidth() const = 0; + virtual bool + shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const = 0; virtual ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const = 0; virtual unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const = 0; @@ -2108,8 +2109,9 @@ Optional<unsigned> getVScaleForTuning() const override { return Impl.getVScaleForTuning(); } - bool shouldMaximizeVectorBandwidth() const override { - return Impl.shouldMaximizeVectorBandwidth(); + bool shouldMaximizeVectorBandwidth( + TargetTransformInfo::RegisterKind K) const override { + return Impl.shouldMaximizeVectorBandwidth(K); } ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const override { Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -416,7 +416,10 @@ Optional<unsigned> getMaxVScale() const { return None; } Optional<unsigned> getVScaleForTuning() const { return None; } - bool 
shouldMaximizeVectorBandwidth() const { return false; } + bool + shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const { + return false; + } ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const { return ElementCount::get(0, IsScalable); Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -623,8 +623,9 @@ return TTIImpl->getVScaleForTuning(); } -bool TargetTransformInfo::shouldMaximizeVectorBandwidth() const { - return TTIImpl->shouldMaximizeVectorBandwidth(); +bool TargetTransformInfo::shouldMaximizeVectorBandwidth( + TargetTransformInfo::RegisterKind K) const { + return TTIImpl->shouldMaximizeVectorBandwidth(K); } ElementCount TargetTransformInfo::getMinimumVF(unsigned ElemWidth, Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -135,6 +135,8 @@ return ST->getVScaleForTuning(); } + bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const; + /// Try to return an estimate cost factor that can be used as a multiplier /// when scalarizing an operation for a vector with ElementCount \p VF. 
/// For scalable vectors this currently takes the most pessimistic view based Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -36,6 +36,11 @@ static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden); +static cl::opt<bool> AArch64MaximizeBandwidth( + "aarch64-vectorizer-maximize-bandwidth", cl::init(true), cl::Hidden, + cl::desc("Maximize bandwidth when selecting vectorization factor which " + "will be determined by the smallest type in loop.")); + bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); @@ -50,6 +55,15 @@ return (CallerBits & CalleeBits) == CalleeBits; } +bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( + TargetTransformInfo::RegisterKind K) const { + if (!AArch64MaximizeBandwidth) + return false; + if (K == TargetTransformInfo::RGK_ScalableVector) + return false; + return true; +} + /// Calculate the cost of materializing a 64-bit value. This helper /// method might only calculate a fraction of a larger immediate. Therefore it /// is valid to return a cost of ZERO. 
Index: llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h =================================================================== --- llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -86,12 +86,11 @@ unsigned getMinVectorRegisterBitWidth() const; ElementCount getMinimumVF(unsigned ElemWidth, bool IsScalable) const; - bool shouldMaximizeVectorBandwidth() const { + bool + shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const { return true; } - bool supportsEfficientVectorElementLoadStore() { - return false; - } + bool supportsEfficientVectorElementLoadStore() { return false; } bool hasBranchDivergence() { return false; } Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5586,8 +5586,11 @@ return ElementCount::getFixed(ClampedConstTripCount); } + TargetTransformInfo::RegisterKind K = + ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector + : TargetTransformInfo::RGK_FixedWidthVector; ElementCount MaxVF = MaxVectorElementCount; - if (TTI.shouldMaximizeVectorBandwidth() || + if (TTI.shouldMaximizeVectorBandwidth(K) || (MaximizeBandwidth && isScalarEpilogueAllowed())) { auto MaxVectorElementCountMaxBW = ElementCount::get( PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType), Index: llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll @@ -4,11 +4,12 @@ ; are not profitable. ; Test with a loop that contains memory accesses of i8 and i32 types. The -; default maximum VF for NEON is 4. 
And while we don't have an instruction to -; load 4 x i8, vectorization might still be profitable. +; maximum VF for NEON is calculated by 128/size of smallest type in loop. +; And while we don't have an instruction to load 4 x i8, vectorization +; might still be profitable. define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) { ; CHECK-LABEL: @test_load_i8_store_i32( -; CHECK: <4 x i8> +; CHECK: <16 x i8> ; entry: br label %loop @@ -32,7 +33,7 @@ ; Same as test_load_i8_store_i32, but with types flipped for load and store. define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) { ; CHECK-LABEL: @test_load_i32_store_i8( -; CHECK: <4 x i8> +; CHECK: <16 x i8> ; entry: br label %loop @@ -84,7 +85,7 @@ ; vectorization factor. define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) { ; CHECK-LABEL: @test_load_i8_store_i64_large -; CHECK: <2 x i64> +; CHECK: <8 x i64> ; entry: br label %loop Index: llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll @@ -116,9 +116,9 @@ } ; CHECK-LABEL: @add_d( -; CHECK: load <4 x i16> -; CHECK: add nsw <4 x i32> -; CHECK: store <4 x i32> +; CHECK: load <8 x i16> +; CHECK: add nsw <8 x i32> +; CHECK: store <8 x i32> define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 { entry: %cmp7 = icmp sgt i32 %len, 0 Index: llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll +++ 
llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll @@ -123,16 +123,16 @@ ; } ; ; CHECK: vector.body: -; CHECK: phi <8 x i16> -; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <8 x i8> -; CHECK: zext <8 x i8> [[Ld1]] to <8 x i16> -; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <8 x i8> -; CHECK: zext <8 x i8> [[Ld2]] to <8 x i16> -; CHECK: add <8 x i16> -; CHECK: add <8 x i16> +; CHECK: phi <16 x i16> +; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8> +; CHECK: zext <16 x i8> [[Ld1]] to <16 x i16> +; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8> +; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16> +; CHECK: add <16 x i16> +; CHECK: add <16 x i16> ; ; CHECK: middle.block: -; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> +; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> ; CHECK: zext i16 [[Rdx]] to i32 ; define i16 @reduction_i16_2(i8* nocapture readonly %a, i8* nocapture readonly %b, i32 %n) { Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll @@ -29,7 +29,7 @@ ; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1). 
; VF-4: <4 x i32> -; VF-VSCALE4: <vscale x 4 x i32> +; VF-VSCALE4: <16 x i32> define void @test0(i32* %a, i8* %b, i32* %c) #0 { entry: br label %loop Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll @@ -9,9 +9,9 @@ define void @test0(i32* %a, i8* %b, i32* %c) #0 { ; CHECK: LV: Checking a loop in "test0" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 -; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4 +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16 ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: vscale x 16 entry: @@ -40,9 +40,9 @@ define void @test1(i32* %a, i8* %b) #0 { ; CHECK: LV: Checking a loop in "test1" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 -; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4 +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4 ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 entry: @@ -72,9 +72,9 @@ define void @test2(i32* %a, i8* %b) #0 { ; CHECK: LV: Checking a loop in "test2" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2 -; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 2 +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF 
= vscale x 2 ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 entry: @@ -104,9 +104,9 @@ define void @test3(i32* %a, i8* %b) #0 { ; CHECK: LV: Checking a loop in "test3" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1 -; CHECK_SCALABLE_ON: LV: Selecting VF: 4 +; CHECK_SCALABLE_ON: LV: Selecting VF: 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF -; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1 ; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 entry: Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-illegal-type.ll @@ -83,11 +83,11 @@ define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) { ; CHECK-LABEL: @uniform_store_i1 ; CHECK: vector.body -; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <2 x i64*> {{.*}}, i64 1 -; CHECK: %[[ICMP:.*]] = icmp eq <2 x i64*> %[[GEP]], %[[SPLAT:.*]] -; CHECK: %[[EXTRACT1:.*]] = extractelement <2 x i1> %[[ICMP]], i32 0 +; CHECK: %[[GEP:.*]] = getelementptr inbounds i64, <64 x i64*> {{.*}}, i64 1 +; CHECK: %[[ICMP:.*]] = icmp eq <64 x i64*> %[[GEP]], %[[SPLAT:.*]] +; CHECK: %[[EXTRACT1:.*]] = extractelement <64 x i1> %[[ICMP]], i32 0 ; CHECK: store i1 %[[EXTRACT1]], i1* %dst -; CHECK: %[[EXTRACT2:.*]] = extractelement <2 x i1> %[[ICMP]], i32 1 +; CHECK: %[[EXTRACT2:.*]] = extractelement <64 x i1> %[[ICMP]], i32 1 ; CHECK: store i1 %[[EXTRACT2]], i1* %dst ; CHECK-NOT: vscale entry: