Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -430,6 +430,9 @@
   /// \return The width of the largest scalar or vector register type.
   unsigned getRegisterBitWidth(bool Vector) const;
 
+  /// \return The width of the smallest vector register type.
+  unsigned getMinVectorRegisterBitWidth() const;
+
   /// \return The size of a cache line in bytes.
   unsigned getCacheLineSize() const;
 
@@ -641,6 +644,7 @@
                             Type *Ty) = 0;
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) = 0;
+  virtual unsigned getMinVectorRegisterBitWidth() = 0;
   virtual unsigned getCacheLineSize() = 0;
   virtual unsigned getPrefetchDistance() = 0;
   virtual unsigned getMinPrefetchStride() = 0;
@@ -816,6 +820,9 @@
   unsigned getRegisterBitWidth(bool Vector) override {
     return Impl.getRegisterBitWidth(Vector);
   }
+  unsigned getMinVectorRegisterBitWidth() override {
+    return Impl.getMinVectorRegisterBitWidth();
+  }
   unsigned getCacheLineSize() override {
     return Impl.getCacheLineSize();
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -266,6 +266,8 @@
 
   unsigned getRegisterBitWidth(bool Vector) { return 32; }
 
+  unsigned getMinVectorRegisterBitWidth() { return 128; }
+
   unsigned getCacheLineSize() { return 0; }
 
   unsigned getPrefetchDistance() { return 0; }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -280,6 +280,8 @@
 
   unsigned getRegisterBitWidth(bool Vector) { return 32; }
 
+  unsigned getMinVectorRegisterBitWidth() { return 128; }
+
   unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
 
   unsigned getArithmeticInstrCost(
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -219,6 +219,10 @@
   return TTIImpl->getRegisterBitWidth(Vector);
 }
 
+unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const {
+  return TTIImpl->getMinVectorRegisterBitWidth();
+}
+
 unsigned TargetTransformInfo::getCacheLineSize() const {
   return TTIImpl->getCacheLineSize();
 }
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -95,6 +95,12 @@
     return 64;
   }
 
+  unsigned getMinVectorRegisterBitWidth() {
+    if (ST->hasNEON())
+      return 64;
+    return 0;
+  }
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3430,7 +3430,10 @@
   else
     MaxVecRegSize = TTI->getRegisterBitWidth(true);
 
-  MinVecRegSize = MinVectorRegSizeOption;
+  if (MinVectorRegSizeOption.getNumOccurrences())
+    MinVecRegSize = MinVectorRegSizeOption;
+  else
+    MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
 
   // Don't vectorize when the attribute NoImplicitFloat is used.
   if (F.hasFnAttribute(Attribute::NoImplicitFloat))
Index: test/Transforms/SLPVectorizer/AArch64/slp-vectorized-within-64bits.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/AArch64/slp-vectorized-within-64bits.ll
@@ -0,0 +1,28 @@
+;RUN: opt -slp-vectorizer -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK: @foo
+; CHECK: load <4 x i16>
+; CHECK: store <4 x i16>
+; CHECK: ret i32
+
+%struct.A = type { i16, i16 }
+%struct.B = type { i16, i16 }
+
+@array_A = global [10 x %struct.A] zeroinitializer, align 2
+@array_B = global [10 x %struct.B] zeroinitializer, align 2
+
+define i32 @foo() #0 {
+entry:
+  %0 = load i16, i16* getelementptr inbounds ([10 x %struct.A], [10 x %struct.A]* @array_A, i64 0, i64 0, i32 0), align 2
+  store i16 %0, i16* getelementptr inbounds ([10 x %struct.B], [10 x %struct.B]* @array_B, i64 0, i64 0, i32 0), align 2
+  %1 = load i16, i16* getelementptr inbounds ([10 x %struct.A], [10 x %struct.A]* @array_A, i64 0, i64 0, i32 1), align 2
+  store i16 %1, i16* getelementptr inbounds ([10 x %struct.B], [10 x %struct.B]* @array_B, i64 0, i64 0, i32 1), align 2
+  %2 = load i16, i16* getelementptr inbounds ([10 x %struct.A], [10 x %struct.A]* @array_A, i64 0, i64 1, i32 0), align 2
+  store i16 %2, i16* getelementptr inbounds ([10 x %struct.B], [10 x %struct.B]* @array_B, i64 0, i64 1, i32 0), align 2
+  %3 = load i16, i16* getelementptr inbounds ([10 x %struct.A], [10 x %struct.A]* @array_A, i64 0, i64 1, i32 1), align 2
+  store i16 %3, i16* getelementptr inbounds ([10 x %struct.B], [10 x %struct.B]* @array_B, i64 0, i64 1, i32 1), align 2
+  ret i32 0
+}