Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -531,6 +531,9 @@ /// \return The width of the largest scalar or vector register type. unsigned getRegisterBitWidth(bool Vector) const; + /// \return The width of the smallest vector register type. + unsigned getMinVectorRegisterBitWidth() const; + /// \return True if it should be considered for address type promotion. /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is /// profitable without finding other extensions fed by the same input. @@ -816,6 +819,7 @@ Type *Ty) = 0; virtual unsigned getNumberOfRegisters(bool Vector) = 0; virtual unsigned getRegisterBitWidth(bool Vector) = 0; + virtual unsigned getMinVectorRegisterBitWidth() = 0; virtual bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0; virtual unsigned getCacheLineSize() = 0; @@ -1049,6 +1053,9 @@ unsigned getRegisterBitWidth(bool Vector) override { return Impl.getRegisterBitWidth(Vector); } + unsigned getMinVectorRegisterBitWidth() override { + return Impl.getMinVectorRegisterBitWidth(); + } bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override { return Impl.shouldConsiderAddressTypePromotion( Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -305,6 +305,8 @@ unsigned getRegisterBitWidth(bool Vector) { return 32; } + unsigned getMinVectorRegisterBitWidth() { return 128; } + bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -273,6 +273,10 @@ return TTIImpl->getRegisterBitWidth(Vector); } +unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const { + return TTIImpl->getMinVectorRegisterBitWidth(); +} + bool TargetTransformInfo::shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const { return TTIImpl->shouldConsiderAddressTypePromotion( Index: lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- lib/Target/AArch64/AArch64Subtarget.h +++ lib/Target/AArch64/AArch64Subtarget.h @@ -83,6 +83,9 @@ // NegativeImmediates - transform instructions with negative immediates bool NegativeImmediates = true; + // Enable 64-bit vectorization in SLP. + unsigned MinVectorRegisterBitWidth = 64; + bool UseAA = false; bool PredictableSelectIsExpensive = false; bool BalanceFPOps = false; @@ -188,6 +191,10 @@ bool isXRaySupported() const override { return true; } + unsigned getMinVectorRegisterBitWidth() const { + return MinVectorRegisterBitWidth; + } + bool isX18Reserved() const { return ReserveX18; } bool hasFPARMv8() const { return HasFPARMv8; } bool hasNEON() const { return HasNEON; } Index: lib/Target/AArch64/AArch64Subtarget.cpp =================================================================== --- lib/Target/AArch64/AArch64Subtarget.cpp +++ lib/Target/AArch64/AArch64Subtarget.cpp @@ -72,6 +72,8 @@ case Falkor: MaxInterleaveFactor = 4; VectorInsertExtractBaseCost = 2; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case Kryo: MaxInterleaveFactor = 4; @@ -80,6 +82,8 @@ PrefetchDistance = 740; MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 11; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case ThunderX2T99: CacheLineSize = 64; @@ -89,6 +93,8 @@ PrefetchDistance = 128; MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 4; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case ThunderX: case ThunderXT88: @@ -97,6 +103,8 @@ CacheLineSize = 128; PrefFunctionAlignment = 3; PrefLoopAlignment = 2; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case CortexA35: break; case CortexA53: break; Index: lib/Target/AArch64/AArch64TargetTransformInfo.h =================================================================== --- lib/Target/AArch64/AArch64TargetTransformInfo.h +++ lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -84,6 +84,10 @@ return 64; } + unsigned getMinVectorRegisterBitWidth() { + return ST->getMinVectorRegisterBitWidth(); + } + unsigned getMaxInterleaveFactor(unsigned VF); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -331,7 +331,10 @@ else MaxVecRegSize = TTI->getRegisterBitWidth(true); - MinVecRegSize = MinVectorRegSizeOption; + if (MinVectorRegSizeOption.getNumOccurrences()) + MinVecRegSize = MinVectorRegSizeOption; + else + MinVecRegSize = TTI->getMinVectorRegisterBitWidth(); } /// \brief Vectorize the tree that starts with the elements in \p VL. Index: test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll @@ -0,0 +1,22 @@ +; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic < %s | FileCheck %s +; RUN: opt -S -slp-vectorizer -mtriple=aarch64-apple-ios -mcpu=cyclone < %s | FileCheck %s +; Currently disabled for a few subtargets (e.g. Kryo): +; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=kryo < %s | FileCheck --check-prefix=NO_SLP %s +; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu -mcpu=generic -slp-min-reg-size=128 < %s | FileCheck --check-prefix=NO_SLP %s + +define void @f(float* %r, float* %w) { + %r0 = getelementptr inbounds float, float* %r, i64 0 + %r1 = getelementptr inbounds float, float* %r, i64 1 + %f0 = load float, float* %r0 + %f1 = load float, float* %r1 + %add0 = fadd float %f0, %f0 +; CHECK: fadd <2 x float> +; NO_SLP: fadd float +; NO_SLP: fadd float + %add1 = fadd float %f1, %f1 + %w0 = getelementptr inbounds float, float* %w, i64 0 + %w1 = getelementptr inbounds float, float* %w, i64 1 + store float %add0, float* %w0 + store float %add1, float* %w1 + ret void +}