Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -526,6 +526,9 @@
   /// \return The width of the largest scalar or vector register type.
   unsigned getRegisterBitWidth(bool Vector) const;
 
+  /// \return The width of the smallest vector register type.
+  unsigned getMinVectorRegisterBitWidth() const;
+
   /// \return True if it should be considered for address type promotion.
   /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
   /// profitable without finding other extensions fed by the same input.
@@ -806,6 +809,7 @@
                                     Type *Ty) = 0;
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) = 0;
+  virtual unsigned getMinVectorRegisterBitWidth() = 0;
   virtual bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
   virtual unsigned getCacheLineSize() = 0;
@@ -1034,6 +1038,9 @@
   unsigned getRegisterBitWidth(bool Vector) override {
     return Impl.getRegisterBitWidth(Vector);
   }
+  unsigned getMinVectorRegisterBitWidth() override {
+    return Impl.getMinVectorRegisterBitWidth();
+  }
   bool shouldConsiderAddressTypePromotion(
       const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
     return Impl.shouldConsiderAddressTypePromotion(
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -303,6 +303,8 @@
 
   unsigned getRegisterBitWidth(bool Vector) { return 32; }
 
+  unsigned getMinVectorRegisterBitWidth() { return 128; }
+
   bool shouldConsiderAddressTypePromotion(const Instruction &I,
                                           bool &AllowPromotionWithoutCommonHeader) {
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -269,6 +269,10 @@
   return TTIImpl->getRegisterBitWidth(Vector);
 }
 
+unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const {
+  return TTIImpl->getMinVectorRegisterBitWidth();
+}
+
 bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
     const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
   return TTIImpl->shouldConsiderAddressTypePromotion(
Index: lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.h
+++ lib/Target/AArch64/AArch64Subtarget.h
@@ -83,6 +83,8 @@
   // NegativeImmediates - transform instructions with negative immediates
   bool NegativeImmediates = true;
 
+  unsigned MinVectorRegisterBitWidth = 128;
+
   bool UseAA = false;
   bool PredictableSelectIsExpensive = false;
   bool BalanceFPOps = false;
@@ -188,6 +190,10 @@
 
   bool isXRaySupported() const override { return true; }
 
+  unsigned getMinVectorRegisterBitWidth() const {
+    return MinVectorRegisterBitWidth;
+  }
+
   bool isX18Reserved() const { return ReserveX18; }
   bool hasFPARMv8() const { return HasFPARMv8; }
   bool hasNEON() const { return HasNEON; }
Index: lib/Target/AArch64/AArch64Subtarget.cpp
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.cpp
+++ lib/Target/AArch64/AArch64Subtarget.cpp
@@ -59,6 +59,8 @@
     PrefetchDistance = 280;
     MinPrefetchStride = 2048;
     MaxPrefetchIterationsAhead = 3;
+    // Enable 64-bit vectorization in SLP.
+    MinVectorRegisterBitWidth = 64;
     break;
   case CortexA57:
     MaxInterleaveFactor = 4;
Index: lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -84,6 +84,12 @@
     return 64;
   }
 
+  unsigned getMinVectorRegisterBitWidth() {
+    // FIXME: This should probably be enabled for any NEON subtarget, but it
+    // is currently only tuned for Cyclone.
+    return ST->getMinVectorRegisterBitWidth();
+  }
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -342,7 +342,10 @@
     else
       MaxVecRegSize = TTI->getRegisterBitWidth(true);
 
-    MinVecRegSize = MinVectorRegSizeOption;
+    if (MinVectorRegSizeOption.getNumOccurrences())
+      MinVecRegSize = MinVectorRegSizeOption;
+    else
+      MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
   }
 
   /// \brief Vectorize the tree that starts with the elements in \p VL.
Index: test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
@@ -0,0 +1,20 @@
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64-apple-ios -mcpu=cyclone < %s | FileCheck %s
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64--linux-gnu < %s | FileCheck --check-prefix=NO_SLP %s
+; RUN: opt -S -slp-vectorizer -mtriple=aarch64-apple-ios -mcpu=cyclone -slp-min-reg-size=128 < %s | FileCheck --check-prefix=NO_SLP %s
+
+define void @f(float* %r, float* %w) {
+  %r0 = getelementptr inbounds float, float* %r, i64 0
+  %r1 = getelementptr inbounds float, float* %r, i64 1
+  %f0 = load float, float* %r0
+  %f1 = load float, float* %r1
+  %add0 = fadd float %f0, %f0
+; CHECK: fadd <2 x float>
+; NO_SLP: fadd float
+; NO_SLP: fadd float
+  %add1 = fadd float %f1, %f1
+  %w0 = getelementptr inbounds float, float* %w, i64 0
+  %w1 = getelementptr inbounds float, float* %w, i64 1
+  store float %add0, float* %w0
+  store float %add1, float* %w1
+  ret void
+}
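
Note on how the SLPVectorizer.cpp hunk behaves: an explicit -slp-min-reg-size on the command line always takes priority, and only when the flag is absent does the vectorizer fall back to the new target hook. The standalone C++ sketch below models that selection logic under simplified assumptions; chooseMinVecRegSize and MinRegSizeFlag are hypothetical stand-ins for the cl::opt and the TTI query, not names from this patch.

// Minimal sketch (not part of the patch) of the MinVecRegSize selection
// added to the BoUpSLP constructor: an explicitly passed flag wins;
// otherwise defer to the target's minimum vector register width.
#include <optional>

unsigned chooseMinVecRegSize(std::optional<unsigned> MinRegSizeFlag,
                             unsigned TargetMinWidth) {
  if (MinRegSizeFlag) // stands in for MinVectorRegSizeOption.getNumOccurrences()
    return *MinRegSizeFlag;
  return TargetMinWidth; // stands in for TTI->getMinVectorRegisterBitWidth()
}

With the defaults in this patch, chooseMinVecRegSize(std::nullopt, 64) yields 64 on Cyclone, so a tree as narrow as <2 x float> becomes legal to form, while chooseMinVecRegSize(128, 64) mirrors the third RUN line above, where -slp-min-reg-size=128 suppresses the 64-bit vectorization again.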