Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3419,6 +3419,18 @@
   else
     MaxVecRegSize = TTI->getRegisterBitWidth(true);
 
+  // If the target is AArch64, MinVecRegSize can be 64 bits (vectorizing
+  // into a 64-bit D register) rather than 128 bits (vectorizing into a
+  // 128-bit Q register). This exposes more vectorization opportunities:
+  // <2 x 32-bit>, <4 x 16-bit>, and <8 x 8-bit> vector types all fit in
+  // a 64-bit register.
+  llvm::Triple TargetTriple(F.getParent()->getTargetTriple());
+  bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64 ||
+                   TargetTriple.getArch() == llvm::Triple::aarch64_be;
+
+  if (IsAArch64)
+    MinVectorRegSizeOption = 64;
+
   MinVecRegSize = MinVectorRegSizeOption;
 
   // Don't vectorize when the attribute NoImplicitFloat is used.
@@ -3499,7 +3511,7 @@
   /// \returns true if a value was vectorized.
   bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                           ArrayRef<Value *> BuildVector = None,
-                          bool allowReorder = false);
+                          bool allowReorder = false, unsigned VecRegSize = 128);
 
   /// \brief Try to vectorize a chain that may start at the operands of \p V.
   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
@@ -3707,12 +3719,19 @@
   if (!A || !B)
     return false;
   Value *VL[] = { A, B };
-  return tryToVectorizeList(VL, R, None, true);
+  bool SuccessToVectorizeList = false;
+  for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+    if (tryToVectorizeList(VL, R, None, true, Size)) {
+      SuccessToVectorizeList = true;
+      break;
+    }
+  }
+  return SuccessToVectorizeList;
 }
 
 bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                        ArrayRef<Value *> BuildVector,
-                                       bool allowReorder) {
+                                       bool allowReorder, unsigned VecRegSize) {
   if (VL.size() < 2)
     return false;
 
@@ -3728,7 +3747,7 @@
   // FIXME: Register size should be a parameter to this function, so we can
   // try different vectorization factors.
   unsigned Sz = R.getVectorElementSize(I0);
-  unsigned VF = MinVecRegSize / Sz;
+  unsigned VF = VecRegSize / Sz;
 
   for (Value *V : VL) {
     Type *Ty = V->getType();
@@ -3936,7 +3955,8 @@
       MinVecRegSize(MinVecRegSize) {}
 
   /// \brief Try to find a reduction tree.
-  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B) {
+  bool matchAssociativeReduction(PHINode *Phi, BinaryOperator *B,
+                                 unsigned VecRegSize) {
     assert((!Phi ||
             std::find(Phi->op_begin(), Phi->op_end(), B) != Phi->op_end()) &&
            "Thi phi needs to use the binary operator");
@@ -3966,7 +3986,7 @@
     ReducedValueOpcode = 0;
     // FIXME: Register size should be a parameter to this function, so we can
     // try different vectorization factors.
-    ReduxWidth = MinVecRegSize / DL.getTypeSizeInBits(Ty);
+    ReduxWidth = VecRegSize / DL.getTypeSizeInBits(Ty);
     ReductionRoot = B;
     ReductionPHI = Phi;
 
@@ -4267,7 +4287,7 @@
     return false;
 
   HorizontalReduction HorRdx(MinRegSize);
-  if (!HorRdx.matchAssociativeReduction(P, BI))
+  if (!HorRdx.matchAssociativeReduction(P, BI, MinRegSize))
     return false;
 
   // If there is a sufficient number of reduction values, reduce
@@ -4318,7 +4338,15 @@
       // Try to vectorize them.
       unsigned NumElts = (SameTypeIt - IncIt);
       DEBUG(errs() << "SLP: Trying to vectorize starting at PHIs (" << NumElts << ")\n");
-      if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R)) {
+      bool SuccessToVectorizeList = false;
+      for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+        if (tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, None, false,
+                               Size)) {
+          SuccessToVectorizeList = true;
+          break;
+        }
+      }
+      if (NumElts > 1 && SuccessToVectorizeList) {
         // Success start over because instructions might have been changed.
         HaveVectorizedPhiNodes = true;
         Changed = true;
@@ -4354,7 +4382,14 @@
         continue;
 
       // Try to match and vectorize a horizontal reduction.
-      if (canMatchHorizontalReduction(P, BI, R, TTI, MinVecRegSize)) {
+      bool SuccessToMatchHorizontalReduction = false;
+      for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+        if (canMatchHorizontalReduction(P, BI, R, TTI, Size)) {
+          SuccessToMatchHorizontalReduction = true;
+          break;
+        }
+      }
+      if (SuccessToMatchHorizontalReduction) {
         Changed = true;
         it = BB->begin();
         e = BB->end();
@@ -4381,9 +4416,15 @@
     if (StoreInst *SI = dyn_cast<StoreInst>(it))
       if (BinaryOperator *BinOp =
              dyn_cast<BinaryOperator>(SI->getValueOperand())) {
-        if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI,
-                                        MinVecRegSize) ||
-            tryToVectorize(BinOp, R)) {
+        bool SuccessToMatchHorizontalReduction = false;
+        for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize;
+             Size /= 2) {
+          if (canMatchHorizontalReduction(nullptr, BinOp, R, TTI, Size)) {
+            SuccessToMatchHorizontalReduction = true;
+            break;
+          }
+        }
+        if (SuccessToMatchHorizontalReduction || tryToVectorize(BinOp, R)) {
           Changed = true;
           it = BB->begin();
           e = BB->end();
@@ -4442,7 +4483,14 @@
       // Vectorize starting with the build vector operands ignoring the
       // BuildVector instructions for the purpose of scheduling and user
       // extraction.
-      if (tryToVectorizeList(BuildVectorOpds, R, BuildVector)) {
+      bool SuccessToVectorizeList = false;
+      for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+        if (tryToVectorizeList(BuildVectorOpds, R, BuildVector, false, Size)) {
+          SuccessToVectorizeList = true;
+          break;
+        }
+      }
+      if (SuccessToVectorizeList) {
        Changed = true;
        it = BB->begin();
        e = BB->end();
@@ -4532,7 +4580,14 @@
      // performed in parallel. It's likely that detecting this pattern in a
      // bottom-up phase will be simpler and less costly than building a
      // full-blown top-down phase beginning at the consecutive loads.
-      Changed |= tryToVectorizeList(Bundle, R);
+      bool SuccessToVectorizeList = false;
+      for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2) {
+        if (tryToVectorizeList(Bundle, R, None, false, Size)) {
+          SuccessToVectorizeList = true;
+          break;
+        }
+      }
+      Changed |= SuccessToVectorizeList;
     }
   }
   return Changed;
Index: test/Transforms/SLPVectorizer/AArch64/slp-vectorized-within-64bits.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/AArch64/slp-vectorized-within-64bits.ll
@@ -0,0 +1,28 @@
+; RUN: opt -slp-vectorizer -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK-LABEL: @foo
+; CHECK: load <4 x i16>
+; CHECK: store <4 x i16>
+; CHECK: ret i32
+
+%struct.A = type { i16, i16 }
+%struct.B = type { i16, i16 }
+
+@array_A = global [10 x %struct.A] zeroinitializer, align 2
+@array_B = global [10 x %struct.B] zeroinitializer, align 2
+
+define i32 @foo() {
+entry:
+  %0 = load i16, i16* getelementptr inbounds ([10 x %struct.A], [10 x %struct.A]* @array_A, i64 0, i64 0, i32 0), align 2
+  store i16 %0, i16* getelementptr inbounds ([10 x %struct.B], [10 x %struct.B]* @array_B, i64 0, i64 0, i32 0), align 2
+  %1 = load i16, i16* getelementptr inbounds ([10 x %struct.A], [10 x %struct.A]* @array_A, i64 0, i64 0, i32 1), align 2
+  store i16 %1, i16* getelementptr inbounds ([10 x %struct.B], [10 x %struct.B]* @array_B, i64 0, i64 0, i32 1), align 2
+  %2 = load i16, i16* getelementptr inbounds ([10 x %struct.A], [10 x %struct.A]* @array_A, i64 0, i64 1, i32 0), align 2
+  store i16 %2, i16* getelementptr inbounds ([10 x %struct.B], [10 x %struct.B]* @array_B, i64 0, i64 1, i32 0), align 2
+  %3 = load i16, i16* getelementptr inbounds ([10 x %struct.A], [10 x %struct.A]* @array_A, i64 0, i64 1, i32 1), align 2
+  store i16 %3, i16* getelementptr inbounds ([10 x %struct.B], [10 x %struct.B]* @array_B, i64 0, i64 1, i32 1), align 2
+  ret i32 0
+}
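
Note on the repeated retry loop: the descending register-size loop
(for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2))
is now duplicated at six call sites. A possible follow-up cleanup is to
factor it into one helper. The sketch below is illustrative only: the
helper name tryForEachVecRegSize and its llvm::function_ref-based
signature are assumptions of this note, not part of the patch. It assumes
MinVecRegSize >= 1 so the halving loop terminates.

  #include "llvm/ADT/STLExtras.h" // for llvm::function_ref

  // Hypothetical helper (not in this patch): invoke a vectorization
  // callback at each candidate register width, halving from MaxVecRegSize
  // down to MinVecRegSize, and stop at the first width that succeeds.
  // With MaxVecRegSize = 128 and MinVecRegSize = 64 it tries 128, then 64.
  static bool tryForEachVecRegSize(unsigned MaxVecRegSize,
                                   unsigned MinVecRegSize,
                                   llvm::function_ref<bool(unsigned)> Try) {
    for (unsigned Size = MaxVecRegSize; Size >= MinVecRegSize; Size /= 2)
      if (Try(Size))
        return true;
    return false;
  }

  // Example call site, replacing one of the duplicated loops:
  //   Changed |= tryForEachVecRegSize(MaxVecRegSize, MinVecRegSize,
  //       [&](unsigned Size) {
  //         return tryToVectorizeList(Bundle, R, None, false, Size);
  //       });

This would keep the width-selection policy in one place, so a later change
(for example, trying widths other than powers of two) touches only the helper.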