Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -57,6 +57,10 @@
 ShouldVectorizeHor("slp-vectorize-hor", cl::init(false), cl::Hidden,
                    cl::desc("Attempt to vectorize horizontal reductions"));
 
+static cl::opt<bool>
+ShouldVectorizeLoadAgg("slp-vectorize-load-aggressive", cl::init(false), cl::Hidden,
+                       cl::desc("Attempt to vectorize load aggressively"));
+
 static cl::opt<bool> ShouldStartVectorizeHorAtStore(
     "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
     cl::desc(
@@ -386,6 +390,9 @@
   /// This is the recursive part of buildTree.
   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
 
+  /// This is a recursive function to calculate the max depth of a Tree
+  int getTreeDepth(ArrayRef<Value *> VL, unsigned Depth);
+
   /// Vectorize a single entry in the tree.
   Value *vectorizeTree(TreeEntry *E);
 
@@ -592,6 +599,40 @@
 
 }
 
+int BoUpSLP::getTreeDepth(ArrayRef<Value *> VL, unsigned Depth) {
+  if (Depth == RecursionMaxDepth)
+    return Depth;
+
+  // Don't handle vectors return the current depth
+  if (VL[0]->getType()->isVectorTy()) {
+    return Depth;
+  }
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) {
+    if (SI->getValueOperand()->getType()->isVectorTy()) {
+      return Depth;
+    }
+  }
+
+  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
+      !getSameOpcode(VL)) {
+    return Depth;
+  }
+
+  Instruction *VL0 = cast<Instruction>(VL[0]);
+  unsigned maxDepth = Depth;
+  for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+    ValueList Operands;
+    for (unsigned j = 0; j < VL.size(); ++j)
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+    unsigned depth = getTreeDepth(Operands, Depth+1);
+    if (depth > maxDepth)
+      maxDepth = depth;
+  }
+  return maxDepth;
+}
+
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
   bool SameTy = getSameType(VL); (void)SameTy;
   assert(SameTy && "Invalid types!");
@@ -914,8 +955,22 @@
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
         ValueList Left, Right;
         reorderInputsAccordingToOpcode(VL, Left, Right);
-        buildTree_rec(Left, Depth + 1);
-        buildTree_rec(Right, Depth + 1);
+        if (ShouldVectorizeLoadAgg) {
+          unsigned leftSubTreeDepth = getTreeDepth(Left, 1);
+          unsigned rightSubTreeDepth = getTreeDepth(Right, 1);
+          if (leftSubTreeDepth >= rightSubTreeDepth) {
+            buildTree_rec(Left, Depth + 1);
+            buildTree_rec(Right, Depth + 1);
+          }
+          else {
+            buildTree_rec(Right, Depth + 1);
+            buildTree_rec(Left, Depth + 1);
+          }
+        }
+        else {
+          buildTree_rec(Left, Depth + 1);
+          buildTree_rec(Right, Depth + 1);
+        }
         return;
       }
 
Index: test/Transforms/SLPVectorizer/X86/pr19657.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/pr19657.ll
+++ test/Transforms/SLPVectorizer/X86/pr19657.ll
@@ -0,0 +1,73 @@
+; RUN: opt < %s -O1 -basicaa -slp-vectorizer -slp-vectorize-load-aggressive=true -dce -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+;CHECK: load <2 x double>*
+;CHECK: fadd <2 x double>
+;CHECK: store <2 x double>
+
+; Function Attrs: nounwind uwtable
+define void @foo(double* %x) #0 {
+  %1 = alloca double*, align 8
+  store double* %x, double** %1, align 8
+  %2 = load double** %1, align 8
+  %3 = getelementptr inbounds double* %2, i64 0
+  %4 = load double* %3, align 8
+  %5 = load double** %1, align 8
+  %6 = getelementptr inbounds double* %5, i64 0
+  %7 = load double* %6, align 8
+  %8 = fadd double %4, %7
+  %9 = load double** %1, align 8
+  %10 = getelementptr inbounds double* %9, i64 0
+  %11 = load double* %10, align 8
+  %12 = fadd double %8, %11
+  %13 = load double** %1, align 8
+  %14 = getelementptr inbounds double* %13, i64 0
+  store double %12, double* %14, align 8
+  %15 = load double** %1, align 8
+  %16 = getelementptr inbounds double* %15, i64 1
+  %17 = load double* %16, align 8
+  %18 = load double** %1, align 8
+  %19 = getelementptr inbounds double* %18, i64 1
+  %20 = load double* %19, align 8
+  %21 = fadd double %17, %20
+  %22 = load double** %1, align 8
+  %23 = getelementptr inbounds double* %22, i64 1
+  %24 = load double* %23, align 8
+  %25 = fadd double %21, %24
+  %26 = load double** %1, align 8
+  %27 = getelementptr inbounds double* %26, i64 1
+  store double %25, double* %27, align 8
+  %28 = load double** %1, align 8
+  %29 = getelementptr inbounds double* %28, i64 2
+  %30 = load double* %29, align 8
+  %31 = load double** %1, align 8
+  %32 = getelementptr inbounds double* %31, i64 2
+  %33 = load double* %32, align 8
+  %34 = fadd double %30, %33
+  %35 = load double** %1, align 8
+  %36 = getelementptr inbounds double* %35, i64 2
+  %37 = load double* %36, align 8
+  %38 = fadd double %34, %37
+  %39 = load double** %1, align 8
+  %40 = getelementptr inbounds double* %39, i64 2
+  store double %38, double* %40, align 8
+  %41 = load double** %1, align 8
+  %42 = getelementptr inbounds double* %41, i64 3
+  %43 = load double* %42, align 8
+  %44 = load double** %1, align 8
+  %45 = getelementptr inbounds double* %44, i64 3
+  %46 = load double* %45, align 8
+  %47 = fadd double %43, %46
+  %48 = load double** %1, align 8
+  %49 = getelementptr inbounds double* %48, i64 3
+  %50 = load double* %49, align 8
+  %51 = fadd double %47, %50
+  %52 = load double** %1, align 8
+  %53 = getelementptr inbounds double* %52, i64 3
+  store double %51, double* %53, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }