Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -386,6 +386,9 @@
   /// This is the recursive part of buildTree.
   void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
 
+  /// This is a recursive function to calculate the depth of a tree.
+  int getTreeDepth(ArrayRef<Value *> VL, unsigned Depth);
+
   /// Vectorize a single entry in the tree.
   Value *vectorizeTree(TreeEntry *E);
 
@@ -592,6 +595,202 @@
 }
 
+int BoUpSLP::getTreeDepth(ArrayRef<Value *> VL, unsigned Depth) {
+  if (Depth == RecursionMaxDepth)
+    return Depth;
+
+  // Don't handle vectors; return the current depth.
+  if (VL[0]->getType()->isVectorTy()) {
+    return Depth;
+  }
+
+  if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) {
+    if (SI->getValueOperand()->getType()->isVectorTy()) {
+      return Depth;
+    }
+  }
+
+  if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) ||
+      !getSameOpcode(VL)) {
+    return Depth;
+  }
+
+  Instruction *VL0 = cast<Instruction>(VL[0]);
+  unsigned Opcode = getSameOpcode(VL);
+
+  switch (Opcode) {
+  case Instruction::PHI: {
+    PHINode *PH = dyn_cast<PHINode>(VL0);
+
+    // Check for terminator values (e.g. invoke).
+    for (unsigned j = 0; j < VL.size(); ++j)
+      for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+        TerminatorInst *Term = dyn_cast<TerminatorInst>(
+            cast<PHINode>(VL[j])->getIncomingValueForBlock(
+                PH->getIncomingBlock(i)));
+        if (Term) {
+          return Depth;
+        }
+      }
+    unsigned maxDepth = Depth;
+    for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+      ValueList Operands;
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<PHINode>(VL[j])->getIncomingValueForBlock(
+            PH->getIncomingBlock(i)));
+
+      unsigned depth = getTreeDepth(Operands, Depth + 1);
+      if (depth > maxDepth)
+        maxDepth = depth;
+    }
+    return maxDepth;
+  }
+  case Instruction::ExtractElement: {
+    return Depth;
+  }
+  case Instruction::Load: {
+    return Depth;
+  }
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    Type *SrcTy = VL0->getOperand(0)->getType();
+    for (unsigned i = 0; i < VL.size(); ++i) {
+      Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
+      if (Ty != SrcTy || Ty->isAggregateType() || Ty->isVectorTy()) {
+        return Depth;
+      }
+    }
+    unsigned maxDepth = Depth;
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      ValueList Operands;
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+      unsigned depth = getTreeDepth(Operands, Depth + 1);
+      if (depth > maxDepth)
+        maxDepth = depth;
+    }
+    return maxDepth;
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    CmpInst::Predicate P0 = dyn_cast<CmpInst>(VL0)->getPredicate();
+    Type *ComparedTy = cast<Instruction>(VL[0])->getOperand(0)->getType();
+    for (unsigned i = 1, e = VL.size(); i < e; ++i) {
+      CmpInst *Cmp = cast<CmpInst>(VL[i]);
+      if (Cmp->getPredicate() != P0 ||
+          Cmp->getOperand(0)->getType() != ComparedTy) {
+        return Depth;
+      }
+    }
+    unsigned maxDepth = Depth;
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      ValueList Operands;
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+      unsigned depth = getTreeDepth(Operands, Depth + 1);
+      if (depth > maxDepth)
+        maxDepth = depth;
+    }
+    return maxDepth;
+  }
+  case Instruction::Select:
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor: {
+    unsigned maxDepth = Depth;
+    for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+      ValueList Operands;
+      for (unsigned j = 0; j < VL.size(); ++j)
+        Operands.push_back(cast<Instruction>(VL[j])->getOperand(i));
+
+      unsigned depth = getTreeDepth(Operands, Depth + 1);
+      if (depth > maxDepth)
+        maxDepth = depth;
+    }
+    return maxDepth;
+  }
+  case Instruction::Store: {
+    for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
+      if (!isConsecutiveAccess(VL[i], VL[i + 1])) {
+        return Depth;
+      }
+    unsigned maxDepth = Depth;
+    ValueList Operands;
+    for (unsigned j = 0; j < VL.size(); ++j)
+      Operands.push_back(cast<Instruction>(VL[j])->getOperand(0));
+
+    unsigned depth = getTreeDepth(Operands, Depth + 1);
+    if (depth > maxDepth)
+      maxDepth = depth;
+    return maxDepth;
+  }
+  case Instruction::Call: {
+    // Check if the calls are all to the same vectorizable intrinsic.
+    CallInst *CI = cast<CallInst>(VL[0]);
+    unsigned maxDepth = Depth;
+    // Check if this is an intrinsic call or something that can be
+    // represented by an intrinsic call; if not, we don't vectorize,
+    // so return the current depth.
+    Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
+    if (!isTriviallyVectorizable(ID)) {
+      return Depth;
+    }
+
+    Function *Int = CI->getCalledFunction();
+
+    for (unsigned i = 1, e = VL.size(); i != e; ++i) {
+      CallInst *CI2 = dyn_cast<CallInst>(VL[i]);
+      if (!CI2 || CI2->getCalledFunction() != Int ||
+          getIntrinsicIDForCall(CI2, TLI) != ID) {
+        return Depth;
+      }
+    }
+
+    for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+      ValueList Operands;
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        CallInst *CI2 = dyn_cast<CallInst>(VL[j]);
+        Operands.push_back(CI2->getArgOperand(i));
+      }
+      unsigned depth = getTreeDepth(Operands, Depth + 1);
+      if (depth > maxDepth)
+        maxDepth = depth;
+    }
+    return maxDepth;
+  }
+  default:
+    return Depth;
+  }
+}
+
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
   bool SameTy = getSameType(VL); (void)SameTy;
   assert(SameTy && "Invalid types!");
@@ -914,8 +1113,18 @@
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
         ValueList Left, Right;
         reorderInputsAccordingToOpcode(VL, Left, Right);
-        buildTree_rec(Left, Depth + 1);
-        buildTree_rec(Right, Depth + 1);
+        unsigned leftSubTreeDepth = getTreeDepth(Left, 1);
+        unsigned rightSubTreeDepth = getTreeDepth(Right, 1);
+
+        if (leftSubTreeDepth >= rightSubTreeDepth) {
+          buildTree_rec(Left, Depth + 1);
+          buildTree_rec(Right, Depth + 1);
+        } else {
+          buildTree_rec(Right, Depth + 1);
+          buildTree_rec(Left, Depth + 1);
+        }
         return;
       }
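
To make the ordering decision easy to review in isolation, here is a minimal stand-alone sketch of the heuristic. All names (Node, getDepth, build, buildOrdered, MaxDepth) are invented for this illustration and do not appear in the patch; the cap of 12 is an assumption standing in for the vectorizer's RecursionMaxDepth. Only the visit-the-deeper-subtree-first logic mirrors what buildTree_rec now does for commutative binary operators.

// Sketch only; compile with -std=c++14.
#include <algorithm>
#include <cstdio>
#include <memory>

static const unsigned MaxDepth = 12; // assumed recursion cap

struct Node {
  std::unique_ptr<Node> Left, Right; // null children = leaf (load/constant)
};

// Depth of the expression tree rooted at N, capped at MaxDepth.
static unsigned getDepth(const Node *N, unsigned Depth) {
  if (!N || Depth == MaxDepth)
    return Depth;
  return std::max(getDepth(N->Left.get(), Depth + 1),
                  getDepth(N->Right.get(), Depth + 1));
}

// Stand-in for buildTree_rec on one operand list.
static void build(const Node *N) {
  if (N)
    std::printf("visiting subtree %p\n", static_cast<const void *>(N));
}

// Recurse into the deeper subtree first, so the recursion budget is
// spent on the longer chain.
static void buildOrdered(const Node *N) {
  if (!N)
    return;
  if (getDepth(N->Left.get(), 1) >= getDepth(N->Right.get(), 1)) {
    build(N->Left.get());
    build(N->Right.get());
  } else {
    build(N->Right.get());
    build(N->Left.get());
  }
}

int main() {
  Node Root;
  Root.Right = std::make_unique<Node>();
  Root.Right->Right = std::make_unique<Node>(); // right chain is deeper
  buildOrdered(&Root);                          // visits Right first
}

Note that the heuristic only changes which operand list is handed to buildTree_rec first; the set of nodes visited is unchanged.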
Index: test/Transforms/SLPVectorizer/X86/pr19657.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/pr19657.ll
+++ test/Transforms/SLPVectorizer/X86/pr19657.ll
@@ -0,0 +1,73 @@
+; RUN: opt < %s -O1 -basicaa -slp-vectorizer -dce -S -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+;CHECK: load <2 x double>*
+;CHECK: fadd <2 x double>
+;CHECK: store <2 x double>
+
+; Function Attrs: nounwind uwtable
+define void @foo(double* %x) #0 {
+  %1 = alloca double*, align 8
+  store double* %x, double** %1, align 8
+  %2 = load double** %1, align 8
+  %3 = getelementptr inbounds double* %2, i64 0
+  %4 = load double* %3, align 8
+  %5 = load double** %1, align 8
+  %6 = getelementptr inbounds double* %5, i64 0
+  %7 = load double* %6, align 8
+  %8 = fadd double %4, %7
+  %9 = load double** %1, align 8
+  %10 = getelementptr inbounds double* %9, i64 0
+  %11 = load double* %10, align 8
+  %12 = fadd double %8, %11
+  %13 = load double** %1, align 8
+  %14 = getelementptr inbounds double* %13, i64 0
+  store double %12, double* %14, align 8
+  %15 = load double** %1, align 8
+  %16 = getelementptr inbounds double* %15, i64 1
+  %17 = load double* %16, align 8
+  %18 = load double** %1, align 8
+  %19 = getelementptr inbounds double* %18, i64 1
+  %20 = load double* %19, align 8
+  %21 = fadd double %17, %20
+  %22 = load double** %1, align 8
+  %23 = getelementptr inbounds double* %22, i64 1
+  %24 = load double* %23, align 8
+  %25 = fadd double %21, %24
+  %26 = load double** %1, align 8
+  %27 = getelementptr inbounds double* %26, i64 1
+  store double %25, double* %27, align 8
+  %28 = load double** %1, align 8
+  %29 = getelementptr inbounds double* %28, i64 2
+  %30 = load double* %29, align 8
+  %31 = load double** %1, align 8
+  %32 = getelementptr inbounds double* %31, i64 2
+  %33 = load double* %32, align 8
+  %34 = fadd double %30, %33
+  %35 = load double** %1, align 8
+  %36 = getelementptr inbounds double* %35, i64 2
+  %37 = load double* %36, align 8
+  %38 = fadd double %34, %37
+  %39 = load double** %1, align 8
+  %40 = getelementptr inbounds double* %39, i64 2
+  store double %38, double* %40, align 8
+  %41 = load double** %1, align 8
+  %42 = getelementptr inbounds double* %41, i64 3
+  %43 = load double* %42, align 8
+  %44 = load double** %1, align 8
+  %45 = getelementptr inbounds double* %44, i64 3
+  %46 = load double* %45, align 8
+  %47 = fadd double %43, %46
+  %48 = load double** %1, align 8
+  %49 = getelementptr inbounds double* %48, i64 3
+  %50 = load double* %49, align 8
+  %51 = fadd double %47, %50
+  %52 = load double** %1, align 8
+  %53 = getelementptr inbounds double* %52, i64 3
+  store double %51, double* %53, align 8
+  ret void
+}
+
+attributes #0 = { nounwind }
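
For reference, IR of this shape is what an unoptimized build emits for straight-line source of roughly the following form (a reconstruction for illustration only; the exact source attached to PR19657 may differ):

void foo(double *x) {
  x[0] = x[0] + x[0] + x[0];
  x[1] = x[1] + x[1] + x[1];
  x[2] = x[2] + x[2] + x[2];
  x[3] = x[3] + x[3] + x[3];
}

Each statement is a chain of two fadds feeding a store to a consecutive address, which is what the CHECK lines expect to come out as <2 x double> loads, fadds, and stores after vectorization.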