Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -149,6 +149,20 @@ return true; } +static unsigned checkAddSubInst(ArrayRef VL) { + Instruction *I0 = dyn_cast(VL[0]); + unsigned Opcode = I0->getOpcode(); + if (Opcode != Instruction::FAdd) + return 0; + for (int i = 1, e = VL.size(); i < e; i++) { + Instruction *I = dyn_cast(VL[i]); + if (!I || + I->getOpcode() != ((i & 1) ? Instruction::FSub : Instruction::FAdd)) + return 0; + } + return Instruction::ShuffleVector; +} + /// \returns The opcode if all of the Instructions in \p VL have the same /// opcode, or zero. static unsigned getSameOpcode(ArrayRef VL) { @@ -158,8 +172,11 @@ unsigned Opcode = I0->getOpcode(); for (int i = 1, e = VL.size(); i < e; i++) { Instruction *I = dyn_cast(VL[i]); - if (!I || Opcode != I->getOpcode()) + if (!I || Opcode != I->getOpcode()) { + if (Opcode == Instruction::FAdd && i == 1) + return checkAddSubInst(VL); return 0; + } } return Opcode; } @@ -377,6 +394,7 @@ /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); + private: struct TreeEntry; @@ -471,7 +489,9 @@ if (Vectorized) { Last->LastScalarIndex = getLastIndex(VL); for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); + assert(!(getSameOpcode(VL) != Instruction::ShuffleVector && + ScalarToTreeEntry.count(VL[i])) && + "Scalar already in tree!"); ScalarToTreeEntry[VL[i]] = idx; } } else { @@ -615,10 +635,9 @@ newTreeEntry(VL, false); return; } - + unsigned Opcode = getSameOpcode(VL); // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || - !getSameOpcode(VL)) { + if (allConstant(VL) || isSplat(VL) || !getSameBlock(VL) || !Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); newTreeEntry(VL, false); return; @@ -732,7 +751,7 @@ // Check that every instructions appears once in this bundle. for (unsigned i = 0, e = VL.size(); i < e; ++i) for (unsigned j = i+1; j < e; ++j) - if (VL[i] == VL[j]) { + if ((VL[i] == VL[j]) && (Opcode != Instruction::ShuffleVector)) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); newTreeEntry(VL, false); return; @@ -754,8 +773,6 @@ DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - unsigned Opcode = getSameOpcode(VL); - // Check if it is safe to sink the loads or the stores. if (Opcode == Instruction::Load || Opcode == Instruction::Store) { Instruction *Last = getLastInstruction(VL); @@ -987,6 +1004,19 @@ } return; } + case Instruction::ShuffleVector: { + newTreeEntry(VL, true); + DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast(VL[j])->getOperand(i)); + + buildTree_rec(Operands, Depth + 1); + } + return; + } default: newTreeEntry(VL, false); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); @@ -1010,11 +1040,9 @@ } return getGatherCost(E->Scalars); } - - assert(getSameOpcode(VL) && getSameType(VL) && getSameBlock(VL) && - "Invalid VL"); + unsigned Opcode = getSameOpcode(VL); + assert(Opcode && getSameType(VL) && getSameBlock(VL) && "Invalid VL"); Instruction *VL0 = cast(VL[0]); - unsigned Opcode = VL0->getOpcode(); switch (Opcode) { case Instruction::PHI: { return 0; @@ -1158,6 +1186,33 @@ return VecCallCost - ScalarCallCost; } + case Instruction::ShuffleVector: { + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_AnyValue; + int ScalarCost = 0; + int VecCost = 0; + for (unsigned i = 0; i < VL.size(); ++i) { + Instruction *I = cast(VL[i]); + if (!I) + break; + ScalarCost += + TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); + } + // VecCost is equal to sum of the cost of creating 2 vectors + // and the cost of creating shuffle. + Instruction *I0 = cast(VL[0]); + VecCost = + TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); + Instruction *I1 = cast(VL[1]); + VecCost += + TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); + + VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + VecCost += TTI->getCastInstrCost(Opcode, VecTy, MaskTy); + return VecCost - ScalarCost; + } default: llvm_unreachable("Unknown instruction"); } @@ -1438,9 +1493,7 @@ setInsertPointAfterBundle(E->Scalars); return Gather(E->Scalars, VecTy); } - - unsigned Opcode = VL0->getOpcode(); - assert(Opcode == getSameOpcode(E->Scalars) && "Invalid opcode"); + unsigned Opcode = getSameOpcode(E->Scalars); switch (Opcode) { case Instruction::PHI: { @@ -1673,6 +1726,49 @@ E->VectorizedValue = V; return V; } + case Instruction::ShuffleVector: { + ValueList LHSVL, RHSVL; + for (int i = 0, e = E->Scalars.size(); i < e; ++i) { + LHSVL.push_back(cast(E->Scalars[i])->getOperand(0)); + RHSVL.push_back(cast(E->Scalars[i])->getOperand(1)); + } + setInsertPointAfterBundle(E->Scalars); + + Value *LHS = vectorizeTree(LHSVL); + Value *RHS = vectorizeTree(RHSVL); + + if (Value *V = alreadyVectorized(E->Scalars)) + return V; + + // Create a vector of LHS op1 RHS + BinaryOperator *BinOp0 = cast(VL0); + Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); + + // Create a vector of LHS op2 RHS + Instruction *VL1 = cast(E->Scalars[1]); + BinaryOperator *BinOp1 = cast(VL1); + Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); + + // Create appropriate shuffle to take alternative operations from + // the vector. + std::vector Mask(E->Scalars.size()); + unsigned e = E->Scalars.size(); + for (unsigned i = 0; i < e; ++i) { + if (i & 1) + Mask[i] = Builder.getInt32(e + i); + else + Mask[i] = Builder.getInt32(i); + } + + Value *ShuffleMask = ConstantVector::get(Mask); + + Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); + E->VectorizedValue = V; + if (Instruction *I = dyn_cast(V)) + return propagateMetadata(I, E->Scalars); + + return V; + } default: llvm_unreachable("unknown inst"); } @@ -1737,11 +1833,15 @@ // For each vectorized value: for (int EIdx = 0, EE = VectorizableTree.size(); EIdx < EE; ++EIdx) { TreeEntry *Entry = &VectorizableTree[EIdx]; + SmallDenseMap ScalarEntry; // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - + // If this Scalar was already handled then continue + if (ScalarEntry.count(Scalar)) + continue; + ScalarEntry[Scalar] = Lane; // No need to handle users of gathered values. if (Entry->NeedToGather) continue; @@ -1925,7 +2025,6 @@ for (po_iterator it = po_begin(&F.getEntryBlock()), e = po_end(&F.getEntryBlock()); it != e; ++it) { BasicBlock *BB = *it; - // Vectorize trees that end at stores. if (unsigned count = collectStores(BB, R)) { (void)count; Index: test/Transforms/SLPVectorizer/X86/addsub.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/addsub.ll +++ test/Transforms/SLPVectorizer/X86/addsub.ll @@ -0,0 +1,43 @@ +; RUN: opt < %s -basicaa -slp-vectorizer -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@a = common global [4 x float] zeroinitializer, align 16 +@b = common global [4 x float] zeroinitializer, align 16 +@c = common global [4 x float] zeroinitializer, align 16 + +; CHECK-LABEL: @addsub +; CHECK: fadd <4 x float> +; CHECK: fsub <4 x float> +; CHECK: shufflevector +; Function Attrs: nounwind uwtable +define void @addsub() #0 { + %1 = load float* getelementptr inbounds ([4 x float]* @a, i64 0, i64 0), align 16, !tbaa !1 + %2 = load float* getelementptr inbounds ([4 x float]* @b, i64 0, i64 0), align 16, !tbaa !1 + %3 = fadd float %1, %2 + store float %3, float* getelementptr inbounds ([4 x float]* @c, i64 0, i64 0), align 16, !tbaa !1 + %4 = load float* getelementptr inbounds ([4 x float]* @a, i64 0, i64 1), align 4, !tbaa !1 + %5 = load float* getelementptr inbounds ([4 x float]* @b, i64 0, i64 1), align 4, !tbaa !1 + %6 = fsub float %4, %5 + store float %6, float* getelementptr inbounds ([4 x float]* @c, i64 0, i64 1), align 4, !tbaa !1 + %7 = load float* getelementptr inbounds ([4 x float]* @a, i64 0, i64 2), align 8, !tbaa !1 + %8 = load float* getelementptr inbounds ([4 x float]* @b, i64 0, i64 2), align 8, !tbaa !1 + %9 = fadd float %7, %8 + store float %9, float* getelementptr inbounds ([4 x float]* @c, i64 0, i64 2), align 8, !tbaa !1 + %10 = load float* getelementptr inbounds ([4 x float]* @a, i64 0, i64 3), align 4, !tbaa !1 + %11 = load float* getelementptr inbounds ([4 x float]* @b, i64 0, i64 3), align 4, !tbaa !1 + %12 = fsub float %10, %11 + store float %12, float* getelementptr inbounds ([4 x float]* @c, i64 0, i64 3), align 4, !tbaa !1 + ret void +} + +attributes #0 = { nounwind } + +!llvm.ident = !{!0} + +!0 = metadata !{metadata !"clang version 3.5.0 (209794)"} +!1 = metadata !{metadata !2, metadata !2, i64 0} +!2 = metadata !{metadata !"float", metadata !3, i64 0} +!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0} +!4 = metadata !{metadata !"Simple C/C++ TBAA"}