Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -655,7 +655,7 @@ /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - int getGatherCost(Type *Ty); + int getGatherCost(Type *Ty, const DenseSet &ShuffledIndicies); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -689,8 +689,12 @@ /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef VL) const { - assert(VL.size() == Scalars.size() && "Invalid size"); - return std::equal(VL.begin(), VL.end(), Scalars.begin()); + if (VL.size() == Scalars.size()) + return std::equal(VL.begin(), VL.end(), Scalars.begin()); + assert(VL.size() == ReuseShuffleIndicies.size() && "Invalid size"); + return std::equal( + VL.begin(), VL.end(), ReuseShuffleIndicies.begin(), + [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; }); } /// A vector of scalars. @@ -702,6 +706,9 @@ /// Do we need to gather this sequence ? bool NeedToGather = false; + /// Does this sequence require some shuffling? + SmallVector ReuseShuffleIndicies; + /// Points back to the VectorizableTree. /// /// Only used for Graphviz right now. Unfortunately GraphTrait::NodeRef has @@ -716,13 +723,16 @@ }; /// Create a new VectorizableTree entry. - TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized, - int &UserTreeIdx) { + TreeEntry * + newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, + ArrayRef ReuseShuffleIndicies = None) { VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; + Last->ReuseShuffleIndicies.append(ReuseShuffleIndicies.begin(), + ReuseShuffleIndicies.end()); if (Vectorized) { for (int i = 0, e = VL.size(); i != e; ++i) { assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); @@ -1471,13 +1481,27 @@ } // Check that every instruction appears once in this bundle. - for (unsigned i = 0, e = VL.size(); i < e; ++i) - for (unsigned j = i + 1; j < e; ++j) - if (VL[i] == VL[j]) { - DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); - return; - } + SmallVector ReuseShuffleIndicies; + SmallVector UniqueValues; + DenseMap UniquePositions; + for (Value *V : VL) { + auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + ReuseShuffleIndicies.emplace_back(Res.first->second); + if (!Res.second) + continue; + UniqueValues.emplace_back(V); + } + if (UniqueValues.size() == VL.size()) { + ReuseShuffleIndicies.clear(); + } else { + DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); + if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { + DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + newTreeEntry(VL, false, UserTreeIdx); + return; + } + VL = UniqueValues; + } auto &BSRef = BlocksSchedules[BB]; if (!BSRef) @@ -1485,12 +1509,12 @@ BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, S.OpValue)) { + if (!BS.tryScheduleBundle(VL, this, VL0)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1509,12 +1533,12 @@ if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1532,11 +1556,11 @@ case Instruction::ExtractElement: { bool Reuse = canReuseExtract(VL, VL0); if (Reuse) { - DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); + DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); } else { BS.cancelScheduling(VL, VL0); } - newTreeEntry(VL, Reuse, UserTreeIdx); + newTreeEntry(VL, Reuse, UserTreeIdx, ReuseShuffleIndicies); return; } case Instruction::Load: { @@ -1551,7 +1575,7 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1562,7 +1586,7 @@ LoadInst *L = cast(VL[i]); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1584,7 +1608,7 @@ if (Consecutive) { ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1599,7 +1623,7 @@ } BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); if (ReverseConsecutive) { ++NumLoadsWantToChangeOrder; @@ -1626,12 +1650,12 @@ Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1654,13 +1678,13 @@ if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1692,7 +1716,7 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1721,7 +1745,7 @@ if (cast(VL[j])->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1734,7 +1758,7 @@ if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1746,12 +1770,12 @@ DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1768,12 +1792,12 @@ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1791,7 +1815,7 @@ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1805,7 +1829,7 @@ getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1816,7 +1840,7 @@ Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -1829,14 +1853,14 @@ CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1853,11 +1877,11 @@ // then do not vectorize this instruction. if (!S.IsAltShuffle) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx); + newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. @@ -1881,7 +1905,7 @@ default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1974,13 +1998,21 @@ VecTy = VectorType::get( IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + unsigned ReuseShuffleNumbers = E->ReuseShuffleIndicies.size(); + int ReuseShuffleCost = 0; + if (ReuseShuffleNumbers) { + ReuseShuffleCost = + TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } if (E->NeedToGather) { if (allConstant(VL)) return 0; if (isSplat(VL)) { - return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + return ReuseShuffleCost + + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).Opcode == Instruction::ExtractElement) { + if (getSameOpcode(VL).Opcode == Instruction::ExtractElement && + allSameType(VL) && allSameBlock(VL)) { Optional ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); @@ -1997,10 +2029,10 @@ IO->getZExtValue()); } } - return Cost; + return ReuseShuffleCost + Cost; } } - return getGatherCost(E->Scalars); + return ReuseShuffleCost + getGatherCost(VL); } InstructionsState S = getSameOpcode(VL); assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); @@ -2013,8 +2045,36 @@ case Instruction::ExtractValue: case Instruction::ExtractElement: + if (ReuseShuffleNumbers) { + unsigned Idx = 0; + for (unsigned I : E->ReuseShuffleIndicies) { + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *IO = cast( + cast(VL[I])->getIndexOperand()); + Idx = IO->getZExtValue(); + ReuseShuffleCost -= TTI->getVectorInstrCost( + Instruction::ExtractElement, VecTy, Idx); + } else { + ReuseShuffleCost -= TTI->getVectorInstrCost( + Instruction::ExtractElement, VecTy, Idx); + ++Idx; + } + } + Idx = ReuseShuffleNumbers; + for (Value *V : VL) { + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *IO = cast( + cast(V)->getIndexOperand()); + Idx = IO->getZExtValue(); + } else { + --Idx; + } + ReuseShuffleCost += + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); + } + } if (canReuseExtract(VL, S.OpValue)) { - int DeadCost = 0; + int DeadCost = ReuseShuffleCost; for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *E = cast(VL[i]); // If all users are going to be vectorized, instruction can be @@ -2022,12 +2082,12 @@ // The same, if have only one user, it will be vectorized for sure. if (areAllUsersVectorized(E)) // Take credit for instruction that will become dead. - DeadCost += + DeadCost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); } - return -DeadCost; + return DeadCost; } - return getGatherCost(VecTy); + return ReuseShuffleCost + getGatherCost(VL); case Instruction::ZExt: case Instruction::SExt: @@ -2042,6 +2102,11 @@ case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); + if (ReuseShuffleNumbers) { + ReuseShuffleCost -= + (ReuseShuffleNumbers - VL.size()) * + TTI->getCastInstrCost(S.Opcode, ScalarTy, SrcTy, VL0); + } // Calculate the cost of this instruction. int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), @@ -2049,17 +2114,22 @@ VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0); - return VecCost - ScalarCost; + return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::FCmp: case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. + if (ReuseShuffleNumbers) { + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * + TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, + Builder.getInt1Ty(), VL0); + } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0); int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0); - return VecCost - ScalarCost; + return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Add: case Instruction::FAdd: @@ -2117,13 +2187,19 @@ Op2VP = TargetTransformInfo::OP_PowerOf2; SmallVector Operands(VL0->operand_values()); + if (ReuseShuffleNumbers) { + ReuseShuffleCost -= + (ReuseShuffleNumbers - VL.size()) * + TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, + Op2VP, Operands); + } int ScalarCost = VecTy->getNumElements() * TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); - return VecCost - ScalarCost; + return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { TargetTransformInfo::OperandValueKind Op1VK = @@ -2131,31 +2207,46 @@ TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_UniformConstantValue; + if (ReuseShuffleNumbers) { + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * + TTI->getArithmeticInstrCost(Instruction::Add, + ScalarTy, Op1VK, Op2VK); + } int ScalarCost = VecTy->getNumElements() * TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); int VecCost = TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); - return VecCost - ScalarCost; + return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Load: { // Cost of wide load - cost of scalar loads. unsigned alignment = dyn_cast(VL0)->getAlignment(); + if (ReuseShuffleNumbers) { + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, + alignment, 0, VL0); + } int ScalarLdCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0); - return VecLdCost - ScalarLdCost; + return ReuseShuffleCost + VecLdCost - ScalarLdCost; } case Instruction::Store: { // We know that we can merge the stores. Calculate the cost. unsigned alignment = dyn_cast(VL0)->getAlignment(); + if (ReuseShuffleNumbers) { + ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, + alignment, 0, VL0); + } int ScalarStCost = VecTy->getNumElements() * TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); int VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0); - return VecStCost - ScalarStCost; + return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { CallInst *CI = cast(VL0); @@ -2170,6 +2261,11 @@ if (auto *FPMO = dyn_cast(CI)) FMF = FPMO->getFastMathFlags(); + if (ReuseShuffleNumbers) { + ReuseShuffleCost -= + (ReuseShuffleNumbers - VL.size()) * + TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); + } int ScalarCallCost = VecTy->getNumElements() * TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); @@ -2181,7 +2277,7 @@ << " (" << VecCallCost << "-" << ScalarCallCost << ")" << " for " << *CI << "\n"); - return VecCallCost - ScalarCallCost; + return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { TargetTransformInfo::OperandValueKind Op1VK = @@ -2189,6 +2285,22 @@ TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_AnyValue; int ScalarCost = 0; + if (ReuseShuffleNumbers) { + for (unsigned Idx : E->ReuseShuffleIndicies) { + Instruction *I = cast(VL[Idx]); + if (!I) + continue; + ReuseShuffleCost -= TTI->getArithmeticInstrCost( + I->getOpcode(), ScalarTy, Op1VK, Op2VK); + } + for (Value *V : VL) { + Instruction *I = cast(V); + if (!I) + continue; + ReuseShuffleCost += TTI->getArithmeticInstrCost( + I->getOpcode(), ScalarTy, Op1VK, Op2VK); + } + } int VecCost = 0; for (Value *i : VL) { Instruction *I = cast(i); @@ -2207,7 +2319,7 @@ TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); - return VecCost - ScalarCost; + return ReuseShuffleCost + VecCost - ScalarCost; } default: llvm_unreachable("Unknown instruction"); @@ -2232,7 +2344,8 @@ return true; // Gathering cost would be too much for tiny trees. - if (VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) + if ((VectorizableTree[0].NeedToGather || VectorizableTree[1].NeedToGather) && + VectorizableTree[0].Scalars.size() < 4) return false; return true; @@ -2383,10 +2496,14 @@ return Cost; } -int BoUpSLP::getGatherCost(Type *Ty) { +int BoUpSLP::getGatherCost(Type *Ty, + const DenseSet &ShuffledIndicies) { int Cost = 0; for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) - Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + if (!ShuffledIndicies.count(i)) + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + if (!ShuffledIndicies.empty()) + Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; } @@ -2397,7 +2514,17 @@ ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); // Find the cost of inserting/extracting values from the vector. - return getGatherCost(VecTy); + // Check if the same elements are inserted several times and count them as + // shuffle candidates. + DenseSet ShuffledElements; + DenseSet UniqueElements; + // Iterate in reverse order to consider insert elements with the high cost. + for (unsigned I = VL.size(); I > 0; --I) { + unsigned Idx = I - 1; + if (!UniqueElements.insert(VL[Idx]).second) + ShuffledElements.insert(Idx); + } + return getGatherCost(VecTy, ShuffledElements); } // Reorder commutative operations in alternate shuffle if the resulting vectors @@ -2751,9 +2878,15 @@ ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); + bool NeedToShuffleReuses = !E->ReuseShuffleIndicies.empty(); + if (E->NeedToGather) { setInsertPointAfterBundle(E->Scalars, VL0); auto *V = Gather(E->Scalars, VecTy); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndicies, "shuffle"); + } E->VectorizedValue = V; return V; } @@ -2793,17 +2926,29 @@ assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() && "Invalid number of incoming values"); - return NewPhi; + if (NeedToShuffleReuses) { + E->VectorizedValue = Builder.CreateShuffleVector( + NewPhi, UndefValue::get(VecTy), E->ReuseShuffleIndicies, "shuffle"); + } + return E->VectorizedValue; } case Instruction::ExtractElement: { if (canReuseExtract(E->Scalars, VL0)) { Value *V = VL0->getOperand(0); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndicies, "shuffle"); + } E->VectorizedValue = V; return V; } setInsertPointAfterBundle(E->Scalars, VL0); auto *V = Gather(E->Scalars, VecTy); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndicies, "shuffle"); + } E->VectorizedValue = V; return V; } @@ -2814,11 +2959,20 @@ PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment()); - E->VectorizedValue = V; - return propagateMetadata(V, E->Scalars); + Value *NewV = propagateMetadata(V, E->Scalars); + if (NeedToShuffleReuses) { + NewV = Builder.CreateShuffleVector( + NewV, UndefValue::get(VecTy), E->ReuseShuffleIndicies, "shuffle"); + } + E->VectorizedValue = NewV; + return NewV; } setInsertPointAfterBundle(E->Scalars, VL0); auto *V = Gather(E->Scalars, VecTy); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndicies, "shuffle"); + } E->VectorizedValue = V; return V; } @@ -2847,6 +3001,10 @@ CastInst *CI = dyn_cast(VL0); Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndicies, "shuffle"); + } E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -2874,8 +3032,12 @@ else V = Builder.CreateICmp(P0, L, R); + propagateIRFlags(V, E->Scalars, VL0); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndicies, "shuffle"); + } E->VectorizedValue = V; - propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; return V; } @@ -2897,6 +3059,10 @@ return V; Value *V = Builder.CreateSelect(Cond, True, False); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndicies, "shuffle"); + } E->VectorizedValue = V; ++NumVectorInstructions; return V; @@ -2940,13 +3106,17 @@ Value *V = Builder.CreateBinOp( static_cast(S.Opcode), LHS, RHS); + propagateIRFlags(V, E->Scalars, VL0); + if (auto *I = dyn_cast(V)) + V = propagateMetadata(I, E->Scalars); + + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndicies, "shuffle"); + } E->VectorizedValue = V; - propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; - if (Instruction *I = dyn_cast(V)) - return propagateMetadata(I, E->Scalars); - return V; } case Instruction::Load: { @@ -2974,9 +3144,14 @@ Alignment = DL->getABITypeAlignment(ScalarLoadTy); } LI->setAlignment(Alignment); - E->VectorizedValue = LI; + Value *V = propagateMetadata(LI, E->Scalars); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndicies, "shuffle"); + } + E->VectorizedValue = V; ++NumVectorInstructions; - return propagateMetadata(LI, E->Scalars); + return V; } case Instruction::Store: { StoreInst *SI = cast(VL0); @@ -3004,9 +3179,14 @@ Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); S->setAlignment(Alignment); - E->VectorizedValue = S; + Value *V = propagateMetadata(S, E->Scalars); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndicies, "shuffle"); + } + E->VectorizedValue = V; ++NumVectorInstructions; - return propagateMetadata(S, E->Scalars); + return V; } case Instruction::GetElementPtr: { setInsertPointAfterBundle(E->Scalars, VL0); @@ -3030,12 +3210,16 @@ Value *V = Builder.CreateGEP( cast(VL0)->getSourceElementType(), Op0, OpVecs); + if (Instruction *I = dyn_cast(V)) + V = propagateMetadata(I, E->Scalars); + + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndicies, "shuffle"); + } E->VectorizedValue = V; ++NumVectorInstructions; - if (Instruction *I = dyn_cast(V)) - return propagateMetadata(I, E->Scalars); - return V; } case Instruction::Call: { @@ -3082,8 +3266,12 @@ if (ScalarArg && getTreeEntry(ScalarArg)) ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); + propagateIRFlags(V, E->Scalars, VL0); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndicies, "shuffle"); + } E->VectorizedValue = V; - propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; return V; } @@ -3130,10 +3318,14 @@ propagateIRFlags(V1, OddScalars); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); + if (Instruction *I = dyn_cast(V)) + V = propagateMetadata(I, E->Scalars); + if (NeedToShuffleReuses) { + V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + E->ReuseShuffleIndicies, "shuffle"); + } E->VectorizedValue = V; ++NumVectorInstructions; - if (Instruction *I = dyn_cast(V)) - return propagateMetadata(I, E->Scalars); return V; } Index: test/Transforms/SLPVectorizer/X86/PR32086.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR32086.ll +++ test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -4,15 +4,14 @@ define void @i64_simplified(i64* noalias %st, i64* noalias %ld) { ; CHECK-LABEL: @i64_simplified( ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 -; CHECK-NEXT: [[T0:%.*]] = load i64, i64* [[LD]], align 8 -; CHECK-NEXT: [[T1:%.*]] = load i64, i64* [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 -; CHECK-NEXT: store i64 [[T0]], i64* [[ST]], align 8 -; CHECK-NEXT: store i64 [[T1]], i64* [[ARRAYIDX3]], align 8 -; CHECK-NEXT: store i64 [[T0]], i64* [[ARRAYIDX4]], align 8 -; CHECK-NEXT: store i64 [[T1]], i64* [[ARRAYIDX5]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* +; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void ; %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1 Index: test/Transforms/SLPVectorizer/X86/blending-shuffle.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/blending-shuffle.ll +++ test/Transforms/SLPVectorizer/X86/blending-shuffle.ll @@ -137,17 +137,19 @@ define i8 @k_bb(<4 x i8> %x) { ; CHECK-LABEL: @k_bb( +; CHECK-NEXT: [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0 ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X:%.*]], [[X]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i8> [[X]], [[X]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> undef, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i8> [[TMP2]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i8> [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i8> [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = sdiv i8 [[TMP6]], [[TMP7]] -; CHECK-NEXT: ret i8 [[TMP8]] +; CHECK-NEXT: [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3 +; CHECK-NEXT: [[X0X0:%.*]] = mul i8 [[X0]], [[X0]] +; CHECK-NEXT: [[X3X3:%.*]] = mul i8 [[X3]], [[X3]] +; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i8> [[X]], [[X]] +; CHECK-NEXT: [[TMP2:%.*]] = add i8 [[X0X0]], [[X3X3]] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = add i8 [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = sdiv i8 [[TMP2]], [[TMP5]] +; CHECK-NEXT: ret i8 [[TMP6]] ; %x0 = extractelement <4 x i8> %x, i32 0 br label %bb1