Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -172,40 +172,6 @@ !Ty->isPPC_FP128Ty(); } -/// \returns true if all of the instructions in \p VL are in the same block or -/// false otherwise. -static bool allSameBlock(ArrayRef VL) { - Instruction *I0 = dyn_cast(VL[0]); - if (!I0) - return false; - BasicBlock *BB = I0->getParent(); - for (int i = 1, e = VL.size(); i < e; i++) { - Instruction *I = dyn_cast(VL[i]); - if (!I) - return false; - - if (BB != I->getParent()) - return false; - } - return true; -} - -/// \returns True if all of the values in \p VL are constants. -static bool allConstant(ArrayRef VL) { - for (Value *i : VL) - if (!isa(i)) - return false; - return true; -} - -/// \returns True if all of the values in \p VL are identical. -static bool isSplat(ArrayRef VL) { - for (unsigned i = 1, e = VL.size(); i < e; ++i) - if (VL[i] != VL[0]) - return false; - return true; -} - /// Checks if the vector of instructions can be represented as a shuffle, like: /// %x0 = extractelement <4 x i8> %x, i32 0 /// %x3 = extractelement <4 x i8> %x, i32 3 @@ -318,51 +284,156 @@ return AltOp ? AltOp->getOpcode() : 0; } - /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return getOpcode() != getAltOpcode(); } - - bool isOpcodeOrAlt(Instruction *I) const { - unsigned CheckedOpcode = I->getOpcode(); - return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; - } - InstructionsState() = delete; InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp) : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {} }; -} // end anonymous namespace +class InstructionOrPseudo { +private: + /// Parent of the operation. + Value *Parent; + /// The operation instance. + Value *Op; + /// Instruction represantation of the operation. + Instruction *OpInst; + /// Pseudo operation after replacement. + Instruction *PseudoOp = nullptr; -/// Chooses the correct key for scheduling data. If \p Op has the same (or -/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p -/// OpValue. -static Value *isOneOf(const InstructionsState &S, Value *Op) { - auto *I = dyn_cast(Op); - if (I && S.isOpcodeOrAlt(I)) +public: + /// Main operation in a group of operations. + Instruction *MainOp = nullptr; + /// Alternative operation in a group of operations. + Instruction *AltOp = nullptr; + /// Main opcode of a group of operations. + unsigned MainOpcode = 0; + /// Alternative opcode of a group of operations. + unsigned AltOpcode = 0; + + InstructionOrPseudo(Value *Parent, Value *Op) : Parent(Parent), Op(Op) { + OpInst = dyn_cast(Op); + } + + /// Set properties of this operation after analysis of a group of operations. + void setInstructionsState(const InstructionsState &S) { + MainOp = S.MainOp; + AltOp = S.AltOp; + if (MainOp) + MainOpcode = MainOp->getOpcode(); + if (AltOp) + AltOpcode = AltOp->getOpcode(); + } + + /// Replace pseudo opration with real one. + void replaceWith(Instruction *NewOp) { + PseudoOp = OpInst; + Op = NewOp; + OpInst = NewOp; + } + + bool isPseudoEqual(Value *V) { + if (!PseudoOp) + return false; + if (PseudoOp == V) + return true; + return false; + } + + bool isPseudo() { + if (PseudoOp) + return true; + return false; + } + + bool isEqual(InstructionOrPseudo *IOP) { + return ((Op == IOP->getOp()) || isPseudoEqual(IOP->getOp()) || + IOP->isPseudoEqual(Op)); + } + + /// Returns parent of this operation + Value* getParent() { + return Parent; + } + + /// Returns a Value represntation of this operation. + Value* getOp() { return Op; - return S.OpValue; + } + + /// Returns an Instruction represntation of this operation. + Instruction* getOpInst() { + return OpInst; + } + + bool isAltShuffle() const { + return (MainOpcode != 0 && AltOpcode != 0 && MainOpcode != AltOpcode); + } + + bool isOpcodeOrAlt() const { + unsigned CheckedOpcode = OpInst->getOpcode(); + return MainOpcode == CheckedOpcode || AltOpcode == CheckedOpcode; + } + + std::pair getKey() const { + assert(Parent && "Incorrect parent!"); + return std::make_pair(Parent, MainOpcode); + } +}; +} // end anonymous namespace + +/// \returns True if all of the values in \p IL are constants. +static bool allConstant(ArrayRef IL) { + return llvm::all_of( + make_range(IL.begin(), IL.end()), + [](InstructionOrPseudo *I) -> bool { return isa(I->getOp()); }); } -/// \returns analysis of the Instructions in \p VL described in +/// \returns True if all of the values in \p IL are identical. +static bool isSplat(ArrayRef IL) { + return llvm::all_of(make_range(IL.begin(), IL.end()), + [&](InstructionOrPseudo *I) -> bool { + return I->getOp() == IL[0]->getOp(); + }); +} + +/// \returns true if all of the instructions in \p IL are in the same block or +/// false otherwise. +static bool allSameBlock(ArrayRef IL) { + if (!IL[0]->getOpInst()) + return false; + BasicBlock *BB = IL[0]->getOpInst()->getParent(); + for (int i = 1, e = IL.size(); i < e; i++) { + if (!IL[i]->getOpInst()) + return false; + + if (BB != IL[i]->getOpInst()->getParent()) + return false; + } + return true; +} + +/// \returns analysis of the Instructions in \p IL described in /// InstructionsState, the Opcode that we suppose the whole list /// could be vectorized even if its structure is diverse. -static InstructionsState getSameOpcode(ArrayRef VL, +static InstructionsState getSameOpcode(ArrayRef IL, unsigned BaseIndex = 0) { // Make sure these are all Instructions. - if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + if (llvm::any_of(IL, [](InstructionOrPseudo *IOP) { return !IOP->getOpInst(); })) + return InstructionsState(IL[BaseIndex]->getOp(), nullptr, nullptr); - bool IsCastOp = isa(VL[BaseIndex]); - bool IsBinOp = isa(VL[BaseIndex]); - unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); + Instruction* BaseOp = IL[BaseIndex]->getOpInst(); + bool IsCastOp = isa(BaseOp); + bool IsBinOp = isa(BaseOp); + unsigned Opcode = BaseOp->getOpcode(); unsigned AltOpcode = Opcode; unsigned AltIndex = BaseIndex; // Check for one alternate opcode from another BinaryOperator. // TODO - generalize to support all operators (types, calls etc.). - for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { - unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); - if (IsBinOp && isa(VL[Cnt])) { + for (int Cnt = 0, E = IL.size(); Cnt < E; Cnt++) { + auto *I = IL[Cnt]->getOpInst(); + unsigned InstOpcode = I->getOpcode(); + if (IsBinOp && isa(I)) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; if (Opcode == AltOpcode) { @@ -370,9 +441,9 @@ AltIndex = Cnt; continue; } - } else if (IsCastOp && isa(VL[Cnt])) { - Type *Ty0 = cast(VL[BaseIndex])->getOperand(0)->getType(); - Type *Ty1 = cast(VL[Cnt])->getOperand(0)->getType(); + } else if (IsCastOp && isa(I)) { + Type *Ty0 = BaseOp->getOperand(0)->getType(); + Type *Ty1 = I->getOperand(0)->getType(); if (Ty0 == Ty1) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; @@ -384,22 +455,19 @@ } } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) continue; - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState(BaseOp, nullptr, nullptr); } - return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), - cast(VL[AltIndex])); + return InstructionsState(BaseOp, BaseOp, IL[AltIndex]->getOpInst()); } /// \returns true if all of the values in \p VL have the same type or false /// otherwise. static bool allSameType(ArrayRef VL) { Type *Ty = VL[0]->getType(); - for (int i = 1, e = VL.size(); i < e; i++) - if (VL[i]->getType() != Ty) - return false; - - return true; + return llvm::all_of( + make_range(VL.begin(), VL.end()), + [&](Value *V) -> bool { return V->getType() == Ty; }); } /// \returns True if Extract{Value,Element} instruction extracts element Idx. @@ -478,6 +546,7 @@ using InstrList = SmallVector; using ValueSet = SmallPtrSet; using StoreList = SmallVector; + using IOPList = SmallVector; using ExtraValueToDebugLocsMap = MapVector>; @@ -537,6 +606,7 @@ /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { + InstructionOrPseudoArray.clear(); VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); @@ -612,21 +682,21 @@ int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. - void buildTree_rec(ArrayRef Roots, unsigned Depth, int); + void buildTree_rec(Value *Parent, ArrayRef Roots, unsigned Depth, int); - /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can + /// \returns true if the ExtractElement/ExtractValue instructions in \p IL can /// be vectorized to use the original vector (or aggregate "bitcast" to a /// vector) and sets \p CurrentOrder to the identity permutation; otherwise /// returns false, setting \p CurrentOrder to either an empty vector or a /// non-identity permutation that allows to reuse extract instructions. - bool canReuseExtract(ArrayRef VL, Value *OpValue, + bool canReuseExtract(ArrayRef IL, SmallVectorImpl &CurrentOrder) const; /// Vectorize a single entry in the tree. Value *vectorizeTree(TreeEntry *E); /// Vectorize a single entry in the tree, starting in \p VL. - Value *vectorizeTree(ArrayRef VL); + Value *vectorizeTree(Value *Parent, ArrayRef VL); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. @@ -635,47 +705,55 @@ /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. - int getGatherCost(ArrayRef VL); + int getGatherCost(ArrayRef IL); /// Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(ArrayRef VL, - const InstructionsState &S); + void setInsertPointAfterBundle(ArrayRef IL); - /// \returns a vector from a collection of scalars in \p VL. - Value *Gather(ArrayRef VL, VectorType *Ty); + /// \returns a vector from a collection of scalars in \p IL. + Value *Gather(ArrayRef IL, VectorType *Ty); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); + Instruction *propagateMetadata(Instruction *I, + ArrayRef IL); + + void propagateIRFlags(Value *I, + ArrayRef IL, + Value *OpValue); + /// \reorder commutative operands in alt shuffle if they result in /// vectorized code. - void reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef VL, + void reorderAltShuffleOperands(ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right); /// \reorder commutative operands to get better probability of /// generating vectorized code. - void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef VL, + void reorderInputsAccordingToOpcode(ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right); struct TreeEntry { TreeEntry(std::vector &Container) : Container(Container) {} - /// \returns true if the scalars in VL are equal to this entry. - bool isSame(ArrayRef VL) const { - if (VL.size() == Scalars.size()) - return std::equal(VL.begin(), VL.end(), Scalars.begin()); - return VL.size() == ReuseShuffleIndices.size() && + /// \returns true if the scalars in IL are equal to this entry. + bool isSame(ArrayRef IL) const { + if (IL.size() == Scalars.size()) + return std::equal(IL.begin(), IL.end(), Scalars.begin(), + [this](InstructionOrPseudo *I1, InstructionOrPseudo *I2) + { return I1->isEqual(I2); }); + return IL.size() == ReuseShuffleIndices.size() && std::equal( - VL.begin(), VL.end(), ReuseShuffleIndices.begin(), - [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; }); + IL.begin(), IL.end(), ReuseShuffleIndices.begin(), + [this](InstructionOrPseudo *I1, unsigned Idx) + { return I1->isEqual(Scalars[Idx]); }); } /// A vector of scalars. - ValueList Scalars; + IOPList Scalars; /// The Scalars are vectorized into this value. It is initialized to Null. Value *VectorizedValue = nullptr; @@ -703,24 +781,31 @@ }; /// Create a new VectorizableTree entry. - void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, + void newTreeEntry(ArrayRef IL, bool Vectorized, + int &UserTreeIdx, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; - Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); + Last->Scalars.insert(Last->Scalars.begin(), IL.begin(), IL.end()); Last->NeedToGather = !Vectorized; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; if (Vectorized) { - for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = idx; + for (InstructionOrPseudo *IOP : IL) { + auto *I = IOP->getOpInst(); + assert( + (ScalarToTreeEntry.find(I) == ScalarToTreeEntry.end() || + ScalarToTreeEntry[I].find(IOP->getKey()) == + ScalarToTreeEntry[I].end()) && + "Scalar already in tree!"); + ScalarToTreeEntry[I][IOP->getKey()] = idx; } } else { - MustGather.insert(VL.begin(), VL.end()); + for (InstructionOrPseudo *IOP : IL) + MustGather.insert(IOP->getOp()); } if (UserTreeIdx >= 0) @@ -733,14 +818,33 @@ std::vector VectorizableTree; TreeEntry *getTreeEntry(Value *V) { - auto I = ScalarToTreeEntry.find(V); - if (I != ScalarToTreeEntry.end()) - return &VectorizableTree[I->second]; + auto *I = dyn_cast(V); + if (!I) + return nullptr; + auto It = ScalarToTreeEntry.find(I); + if (It != ScalarToTreeEntry.end()) { + auto &STT = It->second; + for (auto STTI : STT) + for (auto *IOP : VectorizableTree[STTI.second].Scalars) + if (IOP->getOpInst() == I) + return &VectorizableTree[STTI.second]; + } return nullptr; } + std::vector> InstructionOrPseudoArray; + + /// Allocates InstructionOrPseudo. + InstructionOrPseudo *allocInstructionOrPseudo(Value* Parent, Value *V) { + InstructionOrPseudoArray.emplace_back( + llvm::make_unique(Parent, V)); + int idx = InstructionOrPseudoArray.size() - 1; + return (InstructionOrPseudoArray[idx]).get(); + } + /// Maps a specific scalar to its tree entry. - SmallDenseMap ScalarToTreeEntry; + SmallDenseMap, int>> + ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -832,15 +936,34 @@ ScheduleData() = default; - void init(int BlockSchedulingRegionID, Value *OpVal) { + void init(int BlockSchedulingRegionID) { FirstInBundle = this; NextInBundle = nullptr; NextLoadStore = nullptr; IsScheduled = false; + Parent = nullptr; + Opcode = 0; + IsPseudo = false; + SchedulingRegionID = BlockSchedulingRegionID; + UnscheduledDepsInBundle = UnscheduledDeps; + clearDependencies(); + } + + // Initialize ScheduleData for pseudo instruction + // with ScheduleData for an original instruction OpOrigin. + void init(int BlockSchedulingRegionID, ScheduleData *OpOrigin, + InstructionOrPseudo *IOP) { + FirstInBundle = this; + NextInBundle = nullptr; + IsScheduled = false; + Parent = IOP->getParent(); + Opcode = IOP->MainOpcode; + IsPseudo = true; + Inst = OpOrigin->Inst; + NextLoadStore = OpOrigin->NextLoadStore; SchedulingRegionID = BlockSchedulingRegionID; UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); - OpValue = OpVal; } /// Returns true if the dependency information has been calculated. @@ -945,8 +1068,14 @@ /// dry-run). bool IsScheduled = false; - /// Opcode of the current instruction in the schedule data. - Value *OpValue = nullptr; + /// Opcode that represents instructions to be vectorized. + unsigned Opcode = 0; + + /// Parent of this operation. + Value *Parent = nullptr; + + /// True if this is a pseudo operation. + bool IsPseudo = false; }; #ifndef NDEBUG @@ -984,22 +1113,17 @@ ++SchedulingRegionID; } - ScheduleData *getScheduleData(Value *V) { - ScheduleData *SD = ScheduleDataMap[V]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) + ScheduleData *getScheduleData(InstructionOrPseudo *IOP) { + std::pair Key = IOP->getKey(); + auto *I = IOP->getOpInst(); + ScheduleData *SD = ScheduleDataMap[I][std::make_pair(nullptr, 0)]; + if (SD && SD->Parent == Key.first && SD->Opcode == Key.second && + SD->SchedulingRegionID == SchedulingRegionID) + return SD; + SD = ScheduleDataMap[I][Key]; + if (SD && SD->Parent == Key.first && SD->Opcode == Key.second && + SD->SchedulingRegionID == SchedulingRegionID) return SD; - return nullptr; - } - - ScheduleData *getScheduleData(Value *V, Value *Key) { - if (V == Key) - return getScheduleData(V); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) { - ScheduleData *SD = I->second[Key]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) - return SD; - } return nullptr; } @@ -1015,18 +1139,18 @@ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - if (BundleMember->Inst != BundleMember->OpValue) { - BundleMember = BundleMember->NextInBundle; - continue; - } + assert(BundleMember->Opcode == Opcode && + BundleMember->Parent == Parent && "Corrupt bundle member"); // Handle the def-use chain dependencies. for (Use &U : BundleMember->Inst->operands()) { auto *I = dyn_cast(U.get()); if (!I) continue; doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { - if (OpDef && OpDef->hasValidDependencies() && + if (OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after // decrementing, so we can put the dependent instruction @@ -1057,15 +1181,15 @@ } } - void doForAllOpcodes(Value *V, + void doForAllOpcodes(Instruction *I, function_ref Action) { - if (ScheduleData *SD = getScheduleData(V)) - Action(SD); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) - for (auto &P : I->second) - if (P.second->SchedulingRegionID == SchedulingRegionID) - Action(P.second); + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + return; + for (auto It : ScheduleDataMap[I]) { + auto *SD = It.second; + if (SD && isInSchedulingRegion(SD)) + Action(SD); + } } /// Put all instructions into the ReadyList which are ready for scheduling. @@ -1085,18 +1209,17 @@ /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, - const InstructionsState &S); + bool tryScheduleBundle(ArrayRef IL, BoUpSLP *SLP); /// Un-bundles a group of instructions. - void cancelScheduling(ArrayRef VL, Value *OpValue); + void cancelScheduling(InstructionOrPseudo *IOP); /// Allocates schedule data chunk. ScheduleData *allocateScheduleDataChunks(); /// Extends the scheduling region so that V is inside the region. /// \returns true if the region size is within the limit. - bool extendSchedulingRegion(Value *V, const InstructionsState &S); + bool extendSchedulingRegion(InstructionOrPseudo *IOP); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. @@ -1126,12 +1249,11 @@ /// Attaches ScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. - /// ScheduleData structures are recycled. - DenseMap ScheduleDataMap; - - /// Attaches ScheduleData to Instruction with the leading key. - DenseMap> - ExtraScheduleDataMap; + /// ScheduleData structures are recycled. All real instructions are placed + /// in [Instruction *][nullptr][0]. + DenseMap, ScheduleData *>> + ScheduleDataMap; struct ReadyList : SmallVector { void insert(ScheduleData *SD) { push_back(SD); } @@ -1170,6 +1292,9 @@ /// Attaches the BlockScheduling structures to basic blocks. MapVector> BlocksSchedules; + /// Remove or subsititude any pseudo instructions in the tree. + void eliminatePseudos(BlockScheduling *BS); + /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. void scheduleBlock(BlockScheduling *BS); @@ -1293,14 +1418,16 @@ std::string Str; raw_string_ostream OS(Str); if (isSplat(Entry->Scalars)) { - OS << " " << *Entry->Scalars[0]; + OS << " " << *Entry->Scalars[0]->getOp(); return Str; } - for (auto V : Entry->Scalars) { + for (auto *IOP : Entry->Scalars) { + Value *V = IOP->getOp(); OS << *V; - if (std::any_of( - R->ExternalUses.begin(), R->ExternalUses.end(), - [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; })) + if (std::any_of(R->ExternalUses.begin(), R->ExternalUses.end(), + [&](const BoUpSLP::ExternalUser &EU) { + return EU.Scalar == V; + })) OS << " "; OS << "\n"; } @@ -1330,7 +1457,7 @@ UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; - buildTree_rec(Roots, 0, -1); + buildTree_rec(Roots[0], Roots, 0, -1); // Collect the values that we need to extract from the tree. for (TreeEntry &EIdx : VectorizableTree) { @@ -1342,7 +1469,7 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { - Value *Scalar = Entry->Scalars[Lane]; + auto *Scalar = Entry->Scalars[Lane]; int FoundLane = Lane; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = @@ -1351,13 +1478,14 @@ } // Check if the scalar is externally used as an extra arg. - auto ExtI = ExternallyUsedValues.find(Scalar); + Value *V = Scalar->getOp(); + auto ExtI = ExternallyUsedValues.find(V); if (ExtI != ExternallyUsedValues.end()) { LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " - << Lane << " from " << *Scalar << ".\n"); - ExternalUses.emplace_back(Scalar, nullptr, FoundLane); + << Lane << " from " << *V << ".\n"); + ExternalUses.emplace_back(V, nullptr, FoundLane); } - for (User *U : Scalar->users()) { + for (User *U : V->users()) { LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); Instruction *UserInst = dyn_cast(U); @@ -1365,13 +1493,13 @@ continue; // Skip in-tree scalars that become vectors - if (TreeEntry *UseEntry = getTreeEntry(U)) { - Value *UseScalar = UseEntry->Scalars[0]; + if (TreeEntry *UseEntry = getTreeEntry(UserInst)) { + Value *UseScalar = UseEntry->Scalars[0]->getOp(); // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will // be used. if (UseScalar != U || - !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { + !InTreeUserNeedToExtract(V, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); assert(!UseEntry->NeedToGather && "Bad state"); @@ -1384,42 +1512,48 @@ continue; LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " - << Lane << " from " << *Scalar << ".\n"); - ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane)); + << Lane << " from " << *V << ".\n"); + ExternalUses.push_back(ExternalUser(V, U, FoundLane)); } } } } -void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, +void BoUpSLP::buildTree_rec(Value *Parent, ArrayRef VL, unsigned Depth, int UserTreeIdx) { - assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); + SmallVector IL; + for (Value *V: VL) + IL.push_back(allocInstructionOrPseudo(Parent, V)); + InstructionsState S = getSameOpcode(IL); + for (InstructionOrPseudo *IOP : IL) + IOP->setInstructionsState(S); + + assert((allConstant(IL) || allSameType(VL)) && "Invalid types!"); - InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { + if (allConstant(IL) || isSplat(IL) || !allSameBlock(IL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } @@ -1427,40 +1561,39 @@ // the same block. // Don't vectorize ephemeral values. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (EphValues.count(VL[i])) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (auto *IOP : IL) { + if (EphValues.count(IOP->getOp())) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *IOP->getOp() << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } } // Check if this is a duplicate of another entry. - if (TreeEntry *E = getTreeEntry(S.OpValue)) { - LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); - if (!E->isSame(VL)) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); - return; - } + TreeEntry *E = getTreeEntry(IL[0]->MainOp); + if (E && E->isSame(IL)) { + LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " + << *IL[0]->MainOp << ".\n"); // Record the reuse of the tree node. FIXME, currently this is only used to // properly draw the graph rather than for the actual vectorization. E->UserTreeIndices.push_back(UserTreeIdx); - LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue + LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *IL[0]->MainOp << ".\n"); return; } // Check that none of the instructions in the bundle are already in the tree. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - auto *I = dyn_cast(VL[i]); - if (!I) - continue; - if (getTreeEntry(I)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (auto *IOP : IL) { + auto *I = IOP->getOpInst(); + // Avoid any non-binary duplicate operation. + if ((getTreeEntry(I) && (!isa(I) || IL[0]->MainOp == I)) || + (ScalarToTreeEntry.find(I) != ScalarToTreeEntry.end() && + ScalarToTreeEntry[I].find(IOP->getKey()) != + ScalarToTreeEntry[I].end())) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *I << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } } @@ -1468,47 +1601,52 @@ // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { + for (auto *IOP : IL) { + if (MustGather.count(IOP->getOp()) || + is_contained(UserIgnoreList, IOP->getOp())) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } } // Check that all of the users of the scalars that we want to vectorize are // schedulable. - auto *VL0 = cast(S.OpValue); + auto *VL0 = IL[0]->MainOp; BasicBlock *BB = VL0->getParent(); if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } // Check that every instruction appears once in this bundle. SmallVector ReuseShuffleIndicies; - SmallVector UniqueValues; + SmallVector UniqueValues; DenseMap UniquePositions; - for (Value *V : VL) { + for (auto *I : IL) { + Value *V = I->getOp(); auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second) - UniqueValues.emplace_back(V); + if (Res.second) { + InstructionOrPseudo *IOP = allocInstructionOrPseudo(Parent, V); + IOP->setInstructionsState(S); + UniqueValues.emplace_back(IOP); + } } - if (UniqueValues.size() == VL.size()) { + if (UniqueValues.size() == IL.size()) { ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } - VL = UniqueValues; + IL = UniqueValues; } auto &BSRef = BlocksSchedules[BB]; @@ -1517,59 +1655,59 @@ BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, S)) { + if (!BS.tryScheduleBundle(IL, this)) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - assert((!BS.getScheduleData(VL0) || - !BS.getScheduleData(VL0)->isPartOfBundle()) && + assert((!BS.getScheduleData(IL[0]) || + !BS.getScheduleData(IL[0])->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = IL[0]->isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : IL[0]->MainOpcode; switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); // Check for terminator values (e.g. invoke). - for (unsigned j = 0; j < VL.size(); ++j) + for (unsigned j = 0; j < IL.size(); ++j) for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { Instruction *Term = dyn_cast( - cast(VL[j])->getIncomingValueForBlock( + cast(IL[j]->getOp())->getIncomingValueForBlock( PH->getIncomingBlock(i))); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getIncomingValueForBlock( - PH->getIncomingBlock(i))); - - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + for (auto *j : IL) + Operands.push_back(cast( + j->getOp())->getIncomingValueForBlock( + PH->getIncomingBlock(i))); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::ExtractValue: case Instruction::ExtractElement: { OrdersType CurrentOrder; - bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); + bool Reuse = canReuseExtract(IL, CurrentOrder); if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -1586,13 +1724,13 @@ auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); - BS.cancelScheduling(VL, VL0); + newTreeEntry(IL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); return; } case Instruction::Load: { @@ -1606,21 +1744,21 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } // Make sure all loads in the bundle are simple - we can't vectorize // atomic or volatile loads. - SmallVector PointerOps(VL.size()); + SmallVector PointerOps(IL.size()); auto POIter = PointerOps.begin(); - for (Value *V : VL) { - auto *L = cast(V); + for (auto *I : IL) { + auto *L = cast(I->getOpInst()); if (!L->isSimple()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1646,18 +1784,18 @@ dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { + if (Diff && Diff->getAPInt().getZExtValue() == (IL.size() - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, I->getFirst()); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } @@ -1666,8 +1804,8 @@ } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -1683,26 +1821,26 @@ case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (unsigned i = 0; i < VL.size(); ++i) { - Type *Ty = cast(VL[i])->getOperand(0)->getType(); + for (auto *IOP : IL) { + Type *Ty = IOP->getOpInst()->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *I : IL) + Operands.push_back(I->getOpInst()->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1711,28 +1849,28 @@ // Check that all of the compares have the same predicate. CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Type *ComparedTy = VL0->getOperand(0)->getType(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast(VL[i]); + for (auto *IOP : IL) { + CmpInst *Cmp = cast(IOP->getOpInst()); if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *I : IL) + Operands.push_back(I->getOpInst()->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1755,36 +1893,36 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderInputsAccordingToOpcode(IL, Left, Right); + buildTree_rec(VL0, Left, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *Inst: IL) + Operands.push_back(Inst->getOpInst()->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. - for (unsigned j = 0; j < VL.size(); ++j) { - if (cast(VL[j])->getNumOperands() != 2) { + for (auto *IOP : IL) { + if (IOP->getOpInst()->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1792,59 +1930,60 @@ // We can't combine several GEPs into one vector if they operate on // different types. Type *Ty0 = VL0->getOperand(0)->getType(); - for (unsigned j = 0; j < VL.size(); ++j) { - Type *CurTy = cast(VL[j])->getOperand(0)->getType(); + for (auto *IOP : IL) { + Type *CurTy = IOP->getOpInst()->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } // We don't combine GEPs with non-constant indexes. - for (unsigned j = 0; j < VL.size(); ++j) { - auto Op = cast(VL[j])->getOperand(1); + for (auto *IOP : IL) { + auto Op = IOP->getOpInst()->getOperand(1); if (!isa(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *j : IL) + Operands.push_back(j->getOpInst()->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::Store: { // Check if the stores are consecutive or of we need to swizzle them. - for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + for (unsigned i = 0, e = IL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(IL[i]->getOp(), IL[i + 1]->getOp(), + *DL, *SE)) { + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(0)); + for (auto *j : IL) + Operands.push_back(j->getOpInst()->getOperand(0)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); return; } case Instruction::Call: { @@ -1854,8 +1993,8 @@ // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1863,15 +2002,15 @@ Value *A1I = nullptr; if (hasVectorInstrinsicScalarOpd(ID, 1)) A1I = CI->getArgOperand(1); - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - CallInst *CI2 = dyn_cast(VL[i]); + for (unsigned i = 1, e = IL.size(); i != e; ++i) { + CallInst *CI2 = dyn_cast(IL[i]->getOpInst()); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] - << "\n"); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << + *IL[i]->getOp() << "\n"); return; } // ctlz,cttz and powi are special intrinsics whose second argument @@ -1879,8 +2018,8 @@ if (hasVectorInstrinsicScalarOpd(ID, 1)) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << A1I << "!=" << A1J << "\n"); return; @@ -1891,60 +2030,60 @@ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" - << *CI << "!=" << *VL[i] << '\n'); + << *CI << "!=" << *IL[i]->getOp() << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) { - CallInst *CI2 = dyn_cast(j); + for (auto *VecOp : IL) { + CallInst *CI2 = dyn_cast(VecOp->getOpInst()); Operands.push_back(CI2->getArgOperand(i)); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::ShuffleVector: // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. - if (!S.isAltShuffle()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + if (!IL[0]->isAltShuffle()) { + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; - reorderAltShuffleOperands(S, VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderAltShuffleOperands(IL, Left, Right); + buildTree_rec(VL0, Left, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *VecOp : IL) + Operands.push_back(VecOp->getOpInst()->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; default: - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1975,12 +2114,12 @@ return N; } -bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, +bool BoUpSLP::canReuseExtract(ArrayRef IL, SmallVectorImpl &CurrentOrder) const { - Instruction *E0 = cast(OpValue); + Instruction *E0 = IL[0]->MainOp; assert(E0->getOpcode() == Instruction::ExtractElement || E0->getOpcode() == Instruction::ExtractValue); - assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); + assert(E0->getOpcode() == getSameOpcode(IL).getOpcode() && "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); @@ -1996,18 +2135,18 @@ return false; // Check if load can be rewritten as load of vector. LoadInst *LI = dyn_cast(Vec); - if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) + if (!LI || !LI->isSimple() || !LI->hasNUses(IL.size())) return false; } else { NElts = Vec->getType()->getVectorNumElements(); } - if (NElts != VL.size()) + if (NElts != IL.size()) return false; // Check that all of the indices extract from the correct offset. bool ShouldKeepOrder = true; - unsigned E = VL.size(); + unsigned E = IL.size(); // Assign to all items the initial value E + 1 so we can check if the extract // instruction index was used already. // Also, later we can check that all the indices are used and we have a @@ -2016,7 +2155,7 @@ CurrentOrder.assign(E, E + 1); unsigned I = 0; for (; I < E; ++I) { - auto *Inst = cast(VL[I]); + auto *Inst = IL[I]->getOpInst(); if (Inst->getOperand(0) != Vec) break; Optional Idx = getExtractIndex(Inst); @@ -2027,6 +2166,7 @@ if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1) break; ShouldKeepOrder = false; + ShouldKeepOrder = false; CurrentOrder[ExtIdx] = I; } else { if (CurrentOrder[I] != E + 1) @@ -2045,25 +2185,26 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { return I->hasOneUse() || std::all_of(I->user_begin(), I->user_end(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; + return ScalarToTreeEntry.count(dyn_cast(U)) > 0; }); } int BoUpSLP::getEntryCost(TreeEntry *E) { - ArrayRef VL = E->Scalars; + ArrayRef IL = E->Scalars; + Value *V = E->Scalars[0]->getOp(); - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) + Type *ScalarTy = V->getType(); + if (StoreInst *SI = dyn_cast(V)) ScalarTy = SI->getValueOperand()->getType(); - else if (CmpInst *CI = dyn_cast(VL[0])) + else if (CmpInst *CI = dyn_cast(V)) ScalarTy = CI->getOperand(0)->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + VectorType *VecTy = VectorType::get(ScalarTy, IL.size()); // If we have computed a smaller type for the expression, update VecTy so // that the costs will be accurate. - if (MinBWs.count(VL[0])) + if (MinBWs.count(V)) VecTy = VectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + IntegerType::get(F->getContext(), MinBWs[V].first), IL.size()); unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -2073,26 +2214,29 @@ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } if (E->NeedToGather) { - if (allConstant(VL)) + if (allConstant(IL)) return 0; - if (isSplat(VL)) { + if (isSplat(IL)) { return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && - allSameType(VL) && allSameBlock(VL)) { + if (IL[0]->MainOpcode == Instruction::ExtractElement && allSameBlock(IL)) { + ValueList VL; + for (auto *IOP : IL) + VL.push_back(IOP->getOp()); Optional ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); - for (auto *V : VL) { + for (auto *IOP : IL) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. - if (areAllUsersVectorized(cast(V)) && - !ScalarToTreeEntry.count(V)) { + auto *I = IOP->getOpInst(); + if (areAllUsersVectorized(I) && + !ScalarToTreeEntry.count(I)) { auto *IO = cast( - cast(V)->getIndexOperand()); + cast(I)->getIndexOperand()); Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, IO->getZExtValue()); } @@ -2100,13 +2244,12 @@ return ReuseShuffleCost + Cost; } } - return ReuseShuffleCost + getGatherCost(VL); + return ReuseShuffleCost + getGatherCost(IL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + auto *VL0 = E->Scalars[0]->MainOp; + assert(IL[0]->MainOpcode != 0 && allSameBlock(IL) && "Invalid IL"); + unsigned ShuffleOrOp = IL[0]->isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : IL[0]->MainOpcode; switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2116,9 +2259,10 @@ if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { + auto *Inst = IL[I]->getOpInst(); if (ShuffleOrOp == Instruction::ExtractElement) { auto *IO = cast( - cast(VL[I])->getIndexOperand()); + cast(Inst)->getIndexOperand()); Idx = IO->getZExtValue(); ReuseShuffleCost -= TTI->getVectorInstrCost( Instruction::ExtractElement, VecTy, Idx); @@ -2129,10 +2273,10 @@ } } Idx = ReuseShuffleNumbers; - for (Value *V : VL) { + for (auto *IOP : IL) { if (ShuffleOrOp == Instruction::ExtractElement) { auto *IO = cast( - cast(V)->getIndexOperand()); + cast(IOP->getOpInst())->getIndexOperand()); Idx = IO->getZExtValue(); } else { --Idx; @@ -2148,11 +2292,11 @@ DeadCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *E = cast(VL[i]); + for (unsigned i = 0, e = IL.size(); i < e; ++i) { // If all users are going to be vectorized, instruction can be // considered as dead. // The same, if have only one user, it will be vectorized for sure. + auto *E = IL[i]->getOpInst(); if (areAllUsersVectorized(E)) { // Take credit for instruction that will become dead. if (E->hasOneUse()) { @@ -2166,7 +2310,8 @@ Ext->getOpcode(), Ext->getType(), VecTy, i); // Add back the cost of s|zext which is subtracted separately. DeadCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), E->getType(), Ext); + Ext->getOpcode(), Ext->getType(), + E->getType(), Ext); continue; } } @@ -2176,7 +2321,7 @@ } return DeadCost; } - return ReuseShuffleCost + getGatherCost(VL); + return ReuseShuffleCost + getGatherCost(IL); case Instruction::ZExt: case Instruction::SExt: @@ -2192,20 +2337,21 @@ case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(IL[0]->MainOpcode, ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } // Calculate the cost of this instruction. - int ScalarCost = VL.size() * ScalarEltCost; + int ScalarCost = IL.size() * ScalarEltCost; - VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + VectorType *SrcVecTy = VectorType::get(SrcTy, IL.size()); int VecCost = 0; // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(IL[0]->MainOpcode, VecTy, + SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2213,14 +2359,17 @@ case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. - int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, - Builder.getInt1Ty(), VL0); + int ScalarEltCost = TTI->getCmpSelInstrCost(IL[0]->MainOpcode, + ScalarTy, + Builder.getInt1Ty(), + VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } - VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), IL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(IL[0]->MainOpcode, VecTy, + MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Add: @@ -2257,9 +2406,10 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. - ConstantInt *CInt0 = nullptr; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - const Instruction *I = cast(VL[i]); + ConstantInt *CInt0 = dyn_cast( + IL[0]->getOpInst()->getOperand(1)); + for (auto *IOP : IL) { + auto *I = IOP->getOpInst(); ConstantInt *CInt = dyn_cast(I->getOperand(1)); if (!CInt) { Op2VK = TargetTransformInfo::OK_AnyValue; @@ -2269,23 +2419,20 @@ if (Op2VP == TargetTransformInfo::OP_PowerOf2 && !CInt->getValue().isPowerOf2()) Op2VP = TargetTransformInfo::OP_None; - if (i == 0) { - CInt0 = CInt; - continue; - } if (CInt0 != CInt) Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } SmallVector Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + IL[0]->MainOpcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost(IL[0]->MainOpcode, VecTy, + Op1VK, Op2VK, Op1VP, Op2VP, + Operands); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -2297,7 +2444,7 @@ int ScalarEltCost = TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; int VecCost = @@ -2310,7 +2457,7 @@ int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; int VecLdCost = @@ -2328,7 +2475,7 @@ int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; int VecStCost = @@ -2351,7 +2498,7 @@ int ScalarEltCost = TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; @@ -2366,44 +2513,42 @@ return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(IL[0]->isAltShuffle() && + ((Instruction::isBinaryOp(IL[0]->MainOpcode) && + Instruction::isBinaryOp(IL[0]->AltOpcode)) || + (Instruction::isCast(IL[0]->MainOpcode) && + Instruction::isCast(IL[0]->AltOpcode))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { - Instruction *I = cast(VL[Idx]); ReuseShuffleCost -= TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + IL[Idx]->getOpInst(), TargetTransformInfo::TCK_RecipThroughput); } - for (Value *V : VL) { - Instruction *I = cast(V); + for (auto *IOP : IL) { ReuseShuffleCost += TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + IOP->getOpInst(), TargetTransformInfo::TCK_RecipThroughput); } } - for (Value *i : VL) { - Instruction *I = cast(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + for (auto *IOP : IL) { + assert(IOP->isOpcodeOrAlt() && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); + IOP->getOpInst(), TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(IL[0]->MainOpcode)) { + VecCost = TTI->getArithmeticInstrCost(IL[0]->MainOpcode, VecTy); + VecCost += TTI->getArithmeticInstrCost(IL[0]->AltOpcode, VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); - VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); - VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + Type *Src0SclTy = IL[0]->MainOp->getOperand(0)->getType(); + Type *Src1SclTy = IL[0]->AltOp->getOperand(0)->getType(); + VectorType *Src0Ty = VectorType::get(Src0SclTy, IL.size()); + VectorType *Src1Ty = VectorType::get(Src1SclTy, IL.size()); + VecCost = TTI->getCastInstrCost(IL[0]->MainOpcode, VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(IL[0]->AltOpcode, VecTy, + Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -2437,6 +2582,23 @@ return true; } +Instruction *BoUpSLP::propagateMetadata(Instruction *I, + ArrayRef IL) { + ValueList VL; + for (auto *IOP: IL) + VL.push_back(IOP->getOp()); + return llvm::propagateMetadata(I, VL); +} + +void BoUpSLP::propagateIRFlags(Value *I, + ArrayRef IL, + Value *OpValue) { + ValueList VL; + for (auto *IOP: IL) + VL.push_back(IOP->getOp()); + llvm::propagateIRFlags(I, VL, OpValue); +} + bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. @@ -2469,7 +2631,7 @@ Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast(N.Scalars[0]); + Instruction *Inst = N.Scalars[0]->getOpInst(); if (!Inst) continue; @@ -2481,8 +2643,9 @@ // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && getTreeEntry(&*J)) - LiveValues.insert(cast(&*J)); + auto *I = dyn_cast(&*J); + if (I && getTreeEntry(I)) + LiveValues.insert(I); } LLVM_DEBUG({ @@ -2553,8 +2716,8 @@ int C = getEntryCost(&TE); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for bundle that starts with " << *TE.Scalars[0] - << ".\n"); + << " for bundle that starts with " + << *TE.Scalars[0]->getOp() << ".\n"); Cost += C; } @@ -2575,7 +2738,7 @@ // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].Scalars[0]->getOp(); if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto Extend = @@ -2618,21 +2781,22 @@ return Cost; } -int BoUpSLP::getGatherCost(ArrayRef VL) { - // Find the type of the operands in VL. - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) +int BoUpSLP::getGatherCost(ArrayRef IL) { + // Find the type of the operands in IL. + Value *V = IL[0]->getOp(); + Type *ScalarTy = V->getType(); + if (StoreInst *SI = dyn_cast(V)) ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + VectorType *VecTy = VectorType::get(ScalarTy, IL.size()); // Find the cost of inserting/extracting values from the vector. // Check if the same elements are inserted several times and count them as // shuffle candidates. DenseSet ShuffledElements; DenseSet UniqueElements; // Iterate in reverse order to consider insert elements with the high cost. - for (unsigned I = VL.size(); I > 0; --I) { + for (unsigned I = IL.size(); I > 0; --I) { unsigned Idx = I - 1; - if (!UniqueElements.insert(VL[Idx]).second) + if (!UniqueElements.insert(IL[Idx]->getOp()).second) ShuffledElements.insert(Idx); } return getGatherCost(VecTy, ShuffledElements); @@ -2647,25 +2811,24 @@ // load a[3] + load b[3] // Reordering the second load b[1] load a[1] would allow us to vectorize this // code. -void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef VL, +void BoUpSLP::reorderAltShuffleOperands(ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right) { // Push left and right operands of binary operation into Left and Right - for (Value *V : VL) { - auto *I = cast(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); + for (auto *Inst : IL) { + auto *I = Inst->getOpInst(); + assert(Inst->isOpcodeOrAlt() && "Incorrect instruction in vector"); Left.push_back(I->getOperand(0)); Right.push_back(I->getOperand(1)); } // Reorder if we have a commutative operation and consecutive access // are on either side of the alternate instructions. - for (unsigned j = 0; j < VL.size() - 1; ++j) { + for (unsigned j = 0; j < IL.size() - 1; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { - Instruction *VL1 = cast(VL[j]); - Instruction *VL2 = cast(VL[j + 1]); + Instruction *VL1 = IL[j]->getOpInst(); + Instruction *VL2 = IL[j + 1]->getOpInst(); if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j], Right[j]); continue; @@ -2679,8 +2842,8 @@ } if (LoadInst *L = dyn_cast(Right[j])) { if (LoadInst *L1 = dyn_cast(Left[j + 1])) { - Instruction *VL1 = cast(VL[j]); - Instruction *VL2 = cast(VL[j + 1]); + Instruction *VL1 = IL[j]->getOpInst(); + Instruction *VL2 = IL[j + 1]->getOpInst(); if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j], Right[j]); continue; @@ -2762,14 +2925,13 @@ return false; } -void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, - ArrayRef VL, +void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right) { - if (!VL.empty()) { + if (!IL.empty()) { // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. - auto *I = cast(VL[0]); + auto *I = IL[0]->getOpInst(); Value *VLeft = I->getOperand(0); Value *VRight = I->getOperand(1); if (!isa(VRight) && isa(VLeft)) @@ -2778,6 +2940,7 @@ Left.push_back(VLeft); Right.push_back(VRight); } + unsigned Opcode = IL[0]->MainOpcode; // Keep track if we have instructions with all the same opcode on one side. bool AllSameOpcodeLeft = isa(Left[0]); @@ -2786,8 +2949,8 @@ bool SplatLeft = true; bool SplatRight = true; - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - Instruction *I = cast(VL[i]); + for (unsigned i = 1, e = IL.size(); i != e; ++i) { + Instruction *I = IL[i]->getOpInst(); assert(((I->getOpcode() == Opcode && I->isCommutative()) || (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) && "Can only process commutative instruction"); @@ -2834,7 +2997,7 @@ // add a[1],c[2] load b[1] // b[2] load b[2] // add a[3],c[3] load b[3] - for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) { + for (unsigned j = 0, e = IL.size() - 1; j < e; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { if (isConsecutiveAccess(L, L1, *DL, *SE)) { @@ -2855,31 +3018,27 @@ } } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL, - const InstructionsState &S) { +void BoUpSLP::setInsertPointAfterBundle(ArrayRef IL) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. - auto *Front = cast(S.OpValue); + auto *Front = IL[0]->MainOp; auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { - auto *I = cast(V); - return !S.isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of( + make_range(IL.begin(), IL.end()), [=](InstructionOrPseudo *I) -> bool { + return !I->isOpcodeOrAlt() || I->getOpInst()->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; - // Find the last instruction. The common case should be that BB has been - // scheduled, and the last instruction is VL.back(). So we start with - // VL.back() and iterate over schedule data until we reach the end of the - // bundle. The end of the bundle is marked by null ScheduleData. + // Find the last instruction. If the bundle is not scheduled then + // the first in the bundle is the last one in BB, because we discover + // bundles in backward walk. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlockScheduling *BS = BlocksSchedules[BB].get(); + auto *Bundle = BS->getScheduleData(IL[0]); if (Bundle && Bundle->isPartOfBundle()) - for (; Bundle; Bundle = Bundle->NextInBundle) - if (Bundle->OpValue == Bundle->Inst) - LastInst = Bundle->Inst; + LastInst = Bundle->FirstInBundle->Inst; } // LastInst can still be null at this point if there's either not an entry @@ -2901,9 +3060,11 @@ // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). if (!LastInst) { - SmallPtrSet Bundle(VL.begin(), VL.end()); + SmallPtrSet Bundle; + for (auto *IOP : IL) + Bundle.insert(IOP->getOp()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I)) + if (Bundle.erase(&I)) LastInst = &I; if (Bundle.empty()) break; @@ -2916,22 +3077,23 @@ Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } -Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { +Value *BoUpSLP::Gather(ArrayRef IL, VectorType *Ty) { Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. for (unsigned i = 0; i < Ty->getNumElements(); ++i) { - Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); + auto *Op = IL[i]->getOp(); + Vec = Builder.CreateInsertElement(Vec, Op, Builder.getInt32(i)); if (Instruction *Insrt = dyn_cast(Vec)) { GatherSeq.insert(Insrt); CSEBlocks.insert(Insrt->getParent()); // Add to our 'need-to-extract' list. - if (TreeEntry *E = getTreeEntry(VL[i])) { + if (TreeEntry *E = getTreeEntry(Op)) { // Find which lane we need to extract. int FoundLane = -1; for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) { // Is this the lane of the scalar that we are looking for ? - if (E->Scalars[Lane] == VL[i]) { + if (E->Scalars[Lane]->isEqual(IL[i])) { FoundLane = Lane; break; } @@ -2942,7 +3104,7 @@ std::distance(E->ReuseShuffleIndices.begin(), llvm::find(E->ReuseShuffleIndices, FoundLane)); } - ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); + ExternalUses.push_back(ExternalUser(Op, Insrt, FoundLane)); } } } @@ -2950,29 +3112,33 @@ return Vec; } -Value *BoUpSLP::vectorizeTree(ArrayRef VL) { - InstructionsState S = getSameOpcode(VL); - if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { - if (E->isSame(VL)) { - Value *V = vectorizeTree(E); - if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { - // We need to get the vectorized value but without shuffle. - if (auto *SV = dyn_cast(V)) { - V = SV->getOperand(0); - } else { - // Reshuffle to get only unique values. - SmallVector UniqueIdxs; - SmallSet UsedIdxs; - for(unsigned Idx : E->ReuseShuffleIndices) - if (UsedIdxs.insert(Idx).second) +Value *BoUpSLP::vectorizeTree(Value *Parent, ArrayRef VL) { + IOPList IL; + for (Value *V: VL) + IL.push_back(allocInstructionOrPseudo(Parent, V)); + InstructionsState S = getSameOpcode(IL); + for (InstructionOrPseudo *IOP: IL) + IOP->setInstructionsState(S); + if (IL[0]->MainOp) { + TreeEntry *E = getTreeEntry(IL[0]->MainOp); + if (E && E->isSame(IL)) { + Value *V = vectorizeTree(E); + if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { + // We need to get the vectorized value but without shuffle. + if (auto *SV = dyn_cast(V)) { + V = SV->getOperand(0); + } else { + // Reshuffle to get only unique values. + SmallVector UniqueIdxs; + SmallSet UsedIdxs; + for (unsigned Idx : E->ReuseShuffleIndices) + if (UsedIdxs.insert(Idx).second) UniqueIdxs.emplace_back(Idx); - V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), - UniqueIdxs); - } + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), + UniqueIdxs); } - return V; } + return V; } } @@ -2982,26 +3148,27 @@ // Check that every instruction appears once in this bundle. SmallVector ReuseShuffleIndicies; - SmallVector UniqueValues; - if (VL.size() > 2) { + IOPList UniqueValues; + if (IL.size() > 2) { DenseMap UniquePositions; - for (Value *V : VL) { + for (auto *I : IL) { + Value *V = I->getOp(); auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); if (Res.second || isa(V)) - UniqueValues.emplace_back(V); + UniqueValues.emplace_back(I); } // Do not shuffle single element or if number of unique values is not power // of 2. - if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 || + if (UniqueValues.size() == IL.size() || UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) ReuseShuffleIndicies.clear(); else - VL = UniqueValues; + IL = UniqueValues; } - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + VectorType *VecTy = VectorType::get(ScalarTy, IL.size()); - Value *V = Gather(VL, VecTy); + Value *V = Gather(IL, VecTy); if (!ReuseShuffleIndicies.empty()) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), ReuseShuffleIndicies, "shuffle"); @@ -3026,12 +3193,12 @@ IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " + << *E->Scalars[0]->MainOp << ".\n"); return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast(S.OpValue); + auto *VL0 = E->Scalars[0]->MainOp; Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3040,7 +3207,7 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3054,8 +3221,8 @@ return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = E->Scalars[0]->isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : E->Scalars[0]->MainOpcode; switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -3083,12 +3250,13 @@ } // Prepare the operand vector. - for (Value *V : E->Scalars) - Operands.push_back(cast(V)->getIncomingValueForBlock(IBB)); + for (auto *IOP : E->Scalars) + Operands.push_back( + cast(IOP->getOp())->getIncomingValueForBlock(IBB)); Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(Operands); + Value *Vec = vectorizeTree(VL0, Operands); NewPhi->addIncoming(Vec, IBB); } @@ -3117,7 +3285,7 @@ E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3152,7 +3320,7 @@ E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3178,12 +3346,12 @@ case Instruction::FPTrunc: case Instruction::BitCast: { ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast(V)->getOperand(0)); + for (auto *IOP : E->Scalars) + INVL.push_back(IOP->getOpInst()->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *InVec = vectorizeTree(INVL); + Value *InVec = vectorizeTree(VL0, INVL); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3203,15 +3371,16 @@ case Instruction::FCmp: case Instruction::ICmp: { ValueList LHSV, RHSV; - for (Value *V : E->Scalars) { - LHSV.push_back(cast(V)->getOperand(0)); - RHSV.push_back(cast(V)->getOperand(1)); + for (auto *IOP : E->Scalars) { + auto *I = IOP->getOpInst(); + LHSV.push_back(I->getOperand(0)); + RHSV.push_back(I->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *L = vectorizeTree(LHSV); - Value *R = vectorizeTree(RHSV); + Value *L = vectorizeTree(VL0, LHSV); + Value *R = vectorizeTree(VL0, RHSV); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3220,7 +3389,7 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->Scalars[0]->MainOpcode == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3236,17 +3405,18 @@ } case Instruction::Select: { ValueList TrueVec, FalseVec, CondVec; - for (Value *V : E->Scalars) { - CondVec.push_back(cast(V)->getOperand(0)); - TrueVec.push_back(cast(V)->getOperand(1)); - FalseVec.push_back(cast(V)->getOperand(2)); + for (auto *IOP : E->Scalars) { + auto *I = IOP->getOpInst(); + CondVec.push_back(I->getOperand(0)); + TrueVec.push_back(I->getOperand(1)); + FalseVec.push_back(I->getOperand(2)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *Cond = vectorizeTree(CondVec); - Value *True = vectorizeTree(TrueVec); - Value *False = vectorizeTree(FalseVec); + Value *Cond = vectorizeTree(VL0, CondVec); + Value *True = vectorizeTree(VL0, TrueVec); + Value *False = vectorizeTree(VL0, FalseVec); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3281,20 +3451,19 @@ case Instruction::Or: case Instruction::Xor: { ValueList LHSVL, RHSVL; - if (isa(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, - RHSVL); - else - for (Value *V : E->Scalars) { - auto *I = cast(V); - LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); - } + if (isa(VL0) && VL0->isCommutative()) { + reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); + } else + for (auto *IOP : E->Scalars) { + auto *I = IOP->getOpInst(); + LHSVL.push_back(I->getOperand(0)); + RHSVL.push_back(I->getOperand(1)); + } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); + Value *LHS = vectorizeTree(VL0, LHSVL); + Value *RHS = vectorizeTree(VL0, RHSVL); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3302,7 +3471,7 @@ } Value *V = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(VL0->getOpcode()), LHS, RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); @@ -3321,10 +3490,11 @@ // sink them all the way down past store instructions. bool IsReorder = !E->ReorderIndices.empty(); if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); + InstructionsState S = getSameOpcode(E->Scalars, + E->ReorderIndices.front()); VL0 = cast(S.OpValue); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3368,12 +3538,13 @@ unsigned AS = SI->getPointerAddressSpace(); ValueList ScalarStoreValues; - for (Value *V : E->Scalars) - ScalarStoreValues.push_back(cast(V)->getValueOperand()); + for (auto *IOP : E->Scalars) + ScalarStoreValues.push_back( + cast(IOP->getOpInst())->getValueOperand()); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *VecValue = vectorizeTree(ScalarStoreValues); + Value *VecValue = vectorizeTree(VL0, ScalarStoreValues); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); @@ -3398,22 +3569,26 @@ return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); ValueList Op0VL; - for (Value *V : E->Scalars) - Op0VL.push_back(cast(V)->getOperand(0)); + for (auto *IOP : E->Scalars) { + auto *I = IOP->getOpInst(); + Op0VL.push_back(cast(I)->getOperand(0)); + } - Value *Op0 = vectorizeTree(Op0VL); + Value *Op0 = vectorizeTree(VL0, Op0VL); std::vector OpVecs; for (int j = 1, e = cast(VL0)->getNumOperands(); j < e; ++j) { ValueList OpVL; - for (Value *V : E->Scalars) - OpVL.push_back(cast(V)->getOperand(j)); + for (auto *IOP : E->Scalars) { + auto *I = IOP->getOpInst(); + OpVL.push_back(cast(I)->getOperand(j)); + } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(VL0, OpVL); OpVecs.push_back(OpVec); } @@ -3433,7 +3608,7 @@ } case Instruction::Call: { CallInst *CI = cast(VL0); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; @@ -3451,12 +3626,12 @@ OpVecs.push_back(CEI->getArgOperand(j)); continue; } - for (Value *V : E->Scalars) { - CallInst *CEI = cast(V); + for (auto *IOP : E->Scalars) { + CallInst *CEI = cast(IOP->getOpInst()); OpVL.push_back(CEI->getArgOperand(j)); } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(VL0, OpVL); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -3486,25 +3661,25 @@ } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->Scalars[0]->isAltShuffle() && + ((Instruction::isBinaryOp(E->Scalars[0]->MainOpcode) && + Instruction::isBinaryOp(E->Scalars[0]->AltOpcode)) || + (Instruction::isCast(E->Scalars[0]->MainOpcode) && + Instruction::isCast(E->Scalars[0]->AltOpcode))) && "Invalid Shuffle Vector Operand"); Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); - setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(LHSVL); - RHS = vectorizeTree(RHSVL); + if (Instruction::isBinaryOp(E->Scalars[0]->MainOpcode)) { + reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); + setInsertPointAfterBundle(E->Scalars); + LHS = vectorizeTree(VL0, LHSVL); + RHS = vectorizeTree(VL0, RHSVL); } else { ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(INVL); + for (auto *IOP : E->Scalars) + INVL.push_back(IOP->getOpInst()->getOperand(0)); + setInsertPointAfterBundle(E->Scalars); + LHS = vectorizeTree(VL0, INVL); } if (E->VectorizedValue) { @@ -3513,28 +3688,31 @@ } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->Scalars[0]->MainOpcode)) { V0 = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(E->Scalars[0]->MainOpcode), LHS, + RHS); V1 = Builder.CreateBinOp( - static_cast(S.getAltOpcode()), LHS, RHS); + static_cast(E->Scalars[0]->AltOpcode), LHS, + RHS); } else { V0 = Builder.CreateCast( - static_cast(S.getOpcode()), LHS, VecTy); + static_cast(E->Scalars[0]->MainOpcode), LHS, + VecTy); V1 = Builder.CreateCast( - static_cast(S.getAltOpcode()), LHS, VecTy); + static_cast(E->Scalars[0]->AltOpcode), LHS, + VecTy); } // Create shuffle to take alternate operations from the vector. // Also, gather up main and alt scalar ops to propagate IR flags to // each vector operation. - ValueList OpScalars, AltScalars; + IOPList OpScalars, AltScalars; unsigned e = E->Scalars.size(); SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { - auto *OpInst = cast(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + auto *I = E->Scalars[i]->getOpInst(); + if (I->getOpcode() == E->Scalars[0]->AltOpcode) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -3544,8 +3722,10 @@ } Value *ShuffleMask = ConstantVector::get(Mask); - propagateIRFlags(V0, OpScalars); - propagateIRFlags(V1, AltScalars); + InstructionsState S = getSameOpcode(OpScalars); + propagateIRFlags(V0, OpScalars, S.OpValue); + S = getSameOpcode(AltScalars); + propagateIRFlags(V1, AltScalars, S.OpValue); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); if (Instruction *I = dyn_cast(V)) @@ -3572,9 +3752,10 @@ Value * BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { - // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + eliminatePseudos(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); @@ -3583,7 +3764,7 @@ // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].Scalars[0]->MainOp; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); @@ -3698,7 +3879,7 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { - Value *Scalar = Entry->Scalars[Lane]; + Value *Scalar = Entry->Scalars[Lane]->getOp(); Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { @@ -3707,7 +3888,8 @@ LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); // It is legal to replace users in the ignorelist by undef. - assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && + assert((getTreeEntry(U) || + is_contained(UserIgnoreList, U)) && "Replacing out-of-tree value with undef"); } #endif @@ -3809,10 +3991,9 @@ // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. -bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, - BoUpSLP *SLP, - const InstructionsState &S) { - if (isa(S.OpValue)) +bool BoUpSLP::BlockScheduling::tryScheduleBundle( + ArrayRef IL, BoUpSLP *SLP) { + if (isa(IL[0]->MainOp)) return true; // Initialize the instruction bundle. @@ -3820,19 +4001,25 @@ ScheduleData *PrevInBundle = nullptr; ScheduleData *Bundle = nullptr; bool ReSchedule = false; - LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); + bool IsPseudo = false; + LLVM_DEBUG(dbgs() << "SLP: bundle: " << *IL[0]->MainOp << "\n"); // Make sure that the scheduling region contains all // instructions of the bundle. - for (Value *V : VL) { - if (!extendSchedulingRegion(V, S)) + for (InstructionOrPseudo *IOP : IL) + if (!extendSchedulingRegion(IOP)) return false; - } - for (Value *V : VL) { - ScheduleData *BundleMember = getScheduleData(V); + for (InstructionOrPseudo *IOP : IL) { + ScheduleData *BundleMember = + ScheduleDataMap[IOP->getOpInst()][std::make_pair(nullptr, 0)]; + if (BundleMember->isPartOfBundle()) { + BundleMember = getScheduleData(IOP); + IsPseudo = true; + } assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); + assert(!BundleMember->isPartOfBundle() && "Already part of another bundle"); if (BundleMember->IsScheduled) { // A bundle member was scheduled as single instruction before and now // needs to be scheduled as part of the bundle. We just get rid of the @@ -3849,13 +4036,15 @@ Bundle = BundleMember; } BundleMember->UnscheduledDepsInBundle = 0; + BundleMember->Opcode = IOP->MainOpcode; + BundleMember->Parent = IOP->getParent(); Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; // Group the instructions to a bundle. BundleMember->FirstInBundle = Bundle; PrevInBundle = BundleMember; } - if (ScheduleEnd != OldScheduleEnd) { + if (ScheduleEnd != OldScheduleEnd || IsPseudo) { // The scheduling region got new instructions at the lower end (or it is a // new region for the first bundle). This makes it necessary to // recalculate all dependencies. @@ -3892,18 +4081,29 @@ } } if (!Bundle->isReady()) { - cancelScheduling(VL, S.OpValue); + cancelScheduling(IL[0]); + // We have to clear all dependencies, since all values + // were calculated for the vectorized bundle. + for (auto *I = ScheduleStart; I != ScheduleEnd; + I = I->getNextNode()) { + doForAllOpcodes(I, [](ScheduleData *SD) { + SD->clearDependencies(); + }); + } + resetSchedule(); return false; } return true; } -void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, - Value *OpValue) { - if (isa(OpValue)) +void BoUpSLP::BlockScheduling::cancelScheduling(InstructionOrPseudo *IOP) { + if (!IOP->MainOp) return; - - ScheduleData *Bundle = getScheduleData(OpValue); + if (isa(IOP->MainOp)) + return; + auto Key = IOP->getKey(); + ScheduleData *Bundle = getScheduleData(IOP)->FirstInBundle; + assert(Bundle && "Counld not find bundle"); LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); @@ -3913,13 +4113,23 @@ // Un-bundle: make single instructions out of the bundle. ScheduleData *BundleMember = Bundle; while (BundleMember) { - assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); + assert(BundleMember->FirstInBundle == Bundle && "Corrupt bundle links"); + assert(BundleMember->Parent == Key.first && + BundleMember->Opcode == Key.second && "Corrupt bundle"); BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; - if (BundleMember->UnscheduledDepsInBundle == 0) { - ReadyInsts.insert(BundleMember); + if (BundleMember->IsPseudo) { + ScheduleDataMap[BundleMember->Inst].erase(Key); + BundleMember->Opcode = 0; + BundleMember->Parent = nullptr; + } else { + BundleMember->Opcode = 0; + BundleMember->Parent = nullptr; + if (BundleMember->UnscheduledDepsInBundle == 0) { + ReadyInsts.insert(BundleMember); + } } BundleMember = Next; } @@ -3934,23 +4144,22 @@ return &(ScheduleDataChunks.back()[ChunkPos++]); } -bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, - const InstructionsState &S) { - if (getScheduleData(V, isOneOf(S, V))) +bool BoUpSLP::BlockScheduling::extendSchedulingRegion( + InstructionOrPseudo *IOP) { + if (getScheduleData(IOP)) return true; - Instruction *I = dyn_cast(V); - assert(I && "bundle member must be an instruction"); - assert(!isa(I) && "phi nodes don't need to be scheduled"); - auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { - ScheduleData *ISD = getScheduleData(I); - if (!ISD) + auto *I = IOP->getOpInst(); + assert(!isa(I) + && "phi nodes don't need to be scheduled"); + auto &&CheckSheduleForI = [this, &IOP](Instruction *I) -> bool { + ScheduleData *ISD = ScheduleDataMap[I][std::make_pair(nullptr, 0)]; + if (!ISD || ISD->SchedulingRegionID != SchedulingRegionID) return false; - assert(isInSchedulingRegion(ISD) && - "ScheduleData not in scheduling region"); - ScheduleData *SD = allocateScheduleDataChunks(); - SD->Inst = I; - SD->init(SchedulingRegionID, S.OpValue); - ExtraScheduleDataMap[I][S.OpValue] = SD; + if (ISD->isPartOfBundle()) { + ScheduleData *SD = allocateScheduleDataChunks(); + SD->init(SchedulingRegionID, ISD, IOP); + ScheduleDataMap[I][IOP->getKey()] = SD; + } return true; }; if (CheckSheduleForI(I)) @@ -3960,10 +4169,9 @@ initScheduleData(I, I->getNextNode(), nullptr, nullptr); ScheduleStart = I; ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a terminator?"); - LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); + LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I + << "\n"); return true; } // Search up and down at the same time, because we don't know if the new @@ -3981,23 +4189,20 @@ if (UpIter != UpperEnd) { if (&*UpIter == I) { - initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); + initScheduleData(I, ScheduleStart, nullptr, + FirstLoadStoreInRegion); ScheduleStart = I; - if (isOneOf(S, I) != I) - CheckSheduleForI(I); - LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I - << "\n"); + LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " + << *I << "\n"); return true; } UpIter++; } if (DownIter != LowerEnd) { if (&*DownIter == I) { - initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, - nullptr); + initScheduleData(ScheduleEnd, I->getNextNode(), + LastLoadStoreInRegion, nullptr); ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a terminator?"); LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); @@ -4011,21 +4216,20 @@ return true; } -void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, - Instruction *ToI, - ScheduleData *PrevLoadStore, - ScheduleData *NextLoadStore) { +void BoUpSLP::BlockScheduling::initScheduleData( + Instruction *FromI, Instruction *ToI, ScheduleData *PrevLoadStore, + ScheduleData *NextLoadStore) { ScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { - ScheduleData *SD = ScheduleDataMap[I]; + ScheduleData *SD = ScheduleDataMap[I][std::make_pair(nullptr, 0)]; if (!SD) { SD = allocateScheduleDataChunks(); - ScheduleDataMap[I] = SD; + ScheduleDataMap[I][std::make_pair(nullptr, 0)] = SD; SD->Inst = I; } assert(!isInSchedulingRegion(SD) && "new ScheduleData already in scheduling region"); - SD->init(SchedulingRegionID, I); + SD->init(SchedulingRegionID); if (I->mayReadOrWriteMemory() && (!isa(I) || @@ -4060,8 +4264,13 @@ WorkList.pop_back(); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - assert(isInSchedulingRegion(BundleMember)); + assert(isInSchedulingRegion(BundleMember) && + BundleMember->Opcode == Opcode && + BundleMember->Parent == Parent && "Corrupt bundle member"); + if (!BundleMember->hasValidDependencies()) { LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember @@ -4070,35 +4279,22 @@ BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. - if (BundleMember->OpValue != BundleMember->Inst) { - ScheduleData *UseSD = getScheduleData(BundleMember->Inst); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - for (User *U : BundleMember->Inst->users()) { - if (isa(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and - // eventually disable vectorization. + for (User *U : BundleMember->Inst->users()) { + if (auto *I = dyn_cast(U)) { + doForAllOpcodes(I, [&](ScheduleData *UseSD) { BundleMember->Dependencies++; - BundleMember->incrementUnscheduledDeps(1); - } + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + }); + } else { + // I'm not sure if this can ever happen. But we need to be safe. + // This lets the instruction/bundle never be scheduled and + // eventually disable vectorization. + BundleMember->Dependencies++; + BundleMember->incrementUnscheduledDeps(1); } } @@ -4107,7 +4303,7 @@ if (DepDest) { Instruction *SrcInst = BundleMember->Inst; MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); - bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); + bool SrcMayWrite = SrcInst->mayWriteToMemory(); unsigned numAliased = 0; unsigned DistToSrc = 1; @@ -4131,15 +4327,17 @@ // balance between reduced runtime and accurate dependencies. numAliased++; - DepDest->MemoryDependencies.push_back(BundleMember); - BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); - } + doForAllOpcodes(DepDest->Inst, [&](ScheduleData *Dep) { + Dep->MemoryDependencies.push_back(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = Dep->FirstInBundle; + if (!DestBundle->IsScheduled) { + BundleMember->incrementUnscheduledDeps(1); + } + if (!DestBundle->hasValidDependencies()) { + WorkList.push_back(DestBundle); + } + }); } DepDest = DepDest->NextLoadStore; @@ -4176,9 +4374,7 @@ assert(ScheduleStart && "tried to reset schedule on block which has not been scheduled"); for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - doForAllOpcodes(I, [&](ScheduleData *SD) { - assert(isInSchedulingRegion(SD) && - "ScheduleData not in scheduling region"); + doForAllOpcodes(I, [](ScheduleData *SD) { SD->IsScheduled = false; SD->resetUnscheduledDeps(); }); @@ -4186,6 +4382,84 @@ ReadyInsts.clear(); } +void BoUpSLP::eliminatePseudos(BlockScheduling *BS) { + if (!BS->ScheduleStart) + return; + bool Changed = false; + for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + SmallSet, 2> Keys; + SmallDenseMap NewOps; + BS->doForAllOpcodes(I, [&](ScheduleData *SD) { + if (!SD->IsPseudo) + return; + std::pair Key = std::make_pair( + SD->Parent, SD->Opcode); + // Remove any pseudo instruction that + // remaines after bundles construction. + if (!SD->isPartOfBundle()) { + Keys.insert(Key); + return; + } + Changed = true; + + // Emitting a real instruction. + Builder.SetInsertPoint(I->getNextNode()); + Instruction *NewOp = nullptr; + NewOp = I->clone(); + BS->BB->getInstList().push_back(NewOp); + NewOp->moveAfter(I); + LLVM_DEBUG(dbgs() << "SLP: Replace pseudo instruction" + << *I << " with " << *NewOp + << " in " << *SD->FirstInBundle << "\n"); + + // Change instruction in ScheduleData. + SD->Inst = NewOp; + SD->IsPseudo = false; + + // Remove the pseudo instruction from ScheduleDataMap + // and add the real instruction. + NewOps[NewOp] = SD; + Keys.insert(Key); + int Idx = ScalarToTreeEntry[I][Key]; + ScalarToTreeEntry[NewOp][Key] = Idx; + ScalarToTreeEntry[I].erase(Key); + TreeEntry *E = &VectorizableTree[Idx]; + llvm::propagateIRFlags(NewOp, E->Scalars[0]->MainOp); + propagateMetadata(NewOp, E->Scalars); + + // Replace instruction in TreeEntry. + for (auto *IOP : E->Scalars) { + if (IOP->getOpInst() == I) + IOP->replaceWith(NewOp); + } + }); + for (auto Key : Keys) + BS->ScheduleDataMap[I].erase(Key); + for (auto Ops : NewOps) { + auto *SD = Ops.second; + BS->ScheduleDataMap[Ops.first][std::make_pair(SD->Parent, SD->Opcode)] = + SD; + } + } + + // We have to recalulate all dependancies in + // the scheduling region if any pseudo instructions + // were replaced. + if (Changed) { + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + BS->doForAllOpcodes(I, [&](ScheduleData *SD) { + SD->clearDependencies(); + }); + } + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -4211,8 +4485,11 @@ for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { + std::pair Key = std::make_pair(SD->Parent, SD->Opcode); assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst) != nullptr) && + (ScalarToTreeEntry.find(SD->Inst) != ScalarToTreeEntry.end() && + ScalarToTreeEntry[SD->Inst].find(Key) != + ScalarToTreeEntry[SD->Inst].end()) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { @@ -4233,20 +4510,33 @@ // Move the scheduled instruction(s) to their dedicated places, if not // there yet. ScheduleData *BundleMember = picked; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - Instruction *pickedInst = BundleMember->Inst; - if (LastScheduledInst->getNextNode() != pickedInst) { - BS->BB->getInstList().remove(pickedInst); + assert(Opcode == BundleMember->Opcode && + Parent == BundleMember->Parent && "Corrupt bundle member"); + Instruction *PickedInst = BundleMember->Inst; + if (LastScheduledInst != PickedInst) { + BS->BB->getInstList().remove(PickedInst); BS->BB->getInstList().insert(LastScheduledInst->getIterator(), - pickedInst); + PickedInst); } - LastScheduledInst = pickedInst; + LastScheduledInst = PickedInst; BundleMember = BundleMember->NextInBundle; } - BS->schedule(picked, ReadyInsts); NumToSchedule--; } +#ifndef NDEBUG + if (NumToSchedule != 0) { + for (BasicBlock::iterator I = BS->BB->begin(), E = BS->BB->end(); I != E; + ++I) { + BS->doForAllOpcodes(&*I, [](ScheduleData *SD) { + LLVM_DEBUG(dbgs() << "SLP: Failed to schedule: " << *SD << "\n"); + }); + } + } +#endif assert(NumToSchedule == 0 && "could not schedule all instructions"); // Avoid duplicate scheduling of the block. @@ -4391,7 +4681,7 @@ // We only attempt to truncate integer expressions. auto &TreeRoot = VectorizableTree[0].Scalars; - auto *TreeRootIT = dyn_cast(TreeRoot[0]->getType()); + auto *TreeRootIT = dyn_cast(TreeRoot[0]->getOp()->getType()); if (!TreeRootIT) return; @@ -4401,7 +4691,9 @@ // This means that if a tree entry other than a root is used externally, it // must have multiple uses and InstCombine will not rewrite it. The code // below ensures that only the roots are used externally. - SmallPtrSet Expr(TreeRoot.begin(), TreeRoot.end()); + SmallPtrSet Expr; + for (auto *IOP : TreeRoot) + Expr.insert(IOP->getOp()); for (auto &EU : ExternalUses) if (!Expr.erase(EU.Scalar)) return; @@ -4412,13 +4704,16 @@ // context to determine which values can be demoted. If we see a truncation, // we mark it as seeding another demotion. for (auto &Entry : VectorizableTree) - Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end()); + for (auto *IOP: Entry.Scalars) + Expr.insert(IOP->getOp()); // Ensure the roots of the vectorizable tree don't form a cycle. They must // have a single external user that is not in the vectorizable tree. - for (auto *Root : TreeRoot) - if (!Root->hasOneUse() || Expr.count(*Root->user_begin())) + for (auto *Root : TreeRoot) { + Value *V = Root->getOp(); + if (!V->hasOneUse() || Expr.count(*V->user_begin())) return; + } // Conservatively determine if we can actually truncate the roots of the // expression. Collect the values that can be demoted in ToDemote and @@ -4426,7 +4721,7 @@ SmallVector ToDemote; SmallVector Roots; for (auto *Root : TreeRoot) - if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) + if (!collectValuesToDemote(Root->getOp(), Expr, ToDemote, Roots)) return; // The maximum bit width required to represent all the values that can be @@ -4437,7 +4732,7 @@ // We first check if all the bits of the roots are demanded. If they're not, // we can truncate the roots to this narrower type. for (auto *Root : TreeRoot) { - auto Mask = DB->getDemandedBits(cast(Root)); + auto Mask = DB->getDemandedBits(Root->getOpInst()); MaxBitWidth = std::max( Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth); } @@ -4456,17 +4751,18 @@ // We start by looking at each entry that can be demoted. We compute the // maximum bit width required to store the scalar by using ValueTracking to // compute the number of high-order bits we can truncate. - if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) && - llvm::all_of(TreeRoot, [](Value *R) { - assert(R->hasOneUse() && "Root should have only one use!"); - return isa(R->user_back()); + if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getOp()->getType()) && + llvm::all_of(TreeRoot, [](InstructionOrPseudo *R) { + Value *V = R->getOp(); + assert(V->hasOneUse() && "Root should have only one use!"); + return isa(V->user_back()); })) { MaxBitWidth = 8u; // Determine if the sign bit of all the roots is known to be zero. If not, // IsKnownPositive is set to False. - IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) { - KnownBits Known = computeKnownBits(R, *DL); + IsKnownPositive = llvm::all_of(TreeRoot, [&](InstructionOrPseudo *R) { + KnownBits Known = computeKnownBits(R->getOp(), *DL); return Known.isNonNegative(); }); @@ -4864,7 +5160,14 @@ // Check that all of the parts are scalar instructions of the same type, // we permit an alternate opcode via InstructionsState. - InstructionsState S = getSameOpcode(VL); + SmallVector ArrayIL; + SmallVector IL; + + for (Value *V: VL) + ArrayIL.push_back(InstructionOrPseudo(VL[0], V)); + for (unsigned idx=0; idx < ArrayIL.size(); idx++) + IL.push_back(&ArrayIL[idx]); + InstructionsState S = getSameOpcode(IL); if (!S.getOpcode()) return false; Index: test/Transforms/SLPVectorizer/X86/duplicate.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/duplicate.ll @@ -0,0 +1,118 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 < %s | FileCheck %s + +; Reusing %sub.i1247 operation in two different bundles. + +%class.j = type { %class.c* } +%class.c = type { [3 x double] } + +@h = dso_local local_unnamed_addr global i64 0, align 8 +@i = dso_local local_unnamed_addr global i64 0, align 8 + +; Function Attrs: uwtable +define dso_local void @_ZN1j1kEv(%class.j* nocapture readonly %this) local_unnamed_addr #0 align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: @_ZN1j1kEv( +; CHECK-NEXT: invoke.cont18: +; CHECK-NEXT: [[M:%.*]] = alloca [[CLASS_C:%.*]], align 8 +; CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[CLASS_C]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %class.c* [[M]] to i8* +; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull [[TMP0]]) +; CHECK-NEXT: [[L:%.*]] = getelementptr inbounds [[CLASS_J:%.*]], %class.j* [[THIS:%.*]], i64 0, i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load %class.c*, %class.c** [[L]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i64, i64* @h, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[TMP1]], i64 [[TMP2]], i32 0, i64 0 +; CHECK-NEXT: [[ARRAYIDX5_I49:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[TMP1]], i64 [[TMP2]], i32 0, i64 1 +; CHECK-NEXT: [[ARRAYIDX9_I51:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[TMP1]], i64 [[TMP2]], i32 0, i64 2 +; CHECK-NEXT: [[TMP4:%.*]] = load double, double* [[ARRAYIDX9_I51]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* @i, align 8 +; CHECK-NEXT: [[ARRAYIDX_I41:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[TMP1]], i64 [[TMP5]], i32 0, i64 0 +; CHECK-NEXT: [[ARRAYIDX5_I44:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[TMP1]], i64 [[TMP5]], i32 0, i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX_I41]] to <2 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = bitcast double* [[TMP3]] to <2 x double>* +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, <2 x double>* [[TMP8]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = fsub fast <2 x double> [[TMP7]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x double> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x double> undef, double [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x double> [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x double> [[TMP12]], double [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x double> , double [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = fmul fast <2 x double> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = fsub fast <2 x double> [[TMP14]], [[TMP15]] +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x double> [[TMP16]], <2 x double> [[TMP17]], <2 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x double> undef, double [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP21:%.*]] = fmul fast <2 x double> [[TMP10]], [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = fsub fast <2 x double> [[TMP21]], [[TMP18]] +; CHECK-NEXT: [[P_SROA_0_0__SROA_CAST65:%.*]] = bitcast %class.c* [[AGG_TMP]] to i64* +; CHECK-NEXT: store i64 4607182418800017408, i64* [[P_SROA_0_0__SROA_CAST65]], align 8 +; CHECK-NEXT: [[P_SROA_5_0__SROA_IDX68:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[AGG_TMP]], i64 0, i32 0, i64 1 +; CHECK-NEXT: [[P_SROA_6_0__SROA_IDX72:%.*]] = getelementptr inbounds [[CLASS_C]], %class.c* [[AGG_TMP]], i64 0, i32 0, i64 2 +; CHECK-NEXT: [[TMP23:%.*]] = bitcast double* [[P_SROA_5_0__SROA_IDX68]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP22]], <2 x double>* [[TMP23]], align 8 +; CHECK-NEXT: invoke void @_ZN1cIdEmlES0_(%class.c* nonnull [[M]], %class.c* nonnull [[AGG_TMP]]) +; CHECK-NEXT: to label [[INVOKE_CONT24:%.*]] unwind label [[LPAD23:%.*]] +; CHECK: invoke.cont24: +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull [[TMP0]]) +; CHECK-NEXT: ret void +; CHECK: lpad23: +; CHECK-NEXT: [[TMP24:%.*]] = landingpad { i8*, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull [[TMP0]]) +; CHECK-NEXT: resume { i8*, i32 } [[TMP24]] +; +invoke.cont18: + %m = alloca %class.c, align 8 + %agg.tmp = alloca %class.c, align 8 + %0 = bitcast %class.c* %m to i8* + call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %0) #3 + %l = getelementptr inbounds %class.j, %class.j* %this, i64 0, i32 0 + %1 = load %class.c*, %class.c** %l, align 8 + %2 = load i64, i64* @h, align 8 + %3 = getelementptr inbounds %class.c, %class.c* %1, i64 %2, i32 0, i64 0 + %4 = load double, double* %3, align 8 + %arrayidx5.i49 = getelementptr inbounds %class.c, %class.c* %1, i64 %2, i32 0, i64 1 + %arrayidx9.i51 = getelementptr inbounds %class.c, %class.c* %1, i64 %2, i32 0, i64 2 + %5 = load double, double* %arrayidx9.i51, align 8 + %6 = load i64, i64* @i, align 8 + %arrayidx.i41 = getelementptr inbounds %class.c, %class.c* %1, i64 %6, i32 0, i64 0 + %7 = load double, double* %arrayidx.i41, align 8 + %sub.i43 = fsub fast double %7, %4 + %arrayidx5.i44 = getelementptr inbounds %class.c, %class.c* %1, i64 %6, i32 0, i64 1 + %8 = load double, double* %arrayidx5.i44, align 8 + %9 = load double, double* %arrayidx5.i49, align 8 + %sub8.i = fsub fast double %8, %9 + %mul.i = fmul fast double %sub.i43, %5 + %mul8.i = fmul fast double %4, 2.000000e+00 + %sub.i = fsub fast double %mul.i, %mul8.i + %mul13.i = fmul fast double %sub8.i, %4 + %sub16.i = fsub fast double %mul13.i, %sub.i43 + %p.sroa.0.0..sroa_cast65 = bitcast %class.c* %agg.tmp to i64* + store i64 4607182418800017408, i64* %p.sroa.0.0..sroa_cast65, align 8 + %p.sroa.5.0..sroa_idx68 = getelementptr inbounds %class.c, %class.c* %agg.tmp, i64 0, i32 0, i64 1 + store double %sub.i, double* %p.sroa.5.0..sroa_idx68, align 8 + %p.sroa.6.0..sroa_idx72 = getelementptr inbounds %class.c, %class.c* %agg.tmp, i64 0, i32 0, i64 2 + store double %sub16.i, double* %p.sroa.6.0..sroa_idx72, align 8 + invoke void @_ZN1cIdEmlES0_(%class.c* nonnull %m, %class.c* nonnull %agg.tmp) + to label %invoke.cont24 unwind label %lpad23 + +invoke.cont24: ; preds = %invoke.cont18 + call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %0) #3 + ret void + +lpad23: ; preds = %invoke.cont18 + %10 = landingpad { i8*, i32 } + cleanup + call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %0) #3 + resume { i8*, i32 } %10 +} + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1 + +declare dso_local i32 @__gxx_personality_v0(...) + +; Function Attrs: argmemonly nounwind +declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1 + +declare dso_local void @_ZN1cIdEmlES0_(%class.c*, %class.c*) local_unnamed_addr #2