Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -173,40 +173,6 @@
          !Ty->isPPC_FP128Ty();
 }
 
-/// \returns true if all of the instructions in \p VL are in the same block or
-/// false otherwise.
-static bool allSameBlock(ArrayRef<Value *> VL) {
-  Instruction *I0 = dyn_cast<Instruction>(VL[0]);
-  if (!I0)
-    return false;
-  BasicBlock *BB = I0->getParent();
-  for (int i = 1, e = VL.size(); i < e; i++) {
-    Instruction *I = dyn_cast<Instruction>(VL[i]);
-    if (!I)
-      return false;
-
-    if (BB != I->getParent())
-      return false;
-  }
-  return true;
-}
-
-/// \returns True if all of the values in \p VL are constants.
-static bool allConstant(ArrayRef<Value *> VL) {
-  for (Value *i : VL)
-    if (!isa<Constant>(i))
-      return false;
-  return true;
-}
-
-/// \returns True if all of the values in \p VL are identical.
-static bool isSplat(ArrayRef<Value *> VL) {
-  for (unsigned i = 1, e = VL.size(); i < e; ++i)
-    if (VL[i] != VL[0])
-      return false;
-  return true;
-}
-
 /// Checks if the vector of instructions can be represented as a shuffle, like:
 /// %x0 = extractelement <4 x i8> %x, i32 0
 /// %x3 = extractelement <4 x i8> %x, i32 3
@@ -299,6 +265,47 @@
       : TargetTransformInfo::SK_PermuteSingleSrc;
 }
 
+static bool isRemainder(unsigned Opcode) {
+  return (Opcode == Instruction::URem ||
+          Opcode == Instruction::SRem || Opcode == Instruction::FRem);
+}
+
+/// Checks if the \p Opcode can be considered as an operand of a (possibly)
+/// binary operation \p I.
+/// \returns The code of the binary operation of instruction \p I if the
+/// instruction with \p Opcode can be considered as an operand of \p I with the
+/// default value.
+static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
+  if (Opcode != Instruction::PHI &&
+      Opcode != Instruction::Invoke &&
+      ((I->getType()->isIntegerTy() &&
+        (I->getOpcode() == Instruction::Add ||
+         I->getOpcode() == Instruction::And ||
+         I->getOpcode() == Instruction::AShr ||
+         I->getOpcode() == Instruction::BitCast ||
+         I->getOpcode() == Instruction::Call ||
+         I->getOpcode() == Instruction::ExtractElement ||
+         I->getOpcode() == Instruction::ExtractValue ||
+         I->getOpcode() == Instruction::ICmp ||
+         I->getOpcode() == Instruction::Load ||
+         I->getOpcode() == Instruction::LShr ||
+         I->getOpcode() == Instruction::Mul ||
+         I->getOpcode() == Instruction::Or ||
+         I->getOpcode() == Instruction::PtrToInt ||
+         I->getOpcode() == Instruction::SDiv ||
+         I->getOpcode() == Instruction::Select ||
+         I->getOpcode() == Instruction::SExt ||
+         I->getOpcode() == Instruction::Shl ||
+         I->getOpcode() == Instruction::Sub ||
+         I->getOpcode() == Instruction::Trunc ||
+         I->getOpcode() == Instruction::UDiv ||
+         I->getOpcode() == Instruction::Xor ||
+         I->getOpcode() == Instruction::ZExt)) ||
+       (isa<FPMathOperator>(I) && cast<FPMathOperator>(I)->isFast())))
+    return I->getOpcode();
+  return 0;
+}
+
 namespace {
 
 /// Main data required for vectorization of instructions.
@@ -319,17 +326,65 @@
     return AltOp ? AltOp->getOpcode() : 0;
   }
 
-  /// Some of the instructions in the list have alternate opcodes.
-  bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+  InstructionsState() = default;
+  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
+      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
+};
+
+class InstructionOrPseudo {
+private:
+  // Parent of the operation.
+  Value *Parent;
 
-  bool isOpcodeOrAlt(Instruction *I) const {
-    unsigned CheckedOpcode = I->getOpcode();
-    return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
+public:
+  // The operation instance.
+  Value *Op;
+  // Instruction representation of the operation.
+  Instruction *OpInst;
+  // Main operation in a group of operations.
+  Instruction *MainOp = nullptr;
+  // Alternative operation in a group of operations.
+  Instruction *AltOp = nullptr;
+  // Pseudo operation after replacement.
+  Instruction *PseudoOp = nullptr;
+  // Main opcode of a group of operations.
+  unsigned MainOpcode = 0;
+  // Alternative opcode of a group of operations.
+  unsigned AltOpcode = 0;
+
+  InstructionOrPseudo(Value *Parent, Value *Op) : Parent(Parent), Op(Op) {
+    OpInst = dyn_cast<Instruction>(Op);
   }
 
-  InstructionsState() = delete;
-  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
-      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
+  void setInstructionsState(const InstructionsState &S) {
+    MainOp = S.MainOp;
+    AltOp = S.AltOp;
+    if (MainOp)
+      MainOpcode = MainOp->getOpcode();
+    if (AltOp)
+      AltOpcode = AltOp->getOpcode();
+  }
+
+  bool isNonAlt() {
+    if (!OpInst)
+      return false;
+    unsigned Opcode = OpInst->getOpcode();
+    return ((Opcode != MainOpcode) && (Opcode != AltOpcode));
+  }
+
+  bool isAltShuffle() const {
+    return (MainOpcode != 0 && AltOpcode != 0 && MainOpcode != AltOpcode);
+  }
+
+  bool isOpcodeOrAlt() const {
+    unsigned CheckedOpcode = OpInst->getOpcode();
+    return MainOpcode == CheckedOpcode || AltOpcode == CheckedOpcode;
+  }
+
+  std::pair<Value *, unsigned> getKey() const {
+    assert(Parent && "Incorrect parent!");
+    return std::make_pair(Parent, MainOpcode);
+  }
 };
 
 } // end anonymous namespace
@@ -339,68 +394,155 @@
 /// OpValue.
 static Value *isOneOf(const InstructionsState &S, Value *Op) {
   auto *I = dyn_cast<Instruction>(Op);
-  if (I && S.isOpcodeOrAlt(I))
-    return Op;
+  if (I) {
+    unsigned CheckedOpcode = I->getOpcode();
+    if (S.getOpcode() == CheckedOpcode || S.getAltOpcode() == CheckedOpcode)
+      return Op;
+  }
   return S.OpValue;
 }
 
+/// \returns True if all of the values in \p IL are constants.
+static bool allConstant(ArrayRef<InstructionOrPseudo *> IL) {
+  return llvm::all_of(
+      make_range(IL.begin(), IL.end()),
+      [](InstructionOrPseudo *I) -> bool { return isa<Constant>(I->Op); });
+}
+
+/// \returns True if all of the values in \p IL are identical.
+static bool isSplat(ArrayRef<InstructionOrPseudo *> IL) {
+  return llvm::all_of(
+      make_range(IL.begin(), IL.end()),
+      [&](InstructionOrPseudo *I) -> bool { return I->Op == IL[0]->Op; });
+}
+
+/// \returns true if all of the instructions in \p VL are in the same block or
+/// false otherwise.
+static bool allSameBlock(ArrayRef<InstructionOrPseudo *> IL) {
+  if (!IL[0]->OpInst)
+    return false;
+  BasicBlock *BB = IL[0]->OpInst->getParent();
+  for (int i = 1, e = IL.size(); i < e; i++) {
+    if (!IL[i]->OpInst)
+      return false;
+
+    if (BB != IL[i]->OpInst->getParent())
+      return false;
+  }
+  return true;
+}
+
 /// \returns analysis of the Instructions in \p VL described in
 /// InstructionsState, the Opcode that we suppose the whole list
 /// could be vectorized even if its structure is diverse.
-static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+static InstructionsState getSameOpcode(ArrayRef<InstructionOrPseudo *> IL,
                                        unsigned BaseIndex = 0) {
   // Make sure these are all Instructions.
- if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + if (llvm::any_of(IL, [](InstructionOrPseudo *IOP) { return !IOP->OpInst; })) + return InstructionsState(IL[BaseIndex]->Op, nullptr, nullptr); - bool IsCastOp = isa(VL[BaseIndex]); - bool IsBinOp = isa(VL[BaseIndex]); - unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); + unsigned Opcode = IL[BaseIndex]->OpInst->getOpcode(); + bool IsCastOp = isa(IL[BaseIndex]->OpInst); + bool IsBinOp = isa(IL[BaseIndex]->OpInst); + bool IsNonAlt = false; unsigned AltOpcode = Opcode; + // Number of Instructions in VL with Main opcode. + unsigned OpcodeNum = 0; + // Number of Instructions in VL with Alternative opcode. + unsigned AltOpcodeNum = 0; + // Number of Instructions in VL with Non-Alternative opcode. + unsigned NonAltNum = 0; + // Non-Alternative operation offset in VL. + unsigned NonAltIndex = 0; unsigned AltIndex = BaseIndex; - // Check for one alternate opcode from another BinaryOperator. - // TODO - generalize to support all operators (types, calls etc.). - for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { - unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); - if (IsBinOp && isa(VL[Cnt])) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) - continue; - if (Opcode == AltOpcode) { - AltOpcode = InstOpcode; - AltIndex = Cnt; - continue; - } - } else if (IsCastOp && isa(VL[Cnt])) { - Type *Ty0 = cast(VL[BaseIndex])->getOperand(0)->getType(); - Type *Ty1 = cast(VL[Cnt])->getOperand(0)->getType(); + // Check for an alternate opcode pattern. + for (int Cnt = 0, E = IL.size(); Cnt < E; Cnt++) { + auto *I = IL[Cnt]->OpInst; + unsigned InstOpcode = I->getOpcode(); + if (IsCastOp && isa(IL[Cnt]->OpInst)) { + Type *Ty0 = IL[BaseIndex]->OpInst->getOperand(0)->getType(); + Type *Ty1 = IL[Cnt]->OpInst->getOperand(0)->getType(); if (Ty0 == Ty1) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) + if (InstOpcode == Opcode) { + OpcodeNum++; + continue; + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; continue; + } if (Opcode == AltOpcode) { AltOpcode = InstOpcode; AltIndex = Cnt; + AltOpcodeNum++; continue; } } - } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) + return InstructionsState(IL[BaseIndex]->Op, nullptr, nullptr); + } + if (InstOpcode == Opcode) { + OpcodeNum++; continue; - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; + continue; + } + if (InstOpcode != Opcode && InstOpcode != AltOpcode) { + if (IsBinOp && AltOpcode == Opcode && isa(I)) { + AltOpcode = InstOpcode; + AltOpcodeNum++; + AltIndex = Cnt; + continue; + } + if (tryToRepresentAsInstArg(Opcode, I) || + (IsBinOp && + tryToRepresentAsInstArg(InstOpcode, + IL[BaseIndex]->OpInst))) { + if (!IsNonAlt) { + NonAltIndex = Cnt; + IsNonAlt = true; + } + NonAltNum++; + continue; + } + return InstructionsState(IL[BaseIndex]->Op, nullptr, nullptr); + } } - return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), - cast(VL[AltIndex])); + if (IsNonAlt && IL.size() > 2 && (OpcodeNum + AltOpcodeNum) <= NonAltNum) { + BaseIndex = NonAltIndex; + AltIndex = BaseIndex; + Opcode = IL[BaseIndex]->OpInst->getOpcode(); + AltOpcode = Opcode; + IsBinOp = isa(IL[BaseIndex]->Op); + for (int Cnt = 0, E = IL.size(); Cnt < E; Cnt++) { + auto *I = IL[Cnt]->OpInst; + unsigned InstOpcode = I->getOpcode(); + if (Opcode == AltOpcode && IsBinOp && isa(I)) { + AltOpcode = InstOpcode; + AltIndex = Cnt; + } + } + } + + 
if (IsNonAlt && (!IsBinOp || + isRemainder(Opcode) || + isRemainder(AltOpcode))) + return InstructionsState(IL[BaseIndex]->Op, nullptr, nullptr); + + return InstructionsState(IL[BaseIndex]->Op, IL[BaseIndex]->OpInst, + IL[AltIndex]->OpInst); } /// \returns true if all of the values in \p VL have the same type or false /// otherwise. static bool allSameType(ArrayRef VL) { Type *Ty = VL[0]->getType(); - for (int i = 1, e = VL.size(); i < e; i++) - if (VL[i]->getType() != Ty) - return false; - - return true; + return llvm::all_of( + make_range(VL.begin(), VL.end()), + [&](Value *V) -> bool { return V->getType() == Ty; }); } /// \returns True if Extract{Value,Element} instruction extracts element Idx. @@ -479,6 +621,7 @@ using InstrList = SmallVector; using ValueSet = SmallPtrSet; using StoreList = SmallVector; + using IOPList = SmallVector; using ExtraValueToDebugLocsMap = MapVector>; @@ -538,6 +681,7 @@ /// Clear the internal data structures that are created by 'buildTree'. void deleteTree() { + InstructionOrPseudoArray.clear(); VectorizableTree.clear(); ScalarToTreeEntry.clear(); MustGather.clear(); @@ -613,21 +757,22 @@ int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. - void buildTree_rec(ArrayRef Roots, unsigned Depth, int); + void buildTree_rec(Value *Parent, ArrayRef Roots, unsigned Depth, + int); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a /// vector) and sets \p CurrentOrder to the identity permutation; otherwise /// returns false, setting \p CurrentOrder to either an empty vector or a /// non-identity permutation that allows to reuse extract instructions. - bool canReuseExtract(ArrayRef VL, Value *OpValue, + bool canReuseExtract(ArrayRef IL, SmallVectorImpl &CurrentOrder) const; /// Vectorize a single entry in the tree. Value *vectorizeTree(TreeEntry *E); /// Vectorize a single entry in the tree, starting in \p VL. - Value *vectorizeTree(ArrayRef VL); + Value *vectorizeTree(ArrayRef VL, Value *Parent); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. @@ -636,47 +781,58 @@ /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the /// roots. This method calculates the cost of extracting the values. - int getGatherCost(ArrayRef VL); + int getGatherCost(ArrayRef IL); /// Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(ArrayRef VL, - const InstructionsState &S); + void setInsertPointAfterBundle(ArrayRef IL); /// \returns a vector from a collection of scalars in \p VL. - Value *Gather(ArrayRef VL, VectorType *Ty); + Value *Gather(ArrayRef IL, VectorType *Ty); /// \returns whether the VectorizableTree is fully vectorizable and will /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(); + Instruction *propagateMetadata(Instruction *I, + ArrayRef IL); + + void propagateIRFlags(Value *I, + ArrayRef IL, + Value *OpValue); + /// \reorder commutative operands in alt shuffle if they result in /// vectorized code. - void reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef VL, + void reorderAltShuffleOperands(ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right); /// \reorder commutative operands to get better probability of /// generating vectorized code. 
- void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef VL, + void reorderInputsAccordingToOpcode(unsigned Opcode, + ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right); struct TreeEntry { TreeEntry(std::vector &Container) : Container(Container) {} /// \returns true if the scalars in VL are equal to this entry. - bool isSame(ArrayRef VL) const { - if (VL.size() == Scalars.size()) - return std::equal(VL.begin(), VL.end(), Scalars.begin()); - return VL.size() == ReuseShuffleIndices.size() && + bool isSame(ArrayRef IL) const { + if (IL.size() == Scalars.size()) + return std::equal(IL.begin(), IL.end(), Scalars.begin(), + [this](InstructionOrPseudo *I1, InstructionOrPseudo *I2) + { return ((I1->Op == I2->Op) || + (I1->PseudoOp && (I1->PseudoOp == I2->Op)) || + (I2->PseudoOp && (I2->PseudoOp == I1->Op))); }); + return IL.size() == ReuseShuffleIndices.size() && std::equal( - VL.begin(), VL.end(), ReuseShuffleIndices.begin(), - [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; }); + IL.begin(), IL.end(), ReuseShuffleIndices.begin(), + [this](InstructionOrPseudo *I1, unsigned Idx) + { return ((I1->Op == Scalars[Idx]->Op) || (Scalars[Idx]->PseudoOp && I1->Op == Scalars[Idx]->PseudoOp)); }); } /// A vector of scalars. - ValueList Scalars; + IOPList Scalars; /// The Scalars are vectorized into this value. It is initialized to Null. Value *VectorizedValue = nullptr; @@ -704,24 +860,30 @@ }; /// Create a new VectorizableTree entry. - void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, + void newTreeEntry(ArrayRef IL, bool Vectorized, + int &UserTreeIdx, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { VectorizableTree.emplace_back(VectorizableTree); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; - Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); + Last->Scalars.insert(Last->Scalars.begin(), IL.begin(), IL.end()); Last->NeedToGather = !Vectorized; Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; if (Vectorized) { - for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = idx; + for (InstructionOrPseudo *IOP : IL) { + assert( + (ScalarToTreeEntry.find(IOP->OpInst) == ScalarToTreeEntry.end() || + ScalarToTreeEntry[IOP->OpInst].find(IOP->getKey()) == + ScalarToTreeEntry[IOP->OpInst].end()) && + "Scalar already in tree!"); + ScalarToTreeEntry[IOP->OpInst][IOP->getKey()] = idx; } } else { - MustGather.insert(VL.begin(), VL.end()); + for (InstructionOrPseudo *IOP : IL) + MustGather.insert(IOP->Op); } if (UserTreeIdx >= 0) @@ -733,15 +895,32 @@ /// Holds all of the tree entries. 
std::vector VectorizableTree; - TreeEntry *getTreeEntry(Value *V) { - auto I = ScalarToTreeEntry.find(V); - if (I != ScalarToTreeEntry.end()) - return &VectorizableTree[I->second]; + TreeEntry *getTreeEntry(Instruction *I) { + if (!I) + return nullptr; + auto It = ScalarToTreeEntry.find(I); + if (It != ScalarToTreeEntry.end()) { + auto &STT = It->second; + for (auto STTI : STT) + for (auto *IOP : VectorizableTree[STTI.second].Scalars) + if (IOP->OpInst == I && !IOP->isNonAlt()) + return &VectorizableTree[STTI.second]; + } return nullptr; } + std::vector> InstructionOrPseudoArray; + + InstructionOrPseudo *allocInstructionOrPseudo(Value* Parent, Value *V) { + InstructionOrPseudoArray.emplace_back( + llvm::make_unique(Parent, V)); + int idx = InstructionOrPseudoArray.size() - 1; + return (InstructionOrPseudoArray[idx]).get(); + } + /// Maps a specific scalar to its tree entry. - SmallDenseMap ScalarToTreeEntry; + SmallDenseMap, int>> + ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -833,15 +1012,32 @@ ScheduleData() = default; - void init(int BlockSchedulingRegionID, Value *OpVal) { + void init(int BlockSchedulingRegionID) { FirstInBundle = this; NextInBundle = nullptr; NextLoadStore = nullptr; IsScheduled = false; + IsPseudo = false; + SchedulingRegionID = BlockSchedulingRegionID; + UnscheduledDepsInBundle = UnscheduledDeps; + clearDependencies(); + } + + // Initialize ScheduleData for pseudo instruction + // with ScheduleData for an original instruction OpOrigin. + void init(int BlockSchedulingRegionID, ScheduleData *OpOrigin, + Value *OpParent, unsigned OpCode) { + FirstInBundle = this; + NextInBundle = nullptr; + IsScheduled = false; + IsPseudo = true; + Opcode = OpCode; + Parent = OpParent; + Inst = OpOrigin->Inst; + NextLoadStore = OpOrigin->NextLoadStore; SchedulingRegionID = BlockSchedulingRegionID; UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); - OpValue = OpVal; } /// Returns true if the dependency information has been calculated. @@ -946,8 +1142,14 @@ /// dry-run). bool IsScheduled = false; - /// Opcode of the current instruction in the schedule data. - Value *OpValue = nullptr; + /// Opcode that represents instructions to be vectorized. + unsigned Opcode = 0; + + // Parent of this operation. + Value *Parent = nullptr; + + // True if this is a pseudo operation. 
+ bool IsPseudo = false; }; #ifndef NDEBUG @@ -985,22 +1187,16 @@ ++SchedulingRegionID; } - ScheduleData *getScheduleData(Value *V) { - ScheduleData *SD = ScheduleDataMap[V]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) + ScheduleData *getScheduleData(InstructionOrPseudo *IOP) { + std::pair Key = IOP->getKey(); + ScheduleData *SD = ScheduleDataMap[IOP->OpInst][std::make_pair(nullptr, 0)]; + if (SD && SD->Parent == Key.first && SD->Opcode == Key.second && + SD->SchedulingRegionID == SchedulingRegionID) + return SD; + SD = ScheduleDataMap[IOP->OpInst][Key]; + if (SD && SD->Parent == Key.first && SD->Opcode == Key.second && + SD->SchedulingRegionID == SchedulingRegionID) return SD; - return nullptr; - } - - ScheduleData *getScheduleData(Value *V, Value *Key) { - if (V == Key) - return getScheduleData(V); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) { - ScheduleData *SD = I->second[Key]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) - return SD; - } return nullptr; } @@ -1016,18 +1212,22 @@ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - if (BundleMember->Inst != BundleMember->OpValue) { - BundleMember = BundleMember->NextInBundle; - continue; - } + assert(BundleMember->Opcode == Opcode && + BundleMember->Parent == Parent && "Corrupt bundle member"); // Handle the def-use chain dependencies. for (Use &U : BundleMember->Inst->operands()) { auto *I = dyn_cast(U.get()); if (!I) continue; - doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { - if (OpDef && OpDef->hasValidDependencies() && + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + continue; + for (auto It : ScheduleDataMap[I]) { + auto *OpDef = It.second; + if (OpDef && isInSchedulingRegion(OpDef) && + OpDef->hasValidDependencies() && OpDef->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after // decrementing, so we can put the dependent instruction @@ -1039,7 +1239,7 @@ LLVM_DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n"); } - }); + } } // Handle the memory dependencies. for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { @@ -1058,46 +1258,38 @@ } } - void doForAllOpcodes(Value *V, - function_ref Action) { - if (ScheduleData *SD = getScheduleData(V)) - Action(SD); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) - for (auto &P : I->second) - if (P.second->SchedulingRegionID == SchedulingRegionID) - Action(P.second); - } - /// Put all instructions into the ReadyList which are ready for scheduling. template void initialFillReadyList(ReadyListType &ReadyList) { for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - doForAllOpcodes(I, [&](ScheduleData *SD) { - if (SD->isSchedulingEntity() && SD->isReady()) { + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + continue; + for (auto It : ScheduleDataMap[I]) { + auto *SD = It.second; + if (SD && isInSchedulingRegion(SD) && + SD->isSchedulingEntity() && SD->isReady()) { ReadyList.insert(SD); LLVM_DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); } - }); + } } } /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. 
- bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, - const InstructionsState &S); + bool tryScheduleBundle(ArrayRef IL, BoUpSLP *SLP); /// Un-bundles a group of instructions. - void cancelScheduling(ArrayRef VL, Value *OpValue); + void cancelScheduling(InstructionOrPseudo *IOP); /// Allocates schedule data chunk. ScheduleData *allocateScheduleDataChunks(); - /// Extends the scheduling region so that V is inside the region. + /// Extends the scheduling region so that I is inside the region. /// \returns true if the region size is within the limit. - bool extendSchedulingRegion(Value *V, const InstructionsState &S); + bool extendSchedulingRegion(InstructionOrPseudo *IOP); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. @@ -1127,12 +1319,11 @@ /// Attaches ScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. - /// ScheduleData structures are recycled. - DenseMap ScheduleDataMap; - - /// Attaches ScheduleData to Instruction with the leading key. - DenseMap> - ExtraScheduleDataMap; + /// ScheduleData structures are recycled. All real instructions are placed + /// in [Instruction *][nullptr][0]. + DenseMap, ScheduleData *>> + ScheduleDataMap; struct ReadyList : SmallVector { void insert(ScheduleData *SD) { push_back(SD); } @@ -1171,6 +1362,9 @@ /// Attaches the BlockScheduling structures to basic blocks. MapVector> BlocksSchedules; + /// Remove or subsititude any pseudo instructions in the tree. + void eliminatePseudos(BlockScheduling *BS); + /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. void scheduleBlock(BlockScheduling *BS); @@ -1294,14 +1488,15 @@ std::string Str; raw_string_ostream OS(Str); if (isSplat(Entry->Scalars)) { - OS << " " << *Entry->Scalars[0]; + OS << " " << *Entry->Scalars[0]->Op; return Str; } - for (auto V : Entry->Scalars) { - OS << *V; - if (std::any_of( - R->ExternalUses.begin(), R->ExternalUses.end(), - [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; })) + for (auto *IOP : Entry->Scalars) { + OS << *IOP->Op; + if (std::any_of(R->ExternalUses.begin(), R->ExternalUses.end(), + [&](const BoUpSLP::ExternalUser &EU) { + return EU.Scalar == IOP->Op; + })) OS << " "; OS << "\n"; } @@ -1331,7 +1526,7 @@ UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; - buildTree_rec(Roots, 0, -1); + buildTree_rec(Roots[0], Roots, 0, -1); // Collect the values that we need to extract from the tree. for (TreeEntry &EIdx : VectorizableTree) { @@ -1343,8 +1538,10 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { - Value *Scalar = Entry->Scalars[Lane]; + auto *Scalar = Entry->Scalars[Lane]; int FoundLane = Lane; + if (Scalar->isNonAlt()) + continue; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), @@ -1352,13 +1549,13 @@ } // Check if the scalar is externally used as an extra arg. 
- auto ExtI = ExternallyUsedValues.find(Scalar); + auto ExtI = ExternallyUsedValues.find(Scalar->Op); if (ExtI != ExternallyUsedValues.end()) { LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " - << Lane << " from " << *Scalar << ".\n"); - ExternalUses.emplace_back(Scalar, nullptr, FoundLane); + << Lane << " from " << *Scalar->Op << ".\n"); + ExternalUses.emplace_back(Scalar->Op, nullptr, FoundLane); } - for (User *U : Scalar->users()) { + for (User *U : Scalar->Op->users()) { LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n"); Instruction *UserInst = dyn_cast(U); @@ -1366,13 +1563,13 @@ continue; // Skip in-tree scalars that become vectors - if (TreeEntry *UseEntry = getTreeEntry(U)) { - Value *UseScalar = UseEntry->Scalars[0]; + if (TreeEntry *UseEntry = getTreeEntry(UserInst)) { + Value *UseScalar = UseEntry->Scalars[0]->Op; // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will // be used. if (UseScalar != U || - !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { + !InTreeUserNeedToExtract(Scalar->Op, UserInst, TLI)) { LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); assert(!UseEntry->NeedToGather && "Bad state"); @@ -1385,42 +1582,48 @@ continue; LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane " - << Lane << " from " << *Scalar << ".\n"); - ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane)); + << Lane << " from " << *Scalar->Op << ".\n"); + ExternalUses.push_back(ExternalUser(Scalar->Op, U, FoundLane)); } } } } -void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, +void BoUpSLP::buildTree_rec(Value *Parent, ArrayRef VL, unsigned Depth, int UserTreeIdx) { - assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); + SmallVector IL; + for (Value *V: VL) + IL.push_back(allocInstructionOrPseudo(Parent, V)); + InstructionsState S = getSameOpcode(IL); + for (InstructionOrPseudo *IOP : IL) + IOP->setInstructionsState(S); + + assert((allConstant(IL) || allSameType(VL)) && "Invalid types!"); - InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { + if (allConstant(IL) || isSplat(IL) || !allSameBlock(IL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } @@ -1428,23 +1631,19 @@ // the same block. // Don't vectorize ephemeral values. 
- for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (EphValues.count(VL[i])) { + for (unsigned i = 0, e = IL.size(); i != e; ++i) { + if (EphValues.count(IL[i]->Op)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } } // Check if this is a duplicate of another entry. - if (TreeEntry *E = getTreeEntry(S.OpValue)) { + TreeEntry *E = getTreeEntry(cast(S.OpValue)); + if (E && E->isSame(IL)) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); - if (!E->isSame(VL)) { - LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); - return; - } // Record the reuse of the tree node. FIXME, currently this is only used to // properly draw the graph rather than for the actual vectorization. E->UserTreeIndices.push_back(UserTreeIdx); @@ -1454,14 +1653,15 @@ } // Check that none of the instructions in the bundle are already in the tree. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - auto *I = dyn_cast(VL[i]); - if (!I) - continue; - if (getTreeEntry(I)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (unsigned i = 0, e = IL.size(); i != e; ++i) { + auto *I = IL[i]->OpInst; + if (getTreeEntry(I) || + (ScalarToTreeEntry.find(I) != ScalarToTreeEntry.end() && + ScalarToTreeEntry[I].find(IL[i]->getKey()) != + ScalarToTreeEntry[I].end())) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *I << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } } @@ -1469,47 +1669,50 @@ // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { + for (unsigned i = 0, e = IL.size(); i != e; ++i) { + if (MustGather.count(IL[i]->Op)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } } // Check that all of the users of the scalars that we want to vectorize are // schedulable. - auto *VL0 = cast(S.OpValue); + auto *VL0 = IL[0]->MainOp; BasicBlock *BB = VL0->getParent(); if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } // Check that every instruction appears once in this bundle. 
SmallVector ReuseShuffleIndicies; - SmallVector UniqueValues; + SmallVector UniqueValues; DenseMap UniquePositions; - for (Value *V : VL) { - auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + for (auto *I : IL) { + auto Res = UniquePositions.try_emplace(I->Op, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second) - UniqueValues.emplace_back(V); + if (Res.second) { + InstructionOrPseudo *IOP = allocInstructionOrPseudo(Parent, I->Op); + IOP->setInstructionsState(S); + UniqueValues.emplace_back(IOP); + } } - if (UniqueValues.size() == VL.size()) { + if (UniqueValues.size() == IL.size()) { ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(IL, false, UserTreeIdx); return; } - VL = UniqueValues; + IL = UniqueValues; } auto &BSRef = BlocksSchedules[BB]; @@ -1518,59 +1721,59 @@ BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this, S)) { + if (!BS.tryScheduleBundle(IL, this)) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - assert((!BS.getScheduleData(VL0) || - !BS.getScheduleData(VL0)->isPartOfBundle()) && + assert((!BS.getScheduleData(IL[0]) || + !BS.getScheduleData(IL[0])->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - unsigned ShuffleOrOp = S.isAltShuffle() ? + unsigned ShuffleOrOp = IL[0]->isAltShuffle() ? (unsigned) Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); // Check for terminator values (e.g. invoke). - for (unsigned j = 0; j < VL.size(); ++j) + for (unsigned j = 0; j < IL.size(); ++j) for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { Instruction *Term = dyn_cast( - cast(VL[j])->getIncomingValueForBlock( + cast(IL[j]->OpInst)->getIncomingValueForBlock( PH->getIncomingBlock(i))); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *j : VL) - Operands.push_back(cast(j)->getIncomingValueForBlock( + for (auto *j : IL) + Operands.push_back(cast(j->Op)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::ExtractValue: case Instruction::ExtractElement: { OrdersType CurrentOrder; - bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); + bool Reuse = canReuseExtract(IL, CurrentOrder); if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); return; } @@ -1587,13 +1790,15 @@ auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); - BS.cancelScheduling(VL, VL0); + newTreeEntry(IL, /*Vectorized=*/false, UserTreeIdx, + ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); return; } case Instruction::Load: { @@ -1607,21 +1812,21 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } // Make sure all loads in the bundle are simple - we can't vectorize // atomic or volatile loads. - SmallVector PointerOps(VL.size()); + SmallVector PointerOps(IL.size()); auto POIter = PointerOps.begin(); - for (Value *V : VL) { - auto *L = cast(V); + for (auto *I : IL) { + auto *L = cast(I->OpInst); if (!L->isSimple()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1647,18 +1852,18 @@ dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { + if (Diff && Diff->getAPInt().getZExtValue() == (IL.size() - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. 
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(IL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, I->getFirst()); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } @@ -1667,8 +1872,8 @@ } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -1684,26 +1889,26 @@ case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (unsigned i = 0; i < VL.size(); ++i) { - Type *Ty = cast(VL[i])->getOperand(0)->getType(); + for (unsigned i = 0; i < IL.size(); ++i) { + Type *Ty = IL[i]->OpInst->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *I : IL) + Operands.push_back(I->OpInst->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1712,28 +1917,28 @@ // Check that all of the compares have the same predicate. CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Type *ComparedTy = VL0->getOperand(0)->getType(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast(VL[i]); + for (unsigned i = 1, e = IL.size(); i < e; ++i) { + CmpInst *Cmp = cast(IL[i]->OpInst); if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *I : IL) + Operands.push_back(I->OpInst->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1756,36 +1961,44 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. 
if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderInputsAccordingToOpcode(S.getOpcode(), IL, Left, Right); + buildTree_rec(VL0, Left, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + for (auto *Inst: IL) { + auto *I = Inst->OpInst; + if (I->getOpcode() == S.getOpcode()) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Operands.push_back(Inst->Op); + } + if (allSameType(Operands)) + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. - for (unsigned j = 0; j < VL.size(); ++j) { - if (cast(VL[j])->getNumOperands() != 2) { + for (unsigned j = 0; j < IL.size(); ++j) { + if (IL[j]->OpInst->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } @@ -1793,59 +2006,59 @@ // We can't combine several GEPs into one vector if they operate on // different types. Type *Ty0 = VL0->getOperand(0)->getType(); - for (unsigned j = 0; j < VL.size(); ++j) { - Type *CurTy = cast(VL[j])->getOperand(0)->getType(); + for (unsigned j = 0; j < IL.size(); ++j) { + Type *CurTy = IL[j]->OpInst->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } // We don't combine GEPs with non-constant indexes. - for (unsigned j = 0; j < VL.size(); ++j) { - auto Op = cast(VL[j])->getOperand(1); + for (unsigned j = 0; j < IL.size(); ++j) { + auto Op = IL[j]->OpInst->getOperand(1); if (!isa(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *j : IL) + Operands.push_back(j->OpInst->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::Store: { // Check if the stores are consecutive or of we need to swizzle them. 
- for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + for (unsigned i = 0, e = IL.size() - 1; i < e; ++i) + if (!isConsecutiveAccess(IL[i]->Op, IL[i + 1]->Op, *DL, *SE)) { + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(0)); + for (auto *j : IL) + Operands.push_back(j->OpInst->getOperand(0)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); return; } case Instruction::Call: { @@ -1855,8 +2068,8 @@ // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1864,15 +2077,15 @@ Value *A1I = nullptr; if (hasVectorInstrinsicScalarOpd(ID, 1)) A1I = CI->getArgOperand(1); - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - CallInst *CI2 = dyn_cast(VL[i]); + for (unsigned i = 1, e = IL.size(); i != e; ++i) { + CallInst *CI2 = dyn_cast(IL[i]->OpInst); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] - << "\n"); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << + *IL[i]->OpInst << "\n"); return; } // ctlz,cttz and powi are special intrinsics whose second argument @@ -1880,8 +2093,8 @@ if (hasVectorInstrinsicScalarOpd(ID, 1)) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << A1I << "!=" << A1J << "\n"); return; @@ -1892,60 +2105,78 @@ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" - << *CI << "!=" << *VL[i] << '\n'); + << *CI << "!=" << *IL[i]->OpInst << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *j : VL) { - CallInst *CI2 = dyn_cast(j); - Operands.push_back(CI2->getArgOperand(i)); + for (auto *VecOp : IL) { + auto *I = VecOp->OpInst; + if (VecOp->isOpcodeOrAlt()) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = ConstantExpr::getBinOpIdentity(S.getOpcode(), + I->getType(), true); + Operands.push_back(Operand); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + if (allSameType(Operands)) + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } case Instruction::ShuffleVector: // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. - if (!S.isAltShuffle()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + if (!IL[0]->isAltShuffle()) { + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(IL, true, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; - reorderAltShuffleOperands(S, VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + reorderAltShuffleOperands(IL, Left, Right); + buildTree_rec(VL0, Left, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (auto *VecOp : IL) { + if (VecOp->isOpcodeOrAlt()) { + Operands.push_back(VecOp->OpInst->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = ConstantExpr::getBinOpIdentity( + S.getOpcode(), VecOp->OpInst->getType(), true); + Operands.push_back(Operand); + } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; default: - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(IL[0]); + newTreeEntry(IL, false, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1976,12 +2207,12 @@ return N; } -bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue, +bool BoUpSLP::canReuseExtract(ArrayRef IL, SmallVectorImpl &CurrentOrder) const { - Instruction *E0 = cast(OpValue); + Instruction *E0 = IL[0]->MainOp; assert(E0->getOpcode() == Instruction::ExtractElement || E0->getOpcode() == Instruction::ExtractValue); - assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); + assert(E0->getOpcode() == getSameOpcode(IL).getOpcode() && "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); @@ -1997,18 +2228,18 @@ return false; // Check if load can be rewritten as load of vector. 
LoadInst *LI = dyn_cast(Vec); - if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size())) + if (!LI || !LI->isSimple() || !LI->hasNUses(IL.size())) return false; } else { NElts = Vec->getType()->getVectorNumElements(); } - if (NElts != VL.size()) + if (NElts != IL.size()) return false; // Check that all of the indices extract from the correct offset. bool ShouldKeepOrder = true; - unsigned E = VL.size(); + unsigned E = IL.size(); // Assign to all items the initial value E + 1 so we can check if the extract // instruction index was used already. // Also, later we can check that all the indices are used and we have a @@ -2017,7 +2248,7 @@ CurrentOrder.assign(E, E + 1); unsigned I = 0; for (; I < E; ++I) { - auto *Inst = cast(VL[I]); + auto *Inst = IL[I]->OpInst; if (Inst->getOperand(0) != Vec) break; Optional Idx = getExtractIndex(Inst); @@ -2046,25 +2277,25 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { return I->hasOneUse() || std::all_of(I->user_begin(), I->user_end(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; + return ScalarToTreeEntry.count(dyn_cast(U)) > 0; }); } int BoUpSLP::getEntryCost(TreeEntry *E) { - ArrayRef VL = E->Scalars; - - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) + ArrayRef IL = E->Scalars; + Type *ScalarTy = E->Scalars[0]->Op->getType(); + if (StoreInst *SI = dyn_cast(E->Scalars[0]->Op)) ScalarTy = SI->getValueOperand()->getType(); - else if (CmpInst *CI = dyn_cast(VL[0])) + else if (CmpInst *CI = dyn_cast(E->Scalars[0]->Op)) ScalarTy = CI->getOperand(0)->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + VectorType *VecTy = VectorType::get(ScalarTy, IL.size()); // If we have computed a smaller type for the expression, update VecTy so // that the costs will be accurate. - if (MinBWs.count(VL[0])) + if (MinBWs.count(E->Scalars[0]->Op)) VecTy = VectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + IntegerType::get(F->getContext(), MinBWs[E->Scalars[0]->Op].first), + IL.size()); unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -2074,26 +2305,28 @@ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } if (E->NeedToGather) { - if (allConstant(VL)) + if (allConstant(IL)) return 0; - if (isSplat(VL)) { + if (isSplat(IL)) { return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && - allSameType(VL) && allSameBlock(VL)) { + if (IL[0]->MainOpcode == Instruction::ExtractElement && allSameBlock(IL)) { + ValueList VL; + for (auto *IOP : IL) + VL.push_back(IOP->Op); Optional ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); - for (auto *V : VL) { + for (auto *IOP : IL) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. 
- if (areAllUsersVectorized(cast(V)) && - !ScalarToTreeEntry.count(V)) { + if (areAllUsersVectorized(IOP->OpInst) && + !ScalarToTreeEntry.count(IOP->OpInst)) { auto *IO = cast( - cast(V)->getIndexOperand()); + cast(IOP->OpInst)->getIndexOperand()); Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, IO->getZExtValue()); } @@ -2101,13 +2334,12 @@ return ReuseShuffleCost + Cost; } } - return ReuseShuffleCost + getGatherCost(VL); + return ReuseShuffleCost + getGatherCost(IL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + auto *VL0 = E->Scalars[0]->MainOp; + assert(IL[0]->MainOpcode != 0 && allSameBlock(IL) && "Invalid VL"); + unsigned ShuffleOrOp = IL[0]->isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : IL[0]->MainOpcode; switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2119,7 +2351,7 @@ for (unsigned I : E->ReuseShuffleIndices) { if (ShuffleOrOp == Instruction::ExtractElement) { auto *IO = cast( - cast(VL[I])->getIndexOperand()); + cast(IL[I]->OpInst)->getIndexOperand()); Idx = IO->getZExtValue(); ReuseShuffleCost -= TTI->getVectorInstrCost( Instruction::ExtractElement, VecTy, Idx); @@ -2130,10 +2362,10 @@ } } Idx = ReuseShuffleNumbers; - for (Value *V : VL) { + for (auto *IOP : IL) { if (ShuffleOrOp == Instruction::ExtractElement) { auto *IO = cast( - cast(V)->getIndexOperand()); + cast(IOP->OpInst)->getIndexOperand()); Idx = IO->getZExtValue(); } else { --Idx; @@ -2149,15 +2381,15 @@ DeadCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *E = cast(VL[i]); + for (unsigned i = 0, e = IL.size(); i < e; ++i) { // If all users are going to be vectorized, instruction can be // considered as dead. // The same, if have only one user, it will be vectorized for sure. - if (areAllUsersVectorized(E)) { + auto *I = IL[i]->OpInst; + if (areAllUsersVectorized(I)) { // Take credit for instruction that will become dead. - if (E->hasOneUse()) { - Instruction *Ext = E->user_back(); + if (I->hasOneUse()) { + Instruction *Ext = I->user_back(); if ((isa(Ext) || isa(Ext)) && all_of(Ext->users(), [](User *U) { return isa(U); })) { @@ -2167,7 +2399,8 @@ Ext->getOpcode(), Ext->getType(), VecTy, i); // Add back the cost of s|zext which is subtracted separately. DeadCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), E->getType(), Ext); + Ext->getOpcode(), Ext->getType(), + I->getType(), Ext); continue; } } @@ -2177,7 +2410,7 @@ } return DeadCost; } - return ReuseShuffleCost + getGatherCost(VL); + return ReuseShuffleCost + getGatherCost(IL); case Instruction::ZExt: case Instruction::SExt: @@ -2193,20 +2426,21 @@ case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(IL[0]->MainOpcode, ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } // Calculate the cost of this instruction. 
- int ScalarCost = VL.size() * ScalarEltCost; + int ScalarCost = IL.size() * ScalarEltCost; - VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + VectorType *SrcVecTy = VectorType::get(SrcTy, IL.size()); int VecCost = 0; // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(IL[0]->MainOpcode, VecTy, + SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2214,14 +2448,17 @@ case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. - int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, - Builder.getInt1Ty(), VL0); + int ScalarEltCost = TTI->getCmpSelInstrCost(IL[0]->MainOpcode, + ScalarTy, + Builder.getInt1Ty(), + VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } - VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); + VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), IL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(IL[0]->MainOpcode, VecTy, + MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Add: @@ -2247,7 +2484,7 @@ TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; + TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueProperties Op1VP = TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = @@ -2258,35 +2495,40 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. 
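+ // E.g. lanes { shl %a, 2 ; shl %b, 2 } keep OK_UniformConstantValue, lanes + // { shl %a, 2 ; shl %b, 3 } become OK_NonUniformConstantValue, and a + // non-constant second operand in any lane forces OK_AnyValue.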
- ConstantInt *CInt0 = nullptr; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - const Instruction *I = cast(VL[i]); - ConstantInt *CInt = dyn_cast(I->getOperand(1)); - if (!CInt) { - Op2VK = TargetTransformInfo::OK_AnyValue; - Op2VP = TargetTransformInfo::OP_None; - break; - } - if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_None; - if (i == 0) { - CInt0 = CInt; - continue; + if (auto *CInt = dyn_cast(IL[0]->MainOp->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + const unsigned Opcode = IL[0]->MainOpcode; + for (auto *IOP : IL) { + auto *I = IOP->OpInst; + if (I == VL0 || Opcode != I->getOpcode()) + continue; + if (!isa(I->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_AnyValue; + Op2VP = TargetTransformInfo::OP_None; + break; + } + ConstantInt *CInt_cur = cast(I->getOperand(1)); + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && + CInt != cast(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + if (Op2VP == TargetTransformInfo::OP_PowerOf2 && + !CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_None; + if (CInt != CInt_cur) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } - if (CInt0 != CInt) - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } SmallVector Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + IL[0]->MainOpcode, ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost(IL[0]->MainOpcode, VecTy, + Op1VK, Op2VK, Op1VP, Op2VP, + Operands); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -2298,7 +2540,7 @@ int ScalarEltCost = TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; int VecCost = @@ -2309,13 +2551,15 @@ // Cost of wide load - cost of scalar loads. unsigned alignment = cast(VL0)->getAlignment(); int ScalarEltCost = - TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, + VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; int VecLdCost = - TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0); + TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, + VL0); if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. 
VecLdCost += TTI->getShuffleCost( @@ -2329,7 +2573,7 @@ int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; int VecStCost = @@ -2352,7 +2596,7 @@ int ScalarEltCost = TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= (ReuseShuffleNumbers - IL.size()) * ScalarEltCost; } int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; @@ -2367,44 +2611,38 @@ return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(IL[0]->isAltShuffle() && + ((Instruction::isBinaryOp(IL[0]->MainOpcode) && + Instruction::isBinaryOp(IL[0]->AltOpcode)) || + (Instruction::isCast(IL[0]->MainOpcode) && + Instruction::isCast(IL[0]->AltOpcode))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { - for (unsigned Idx : E->ReuseShuffleIndices) { - Instruction *I = cast(VL[Idx]); + for (unsigned Idx : E->ReuseShuffleIndices) ReuseShuffleCost -= TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); - } - for (Value *V : VL) { - Instruction *I = cast(V); + IL[Idx]->OpInst, TargetTransformInfo::TCK_RecipThroughput); + for (auto *IOP : IL) ReuseShuffleCost += TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); - } + IOP->OpInst, TargetTransformInfo::TCK_RecipThroughput); } - for (Value *i : VL) { - Instruction *I = cast(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); + for (auto *IOP : IL) ScalarCost += TTI->getInstructionCost( - I, TargetTransformInfo::TCK_RecipThroughput); - } + IOP->OpInst, TargetTransformInfo::TCK_RecipThroughput); // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. 
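+ // E.g. for an <add, sub> alternation this is roughly the cost of one vector + // add plus one vector sub plus one SK_Select shuffle.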
int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(IL[0]->MainOpcode)) { + VecCost = TTI->getArithmeticInstrCost(IL[0]->MainOpcode, VecTy); + VecCost += TTI->getArithmeticInstrCost(IL[0]->AltOpcode, VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); - VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); - VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + Type *Src0SclTy = IL[0]->MainOp->getOperand(0)->getType(); + Type *Src1SclTy = IL[0]->AltOp->getOperand(0)->getType(); + VectorType *Src0Ty = VectorType::get(Src0SclTy, IL.size()); + VectorType *Src1Ty = VectorType::get(Src1SclTy, IL.size()); + VecCost = TTI->getCastInstrCost(IL[0]->MainOpcode, VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(IL[0]->AltOpcode, VecTy, + Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -2428,7 +2666,7 @@ // Handle splat and all-constants stores. if (!VectorizableTree[0].NeedToGather && (allConstant(VectorizableTree[1].Scalars) || - isSplat(VectorizableTree[1].Scalars))) + isSplat(VectorizableTree[1].Scalars))) return true; // Gathering cost would be too much for tiny trees. @@ -2438,6 +2676,23 @@ return true; } +Instruction *BoUpSLP::propagateMetadata(Instruction *I, + ArrayRef IL) { + ValueList VL; + for (auto *IOP: IL) + VL.push_back(IOP->Op); + return llvm::propagateMetadata(I, VL); +} + +void BoUpSLP::propagateIRFlags(Value *I, + ArrayRef IL, + Value *OpValue) { + ValueList VL; + for (auto *IOP: IL) + VL.push_back(IOP->Op); + llvm::propagateIRFlags(I, VL, OpValue); +} + bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() { // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. @@ -2470,7 +2725,7 @@ Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast(N.Scalars[0]); + Instruction *Inst = N.Scalars[0]->MainOp; if (!Inst) continue; @@ -2482,8 +2737,9 @@ // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && getTreeEntry(&*J)) - LiveValues.insert(cast(&*J)); + auto *I = dyn_cast(&*J); + if (I && getTreeEntry(I)) + LiveValues.insert(I); } LLVM_DEBUG({ @@ -2554,7 +2810,7 @@ int C = getEntryCost(&TE); LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C - << " for bundle that starts with " << *TE.Scalars[0] + << " for bundle that starts with " << *TE.Scalars[0]->Op << ".\n"); Cost += C; } @@ -2576,7 +2832,7 @@ // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].Scalars[0]->Op; if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto Extend = @@ -2619,21 +2875,21 @@ return Cost; } -int BoUpSLP::getGatherCost(ArrayRef VL) { - // Find the type of the operands in VL. 
- Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) +int BoUpSLP::getGatherCost(ArrayRef IL) { + // Find the type of the operands in IL. + Type *ScalarTy = IL[0]->Op->getType(); + if (StoreInst *SI = dyn_cast(IL[0]->Op)) ScalarTy = SI->getValueOperand()->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + VectorType *VecTy = VectorType::get(ScalarTy, IL.size()); // Find the cost of inserting/extracting values from the vector. // Check if the same elements are inserted several times and count them as // shuffle candidates. DenseSet ShuffledElements; DenseSet UniqueElements; // Iterate in reverse order to consider insert elements with the high cost. - for (unsigned I = VL.size(); I > 0; --I) { + for (unsigned I = IL.size(); I > 0; --I) { unsigned Idx = I - 1; - if (!UniqueElements.insert(VL[Idx]).second) + if (!UniqueElements.insert(IL[Idx]->Op).second) ShuffledElements.insert(Idx); } return getGatherCost(VecTy, ShuffledElements); @@ -2648,25 +2904,29 @@ // load a[3] + load b[3] // Reordering the second load b[1] load a[1] would allow us to vectorize this // code. -void BoUpSLP::reorderAltShuffleOperands(const InstructionsState &S, - ArrayRef VL, +void BoUpSLP::reorderAltShuffleOperands(ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right) { // Push left and right operands of binary operation into Left and Right - for (Value *V : VL) { - auto *I = cast(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); + for (auto *Inst : IL) { + if (Inst->isOpcodeOrAlt()) { + Left.push_back(Inst->OpInst->getOperand(0)); + Right.push_back(Inst->OpInst->getOperand(1)); + } else { + Left.push_back(Inst->OpInst); + Right.push_back(ConstantExpr::getBinOpIdentity(Inst->MainOpcode, + Inst->OpInst->getType(), + true)); + } } // Reorder if we have a commutative operation and consecutive access // are on either side of the alternate instructions. - for (unsigned j = 0; j < VL.size() - 1; ++j) { + for (unsigned j = 0; j < IL.size() - 1; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { - Instruction *VL1 = cast(VL[j]); - Instruction *VL2 = cast(VL[j + 1]); + Instruction *VL1 = IL[j]->OpInst; + Instruction *VL2 = IL[j + 1]->OpInst; if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j], Right[j]); continue; @@ -2680,8 +2940,8 @@ } if (LoadInst *L = dyn_cast(Right[j])) { if (LoadInst *L1 = dyn_cast(Left[j + 1])) { - Instruction *VL1 = cast(VL[j]); - Instruction *VL2 = cast(VL[j + 1]); + Instruction *VL1 = IL[j]->OpInst; + Instruction *VL2 = IL[j + 1]->OpInst; if (VL1->isCommutative() && isConsecutiveAccess(L, L1, *DL, *SE)) { std::swap(Left[j], Right[j]); continue; @@ -2706,8 +2966,13 @@ int i, unsigned Opcode, Instruction &I, ArrayRef Left, ArrayRef Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { - VLeft = I.getOperand(0); - VRight = I.getOperand(1); + if (I.getOpcode() == Opcode) { + VLeft = I.getOperand(0); + VRight = I.getOperand(1); + } else { + VLeft = &I; + VRight = ConstantExpr::getBinOpIdentity(Opcode, I.getType(), true); + } // If we have "SplatRight", try to see if commuting is needed to preserve it. 
if (SplatRight) { if (VRight == Right[i - 1]) @@ -2764,15 +3029,22 @@ } void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, - ArrayRef VL, + ArrayRef IL, SmallVectorImpl &Left, SmallVectorImpl &Right) { - if (!VL.empty()) { + if (!IL.empty()) { // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. - auto *I = cast(VL[0]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); + Value *VLeft; + Value *VRight; + if (IL[0]->OpInst->getOpcode() == Opcode) { + VLeft = IL[0]->OpInst->getOperand(0); + VRight = IL[0]->OpInst->getOperand(1); + } else { + VLeft = IL[0]->OpInst; + VRight = ConstantExpr::getBinOpIdentity(Opcode, IL[0]->OpInst->getType(), + true); + } if (!isa(VRight) && isa(VLeft)) // Favor having instruction to the right. FIXME: why? std::swap(VLeft, VRight); @@ -2787,8 +3059,8 @@ bool SplatLeft = true; bool SplatRight = true; - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - Instruction *I = cast(VL[i]); + for (unsigned i = 1, e = IL.size(); i != e; ++i) { + Instruction *I = IL[i]->OpInst; assert(((I->getOpcode() == Opcode && I->isCommutative()) || (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) && "Can only process commutative instruction"); @@ -2835,7 +3107,7 @@ // add a[1],c[2] load b[1] // b[2] load b[2] // add a[3],c[3] load b[3] - for (unsigned j = 0, e = VL.size() - 1; j < e; ++j) { + for (unsigned j = 0, e = IL.size() - 1; j < e; ++j) { if (LoadInst *L = dyn_cast(Left[j])) { if (LoadInst *L1 = dyn_cast(Right[j + 1])) { if (isConsecutiveAccess(L, L1, *DL, *SE)) { @@ -2856,31 +3128,27 @@ } } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL, - const InstructionsState &S) { +void BoUpSLP::setInsertPointAfterBundle(ArrayRef IL) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. - auto *Front = cast(S.OpValue); + auto *Front = IL[0]->MainOp; auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { - auto *I = cast(V); - return !S.isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of( + make_range(IL.begin(), IL.end()), [=](InstructionOrPseudo *I) -> bool { + return !I->isOpcodeOrAlt() || I->OpInst->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; - // Find the last instruction. The common case should be that BB has been - // scheduled, and the last instruction is VL.back(). So we start with - // VL.back() and iterate over schedule data until we reach the end of the - // bundle. The end of the bundle is marked by null ScheduleData. + // Find the last instruction. If the bundle has not been scheduled yet, + // the first member of the bundle is the last one in BB in program order, + // because bundles are discovered in a backward walk. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlockScheduling *BS = BlocksSchedules[BB].get(); + auto *Bundle = BS->getScheduleData(IL[0]); if (Bundle && Bundle->isPartOfBundle()) - for (; Bundle; Bundle = Bundle->NextInBundle) - if (Bundle->OpValue == Bundle->Inst) - LastInst = Bundle->Inst; + LastInst = Bundle->FirstInBundle->Inst; } // LastInst can still be null at this point if there's either not an entry @@ -2902,9 +3170,11 @@ // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). 
if (!LastInst) { - SmallPtrSet Bundle(VL.begin(), VL.end()); + SmallPtrSet Bundle; + for (auto *IOP : IL) + Bundle.insert(IOP->Op); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I)) + if (Bundle.erase(&I)) LastInst = &I; if (Bundle.empty()) break; @@ -2917,22 +3187,22 @@ Builder.SetCurrentDebugLocation(Front->getDebugLoc()); } -Value *BoUpSLP::Gather(ArrayRef VL, VectorType *Ty) { +Value *BoUpSLP::Gather(ArrayRef IL, VectorType *Ty) { Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. for (unsigned i = 0; i < Ty->getNumElements(); ++i) { - Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); + Vec = Builder.CreateInsertElement(Vec, IL[i]->Op, Builder.getInt32(i)); if (Instruction *Insrt = dyn_cast(Vec)) { GatherSeq.insert(Insrt); CSEBlocks.insert(Insrt->getParent()); // Add to our 'need-to-extract' list. - if (TreeEntry *E = getTreeEntry(VL[i])) { + if (TreeEntry *E = getTreeEntry(IL[i]->OpInst)) { // Find which lane we need to extract. int FoundLane = -1; for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) { // Is this the lane of the scalar that we are looking for ? - if (E->Scalars[Lane] == VL[i]) { + if (E->Scalars[Lane]->Op == IL[i]->Op) { FoundLane = Lane; break; } @@ -2943,7 +3213,7 @@ std::distance(E->ReuseShuffleIndices.begin(), llvm::find(E->ReuseShuffleIndices, FoundLane)); } - ExternalUses.push_back(ExternalUser(VL[i], Insrt, FoundLane)); + ExternalUses.push_back(ExternalUser(IL[i]->Op, Insrt, FoundLane)); } } } @@ -2951,29 +3221,33 @@ return Vec; } -Value *BoUpSLP::vectorizeTree(ArrayRef VL) { - InstructionsState S = getSameOpcode(VL); - if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { - if (E->isSame(VL)) { - Value *V = vectorizeTree(E); - if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { - // We need to get the vectorized value but without shuffle. - if (auto *SV = dyn_cast(V)) { - V = SV->getOperand(0); - } else { - // Reshuffle to get only unique values. - SmallVector UniqueIdxs; - SmallSet UsedIdxs; - for(unsigned Idx : E->ReuseShuffleIndices) - if (UsedIdxs.insert(Idx).second) +Value *BoUpSLP::vectorizeTree(ArrayRef VL, Value *Parent) { + IOPList IL; + for (Value *V: VL) + IL.push_back(allocInstructionOrPseudo(Parent, V)); + InstructionsState S = getSameOpcode(IL); + for (InstructionOrPseudo *IOP: IL) + IOP->setInstructionsState(S); + if (IL[0]->MainOp) { + TreeEntry *E = getTreeEntry(IL[0]->MainOp); + if (E && E->isSame(IL)) { + Value *V = vectorizeTree(E); + if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { + // We need to get the vectorized value but without shuffle. + if (auto *SV = dyn_cast(V)) { + V = SV->getOperand(0); + } else { + // Reshuffle to get only unique values. + SmallVector UniqueIdxs; + SmallSet UsedIdxs; + for (unsigned Idx : E->ReuseShuffleIndices) + if (UsedIdxs.insert(Idx).second) UniqueIdxs.emplace_back(Idx); - V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), - UniqueIdxs); - } + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), + UniqueIdxs); } - return V; } + return V; } } @@ -2983,26 +3257,26 @@ // Check that every instruction appears once in this bundle. 
SmallVector ReuseShuffleIndicies; - SmallVector UniqueValues; - if (VL.size() > 2) { + IOPList UniqueValues; + if (IL.size() > 2) { DenseMap UniquePositions; - for (Value *V : VL) { - auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + for (auto *I : IL) { + auto Res = UniquePositions.try_emplace(I->Op, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); - if (Res.second || isa(V)) - UniqueValues.emplace_back(V); + if (Res.second || isa(I->Op)) + UniqueValues.emplace_back(I); } // Do not shuffle single element or if number of unique values is not power // of 2. - if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 || + if (UniqueValues.size() == IL.size() || UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) ReuseShuffleIndicies.clear(); else - VL = UniqueValues; + IL = UniqueValues; } - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + VectorType *VecTy = VectorType::get(ScalarTy, IL.size()); - Value *V = Gather(VL, VecTy); + Value *V = Gather(IL, VecTy); if (!ReuseShuffleIndicies.empty()) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), ReuseShuffleIndicies, "shuffle"); @@ -3027,12 +3301,12 @@ IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " + << *E->Scalars[0]->MainOp << ".\n"); return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast(S.OpValue); + auto *VL0 = E->Scalars[0]->MainOp; Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3041,7 +3315,7 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3055,8 +3329,8 @@ return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = E->Scalars[0]->isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : E->Scalars[0]->MainOpcode; switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -3084,12 +3358,13 @@ } // Prepare the operand vector. 
- for (Value *V : E->Scalars) - Operands.push_back(cast(V)->getIncomingValueForBlock(IBB)); + for (auto *IOP : E->Scalars) + Operands.push_back( + cast(IOP->Op)->getIncomingValueForBlock(IBB)); Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(Operands); + Value *Vec = vectorizeTree(Operands, VL0); NewPhi->addIncoming(Vec, IBB); } @@ -3118,7 +3393,7 @@ E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3153,7 +3428,7 @@ E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3179,12 +3454,12 @@ case Instruction::FPTrunc: case Instruction::BitCast: { ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast(V)->getOperand(0)); + for (auto *IOP : E->Scalars) + INVL.push_back(IOP->OpInst->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *InVec = vectorizeTree(INVL); + Value *InVec = vectorizeTree(INVL, VL0); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3204,15 +3479,15 @@ case Instruction::FCmp: case Instruction::ICmp: { ValueList LHSV, RHSV; - for (Value *V : E->Scalars) { - LHSV.push_back(cast(V)->getOperand(0)); - RHSV.push_back(cast(V)->getOperand(1)); + for (auto *IOP : E->Scalars) { + LHSV.push_back(IOP->OpInst->getOperand(0)); + RHSV.push_back(IOP->OpInst->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *L = vectorizeTree(LHSV); - Value *R = vectorizeTree(RHSV); + Value *L = vectorizeTree(LHSV, VL0); + Value *R = vectorizeTree(RHSV, VL0); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3221,7 +3496,7 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->Scalars[0]->MainOpcode == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3237,17 +3512,17 @@ } case Instruction::Select: { ValueList TrueVec, FalseVec, CondVec; - for (Value *V : E->Scalars) { - CondVec.push_back(cast(V)->getOperand(0)); - TrueVec.push_back(cast(V)->getOperand(1)); - FalseVec.push_back(cast(V)->getOperand(2)); + for (auto *IOP : E->Scalars) { + CondVec.push_back(IOP->OpInst->getOperand(0)); + TrueVec.push_back(IOP->OpInst->getOperand(1)); + FalseVec.push_back(IOP->OpInst->getOperand(2)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *Cond = vectorizeTree(CondVec); - Value *True = vectorizeTree(TrueVec); - Value *False = vectorizeTree(FalseVec); + Value *Cond = vectorizeTree(CondVec, VL0); + Value *True = vectorizeTree(TrueVec, VL0); + Value *False = vectorizeTree(FalseVec, VL0); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3282,20 +3557,19 @@ case Instruction::Or: case Instruction::Xor: { ValueList LHSVL, RHSVL; - if (isa(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, - RHSVL); - else - for (Value *V : E->Scalars) { - auto *I = cast(V); - 
LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); - } + if (isa(VL0) && VL0->isCommutative()) { + reorderInputsAccordingToOpcode(E->Scalars[0]->MainOpcode, + E->Scalars, LHSVL, RHSVL); + } else + for (auto *IOP : E->Scalars) { + LHSVL.push_back(IOP->OpInst->getOperand(0)); + RHSVL.push_back(IOP->OpInst->getOperand(1)); + } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); + Value *LHS = vectorizeTree(LHSVL, VL0); + Value *RHS = vectorizeTree(RHSVL, VL0); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3303,7 +3577,7 @@ } Value *V = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(VL0->getOpcode()), LHS, RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); @@ -3322,10 +3596,11 @@ // sink them all the way down past store instructions. bool IsReorder = !E->ReorderIndices.empty(); if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); + InstructionsState S = getSameOpcode(E->Scalars, + E->ReorderIndices.front()); VL0 = cast(S.OpValue); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3338,7 +3613,7 @@ // ExternalUses list to make sure that an extract will be generated in the // future. Value *PO = LI->getPointerOperand(); - if (getTreeEntry(PO)) + if (getTreeEntry(dyn_cast(PO))) ExternalUses.push_back(ExternalUser(PO, cast(VecPtr), 0)); unsigned Alignment = LI->getAlignment(); @@ -3369,12 +3644,13 @@ unsigned AS = SI->getPointerAddressSpace(); ValueList ScalarStoreValues; - for (Value *V : E->Scalars) - ScalarStoreValues.push_back(cast(V)->getValueOperand()); + for (auto *IOP : E->Scalars) + ScalarStoreValues.push_back( + cast(IOP->OpInst)->getValueOperand()); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); - Value *VecValue = vectorizeTree(ScalarStoreValues); + Value *VecValue = vectorizeTree(ScalarStoreValues, VL0); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); @@ -3382,7 +3658,7 @@ // The pointer operand uses an in-tree scalar, so add the new BitCast to // ExternalUses to make sure that an extract will be generated in the // future. 
- if (getTreeEntry(ScalarPtr)) + if (getTreeEntry(dyn_cast(ScalarPtr))) ExternalUses.push_back(ExternalUser(ScalarPtr, cast(VecPtr), 0)); if (!Alignment) @@ -3399,22 +3675,22 @@ return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); ValueList Op0VL; - for (Value *V : E->Scalars) - Op0VL.push_back(cast(V)->getOperand(0)); + for (auto *IOP : E->Scalars) + Op0VL.push_back(cast(IOP->OpInst)->getOperand(0)); - Value *Op0 = vectorizeTree(Op0VL); + Value *Op0 = vectorizeTree(Op0VL, VL0); std::vector OpVecs; for (int j = 1, e = cast(VL0)->getNumOperands(); j < e; ++j) { ValueList OpVL; - for (Value *V : E->Scalars) - OpVL.push_back(cast(V)->getOperand(j)); + for (auto *IOP : E->Scalars) + OpVL.push_back(cast(IOP->OpInst)->getOperand(j)); - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(OpVL, VL0); OpVecs.push_back(OpVec); } @@ -3434,7 +3710,7 @@ } case Instruction::Call: { CallInst *CI = cast(VL0); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; @@ -3452,12 +3728,12 @@ OpVecs.push_back(CEI->getArgOperand(j)); continue; } - for (Value *V : E->Scalars) { - CallInst *CEI = cast(V); + for (auto *IOP : E->Scalars) { + CallInst *CEI = cast(IOP->OpInst); OpVL.push_back(CEI->getArgOperand(j)); } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(OpVL, VL0); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -3473,7 +3749,7 @@ // The scalar argument uses an in-tree scalar so we add the new vectorized // call to ExternalUses list to make sure that an extract will be // generated in the future. 
- if (ScalarArg && getTreeEntry(ScalarArg)) + if (ScalarArg && getTreeEntry(dyn_cast(ScalarArg))) ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); propagateIRFlags(V, E->Scalars, VL0); @@ -3487,25 +3763,25 @@ } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->Scalars[0]->isAltShuffle() && + ((Instruction::isBinaryOp(E->Scalars[0]->MainOpcode) && + Instruction::isBinaryOp(E->Scalars[0]->AltOpcode)) || + (Instruction::isCast(E->Scalars[0]->MainOpcode) && + Instruction::isCast(E->Scalars[0]->AltOpcode))) && "Invalid Shuffle Vector Operand"); Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); - setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(LHSVL); - RHS = vectorizeTree(RHSVL); + if (Instruction::isBinaryOp(E->Scalars[0]->MainOpcode)) { + reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); + setInsertPointAfterBundle(E->Scalars); + LHS = vectorizeTree(LHSVL, VL0); + RHS = vectorizeTree(RHSVL, VL0); } else { ValueList INVL; - for (Value *V : E->Scalars) - INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(INVL); + for (auto *IOP : E->Scalars) + INVL.push_back(IOP->OpInst->getOperand(0)); + setInsertPointAfterBundle(E->Scalars); + LHS = vectorizeTree(INVL, VL0); } if (E->VectorizedValue) { @@ -3514,28 +3790,30 @@ } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->Scalars[0]->MainOpcode)) { V0 = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(E->Scalars[0]->MainOpcode), LHS, + RHS); V1 = Builder.CreateBinOp( - static_cast(S.getAltOpcode()), LHS, RHS); + static_cast(E->Scalars[0]->AltOpcode), LHS, + RHS); } else { V0 = Builder.CreateCast( - static_cast(S.getOpcode()), LHS, VecTy); + static_cast(E->Scalars[0]->MainOpcode), LHS, + VecTy); V1 = Builder.CreateCast( - static_cast(S.getAltOpcode()), LHS, VecTy); + static_cast(E->Scalars[0]->AltOpcode), LHS, + VecTy); } // Create shuffle to take alternate operations from the vector. // Also, gather up main and alt scalar ops to propagate IR flags to // each vector operation. - ValueList OpScalars, AltScalars; + IOPList OpScalars, AltScalars; unsigned e = E->Scalars.size(); SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { - auto *OpInst = cast(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + if (E->Scalars[i]->OpInst->getOpcode() == E->Scalars[0]->AltOpcode) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -3545,8 +3823,10 @@ } Value *ShuffleMask = ConstantVector::get(Mask); - propagateIRFlags(V0, OpScalars); - propagateIRFlags(V1, AltScalars); + InstructionsState S = getSameOpcode(OpScalars); + propagateIRFlags(V0, OpScalars, S.OpValue); + S = getSameOpcode(AltScalars); + propagateIRFlags(V1, AltScalars, S.OpValue); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); if (Instruction *I = dyn_cast(V)) @@ -3573,9 +3853,10 @@ Value * BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) { - // All blocks must be scheduled before any instructions are inserted. 
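+ // Pseudo operations are materialized as real identity instructions + // (eliminatePseudos) before each block is scheduled, so scheduling only + // ever sees real instructions.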
for (auto &BSIter : BlocksSchedules) { - scheduleBlock(BSIter.second.get()); + BlockScheduling *BS = BSIter.second.get(); + eliminatePseudos(BS); + scheduleBlock(BS); } Builder.SetInsertPoint(&F->getEntryBlock().front()); @@ -3584,7 +3865,7 @@ // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].Scalars[0]->MainOp; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); @@ -3609,7 +3890,7 @@ }; // Extract all of the elements with the external uses. - for (const auto &ExternalUse : ExternalUses) { + for (auto &ExternalUse : ExternalUses) { Value *Scalar = ExternalUse.Scalar; llvm::User *User = ExternalUse.User; @@ -3617,7 +3898,7 @@ // has multiple uses of the same value. if (User && !is_contained(Scalar->users(), User)) continue; - TreeEntry *E = getTreeEntry(Scalar); + TreeEntry *E = getTreeEntry(dyn_cast(Scalar)); assert(E && "Invalid scalar"); assert(!E->NeedToGather && "Extracting from a gather list"); @@ -3699,7 +3980,10 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { - Value *Scalar = Entry->Scalars[Lane]; + Value *Scalar = Entry->Scalars[Lane]->Op; + + if (!Entry->Scalars[Lane]->isOpcodeOrAlt()) + continue; Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { @@ -3708,7 +3992,9 @@ LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); // It is legal to replace users in the ignorelist by undef. - assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && + assert((getTreeEntry(dyn_cast(U)) || + Entry->Scalars[Lane]->PseudoOp || + is_contained(UserIgnoreList, U)) && "Replacing out-of-tree value with undef"); } #endif @@ -3716,7 +4002,7 @@ Scalar->replaceAllUsesWith(Undef); } LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n"); - eraseInstruction(cast(Scalar)); + eraseInstruction(Entry->Scalars[Lane]->OpInst); } } @@ -3810,10 +4096,9 @@ // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. -bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, - BoUpSLP *SLP, - const InstructionsState &S) { - if (isa(S.OpValue)) +bool BoUpSLP::BlockScheduling::tryScheduleBundle( + ArrayRef IL, BoUpSLP *SLP) { + if (isa(IL[0]->MainOp)) return true; // Initialize the instruction bundle. @@ -3821,19 +4106,27 @@ ScheduleData *PrevInBundle = nullptr; ScheduleData *Bundle = nullptr; bool ReSchedule = false; - LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); + bool IsPseudo = false; + LLVM_DEBUG(dbgs() << "SLP: bundle: " << *IL[0]->MainOp << "\n"); // Make sure that the scheduling region contains all // instructions of the bundle. 
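+ // A bundle member whose opcode is neither the main nor the alternate opcode + // is tracked as a pseudo operation: besides the plain per-instruction entry, + // it gets an extra ScheduleData entry keyed by the bundle's (Parent, Opcode) + // pair.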
- for (Value *V : VL) { - if (!extendSchedulingRegion(V, S)) + for (InstructionOrPseudo *IOP : IL) + if (!extendSchedulingRegion(IOP)) return false; - } - for (Value *V : VL) { - ScheduleData *BundleMember = getScheduleData(V); + for (InstructionOrPseudo *IOP : IL) { + ScheduleData *BundleMember = + ScheduleDataMap[IOP->OpInst][std::make_pair(nullptr, 0)]; + if (IOP->isNonAlt()) { + BundleMember = getScheduleData(IOP); + IsPseudo = true; + } + if (BundleMember->isPartOfBundle()) + return false; assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); + assert(!BundleMember->isPartOfBundle() && "Already part of another bundle"); if (BundleMember->IsScheduled) { // A bundle member was scheduled as single instruction before and now // needs to be scheduled as part of the bundle. We just get rid of the // existing schedule. @@ -3850,22 +4143,28 @@ Bundle = BundleMember; } BundleMember->UnscheduledDepsInBundle = 0; + BundleMember->Opcode = IOP->MainOpcode; + BundleMember->Parent = IOP->getKey().first; Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; // Group the instructions to a bundle. BundleMember->FirstInBundle = Bundle; PrevInBundle = BundleMember; } - if (ScheduleEnd != OldScheduleEnd) { + if (ScheduleEnd != OldScheduleEnd || IsPseudo) { // The scheduling region got new instructions at the lower end (or it is a // new region for the first bundle). This makes it necessary to // recalculate all dependencies. // It is seldom that this needs to be done a second time after adding the // initial bundle to the region. for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - doForAllOpcodes(I, [](ScheduleData *SD) { - SD->clearDependencies(); - }); + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + continue; + for (auto It : ScheduleDataMap[I]) { + auto *SD = It.second; + if (SD && isInSchedulingRegion(SD)) + SD->clearDependencies(); + } } ReSchedule = true; } @@ -3893,18 +4192,33 @@ } } if (!Bundle->isReady()) { - cancelScheduling(VL, S.OpValue); + cancelScheduling(IL[0]); + // We have to clear all dependencies, since all values + // were calculated for the vectorized bundle. + for (auto *I = ScheduleStart; I != ScheduleEnd; + I = I->getNextNode()) { + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + continue; + for (auto It : ScheduleDataMap[I]) { + auto *SD = It.second; + if (SD && isInSchedulingRegion(SD)) + SD->clearDependencies(); + } + } + resetSchedule(); return false; } return true; } -void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, - Value *OpValue) { - if (isa(OpValue)) +void BoUpSLP::BlockScheduling::cancelScheduling(InstructionOrPseudo *IOP) { + if (!IOP->MainOp) return; - - ScheduleData *Bundle = getScheduleData(OpValue); + if (isa(IOP->MainOp)) + return; + auto Key = IOP->getKey(); + ScheduleData *Bundle = getScheduleData(IOP)->FirstInBundle; + assert(Bundle && "Could not find bundle"); LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); @@ -3914,57 +4228,69 @@ // Un-bundle: make single instructions out of the bundle. 
ScheduleData *BundleMember = Bundle; while (BundleMember) { - assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); + assert(BundleMember->FirstInBundle == Bundle && "Corrupt bundle links"); + assert(BundleMember->Parent == Key.first && + BundleMember->Opcode == Key.second && "Corrupt bundle"); BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; - if (BundleMember->UnscheduledDepsInBundle == 0) { - ReadyInsts.insert(BundleMember); + if (BundleMember->IsPseudo) { + ScheduleDataMap[BundleMember->Inst].erase(Key); + BundleMember->Opcode = 0; + BundleMember->Parent = nullptr; + } else { + BundleMember->Opcode = 0; + BundleMember->Parent = nullptr; + if (BundleMember->UnscheduledDepsInBundle == 0) { + ReadyInsts.insert(BundleMember); + } } BundleMember = Next; } } -BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { +BoUpSLP::ScheduleData * +BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { // Allocate a new ScheduleData for the instruction. if (ChunkPos >= ChunkSize) { - ScheduleDataChunks.push_back(llvm::make_unique(ChunkSize)); + ScheduleDataChunks.push_back( + llvm::make_unique(ChunkSize)); ChunkPos = 0; } return &(ScheduleDataChunks.back()[ChunkPos++]); } -bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, - const InstructionsState &S) { - if (getScheduleData(V, isOneOf(S, V))) +bool BoUpSLP::BlockScheduling::extendSchedulingRegion( + InstructionOrPseudo *IOP) { + if (getScheduleData(IOP)) return true; - Instruction *I = dyn_cast(V); - assert(I && "bundle member must be an instruction"); - assert(!isa(I) && "phi nodes don't need to be scheduled"); - auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { - ScheduleData *ISD = getScheduleData(I); - if (!ISD) + assert(!isa(IOP->OpInst) && "phi nodes don't need to be scheduled"); + auto &&CheckSheduleForI = [this, &IOP](Instruction *I) -> bool { + ScheduleData *ISD = ScheduleDataMap[I][std::make_pair(nullptr, 0)]; + if (!ISD || ISD->SchedulingRegionID != SchedulingRegionID) return false; assert(isInSchedulingRegion(ISD) && "ScheduleData not in scheduling region"); - ScheduleData *SD = allocateScheduleDataChunks(); - SD->Inst = I; - SD->init(SchedulingRegionID, S.OpValue); - ExtraScheduleDataMap[I][S.OpValue] = SD; + if (IOP->isNonAlt()) { + ScheduleData *SD = allocateScheduleDataChunks(); + SD->init(SchedulingRegionID, ISD, IOP->getKey().first, IOP->MainOpcode); + ScheduleDataMap[I][IOP->getKey()] = SD; + } return true; }; - if (CheckSheduleForI(I)) + if (CheckSheduleForI(IOP->OpInst)) return true; if (!ScheduleStart) { // It's the first instruction in the new region. 
- initScheduleData(I, I->getNextNode(), nullptr, nullptr); - ScheduleStart = I; - ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + initScheduleData(IOP->OpInst, IOP->OpInst->getNextNode(), nullptr, nullptr); + ScheduleStart = IOP->OpInst; + ScheduleEnd = IOP->OpInst->getNextNode(); + if (IOP->isNonAlt()) + CheckSheduleForI(IOP->OpInst); assert(ScheduleEnd && "tried to vectorize a terminator?"); - LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); + LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *IOP->Op + << "\n"); return true; } // Search up and down at the same time, because we don't know if the new @@ -3981,26 +4307,27 @@ } if (UpIter != UpperEnd) { - if (&*UpIter == I) { - initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); - ScheduleStart = I; - if (isOneOf(S, I) != I) - CheckSheduleForI(I); - LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I - << "\n"); + if (&*UpIter == IOP->OpInst) { + initScheduleData(IOP->OpInst, ScheduleStart, nullptr, + FirstLoadStoreInRegion); + ScheduleStart = IOP->OpInst; + if (IOP->isNonAlt()) + CheckSheduleForI(IOP->OpInst); + LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " + << *IOP->Op << "\n"); return true; } UpIter++; } if (DownIter != LowerEnd) { - if (&*DownIter == I) { - initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, - nullptr); - ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + if (&*DownIter == IOP->OpInst) { + initScheduleData(ScheduleEnd, IOP->OpInst->getNextNode(), + LastLoadStoreInRegion, nullptr); + ScheduleEnd = IOP->OpInst->getNextNode(); + if (IOP->isNonAlt()) + CheckSheduleForI(IOP->OpInst); assert(ScheduleEnd && "tried to vectorize a terminator?"); - LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I + LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *IOP->Op << "\n"); return true; } @@ -4012,21 +4339,20 @@ return true; } -void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, - Instruction *ToI, - ScheduleData *PrevLoadStore, - ScheduleData *NextLoadStore) { +void BoUpSLP::BlockScheduling::initScheduleData( + Instruction *FromI, Instruction *ToI, ScheduleData *PrevLoadStore, + ScheduleData *NextLoadStore) { ScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { - ScheduleData *SD = ScheduleDataMap[I]; + ScheduleData *SD = ScheduleDataMap[I][std::make_pair(nullptr, 0)]; if (!SD) { SD = allocateScheduleDataChunks(); - ScheduleDataMap[I] = SD; + ScheduleDataMap[I][std::make_pair(nullptr, 0)] = SD; SD->Inst = I; } assert(!isInSchedulingRegion(SD) && "new ScheduleData already in scheduling region"); - SD->init(SchedulingRegionID, I); + SD->init(SchedulingRegionID); if (I->mayReadOrWriteMemory() && (!isa(I) || @@ -4061,8 +4387,13 @@ WorkList.pop_back(); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { assert(isInSchedulingRegion(BundleMember)); + assert(BundleMember->Opcode == Opcode && + BundleMember->Parent == Parent && "Corrupt bundle member"); + if (!BundleMember->hasValidDependencies()) { LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember @@ -4071,35 +4402,27 @@ BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. 
- if (BundleMember->OpValue != BundleMember->Inst) { - ScheduleData *UseSD = getScheduleData(BundleMember->Inst); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - for (User *U : BundleMember->Inst->users()) { - if (isa(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and - // eventually disable vectorization. + for (User *U : BundleMember->Inst->users()) { + if (auto *I = dyn_cast(U)) { + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + continue; + for (auto It : ScheduleDataMap[I]) { + ScheduleData *UseSD = It.second; + if (!UseSD || !isInSchedulingRegion(UseSD)) + continue; BundleMember->Dependencies++; - BundleMember->incrementUnscheduledDeps(1); + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); } + } else { + // I'm not sure if this can ever happen. But we need to be safe. + // This lets the instruction/bundle never be scheduled and + // eventually disable vectorization. + BundleMember->Dependencies++; + BundleMember->incrementUnscheduledDeps(1); } } @@ -4108,7 +4431,7 @@ if (DepDest) { Instruction *SrcInst = BundleMember->Inst; MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA); - bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory(); + bool SrcMayWrite = SrcInst->mayWriteToMemory(); unsigned numAliased = 0; unsigned DistToSrc = 1; @@ -4132,14 +4455,23 @@ // balance between reduced runtime and accurate dependencies. numAliased++; - DepDest->MemoryDependencies.push_back(BundleMember); - BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); + // We don't want any duplicates in the set, in order to have + // correct dependencies. 
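+ // An instruction can now carry several ScheduleData entries (one per + // (Parent, Opcode) key), so the memory dependence is recorded on every + // entry of DepDest->Inst that lies in the current scheduling region.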
+ if (ScheduleDataMap.find(DepDest->Inst) != ScheduleDataMap.end()) { + for (auto It : ScheduleDataMap[DepDest->Inst]) { + auto *Dep = It.second; + if (!Dep || !isInSchedulingRegion(Dep)) + continue; + Dep->MemoryDependencies.push_back(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = Dep->FirstInBundle; + if (!DestBundle->IsScheduled) { + BundleMember->incrementUnscheduledDeps(1); + } + if (!DestBundle->hasValidDependencies()) { + WorkList.push_back(DestBundle); + } + } } } DepDest = DepDest->NextLoadStore; @@ -4177,22 +4509,102 @@ assert(ScheduleStart && "tried to reset schedule on block which has not been scheduled"); for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - doForAllOpcodes(I, [&](ScheduleData *SD) { - assert(isInSchedulingRegion(SD) && - "ScheduleData not in scheduling region"); + if (ScheduleDataMap.find(I) == ScheduleDataMap.end()) + continue; + for (auto It : ScheduleDataMap[I]) { + auto *SD = It.second; + if (!SD || !isInSchedulingRegion(SD)) + continue; SD->IsScheduled = false; SD->resetUnscheduledDeps(); - }); + } } ReadyInsts.clear(); } +void BoUpSLP::eliminatePseudos(BlockScheduling *BS) { + if (!BS->ScheduleStart) + return; + bool Changed = false; + for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + SmallSet, 2> Keys; + SmallDenseMap NewOps; + for (auto It : BS->ScheduleDataMap[I]) { + auto *SD = It.second; + if (!SD || !BS->isInSchedulingRegion(SD) || !SD->IsPseudo) + continue; + auto Key = It.first; + // Remove any pseudo instruction that is + // left over from a canceled bundle. + if (!SD->isPartOfBundle()) { + Keys.insert(Key); + continue; + } + Changed = true; + // Emit a real instruction. + Builder.SetInsertPoint(I->getNextNode()); + Constant *C = + ConstantExpr::getBinOpIdentity(SD->Opcode, SD->Inst->getType(), true); + Value *V = Builder.CreateBinOp( + static_cast(SD->Opcode), I, C); + Instruction *NewOp = cast(V); + // Change instruction in ScheduleData. + SD->Inst = NewOp; + SD->IsPseudo = false; + // Remove the pseudo instruction from ScheduleDataMap + // and add the real instruction. + NewOps[NewOp] = SD; + Keys.insert(Key); + std::swap(ScalarToTreeEntry[NewOp][Key], ScalarToTreeEntry[I][Key]); + ScalarToTreeEntry[I].erase(Key); + TreeEntry *E = &VectorizableTree[ScalarToTreeEntry[NewOp][Key]]; + llvm::propagateIRFlags(V, E->Scalars[0]->MainOp); + V = propagateMetadata(NewOp, E->Scalars); + // Replace instruction in TreeEntry. + for (auto *IOP : E->Scalars) { + if (IOP->OpInst == I) { + IOP->Op = NewOp; + IOP->OpInst = NewOp; + IOP->PseudoOp = I; + } + } + } + for (auto Key : Keys) + BS->ScheduleDataMap[I].erase(Key); + for (auto Ops : NewOps) { + auto *SD = Ops.second; + BS->ScheduleDataMap[Ops.first][std::make_pair(SD->Parent, SD->Opcode)] = + SD; + } + } + // We have to recalculate all dependencies in + // the scheduling region if any pseudo instructions + // were replaced. 
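+ // The pseudo entries that belonged to a bundle now point at freshly created + // identity operations emitted right after the original scalar (e.g. for a + // lane '%y = mul i32 %a, %b' in an add-bundle a new 'add i32 %y, 0' is + // created), so dependencies computed against the old entries are stale.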
+ if (Changed) { + BS->resetSchedule(); + BS->initialFillReadyList(BS->ReadyInsts); + for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; + I = I->getNextNode()) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + for (auto It : BS->ScheduleDataMap[I]) { + auto *SD = It.second; + if (SD && SD->SchedulingRegionID == BS->SchedulingRegionID) { + SD->clearDependencies(); + } + } + } + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n"); - BS->resetSchedule(); // For the real scheduling we use a more sophisticated ready-list: it is @@ -4211,16 +4623,24 @@ int NumToSchedule = 0; for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { - BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { + if (BS->ScheduleDataMap.find(I) == BS->ScheduleDataMap.end()) + continue; + for (auto It : BS->ScheduleDataMap[I]) { + auto *SD = It.second; + if (!SD || !BS->isInSchedulingRegion(SD)) + continue; + std::pair Key = std::make_pair(SD->Parent, SD->Opcode); assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst) != nullptr) && + (ScalarToTreeEntry.find(SD->Inst) != ScalarToTreeEntry.end() && + ScalarToTreeEntry[SD->Inst].find(Key) != + ScalarToTreeEntry[SD->Inst].end()) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { BS->calculateDependencies(SD, false, this); NumToSchedule++; } - }); + } } BS->initialFillReadyList(ReadyInsts); @@ -4234,20 +4654,39 @@ // Move the scheduled instruction(s) to their dedicated places, if not // there yet. ScheduleData *BundleMember = picked; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - Instruction *pickedInst = BundleMember->Inst; - if (LastScheduledInst->getNextNode() != pickedInst) { - BS->BB->getInstList().remove(pickedInst); + assert(Opcode == BundleMember->Opcode && + Parent == BundleMember->Parent && "Corrupt bundle member"); + Instruction *PickedInst = BundleMember->Inst; + if (LastScheduledInst != PickedInst) { + BS->BB->getInstList().remove(PickedInst); BS->BB->getInstList().insert(LastScheduledInst->getIterator(), - pickedInst); + PickedInst); } - LastScheduledInst = pickedInst; + LastScheduledInst = PickedInst; BundleMember = BundleMember->NextInBundle; } - BS->schedule(picked, ReadyInsts); NumToSchedule--; } +#ifndef NDEBUG + LLVM_DEBUG(dbgs() << *BS->BB << "\n"); + if (NumToSchedule != 0) { + for (BasicBlock::iterator I = BS->BB->begin(), E = BS->BB->end(); I != E; + ++I) { + if (BS->ScheduleDataMap.find(&*I) == BS->ScheduleDataMap.end()) + continue; + for (auto It : BS->ScheduleDataMap[&*I]) { + auto *SD = It.second; + if (SD && BS->isInSchedulingRegion(SD) && + SD->isSchedulingEntity() && SD->UnscheduledDepsInBundle != 0) + LLVM_DEBUG(dbgs() << "SLP: Failed to schedule: " << *SD << "\n"); + } + } + } +#endif assert(NumToSchedule == 0 && "could not schedule all instructions"); // Avoid duplicate scheduling of the block. @@ -4392,7 +4831,7 @@ // We only attempt to truncate integer expressions. auto &TreeRoot = VectorizableTree[0].Scalars; - auto *TreeRootIT = dyn_cast(TreeRoot[0]->getType()); + auto *TreeRootIT = dyn_cast(TreeRoot[0]->Op->getType()); if (!TreeRootIT) return; @@ -4402,7 +4841,9 @@ // This means that if a tree entry other than a root is used externally, it // must have multiple uses and InstCombine will not rewrite it. 
The code // below ensures that only the roots are used externally. - SmallPtrSet Expr(TreeRoot.begin(), TreeRoot.end()); + SmallPtrSet Expr; + for (auto *IOP : TreeRoot) + Expr.insert(IOP->Op); for (auto &EU : ExternalUses) if (!Expr.erase(EU.Scalar)) return; @@ -4413,12 +4854,13 @@ // context to determine which values can be demoted. If we see a truncation, // we mark it as seeding another demotion. for (auto &Entry : VectorizableTree) - Expr.insert(Entry.Scalars.begin(), Entry.Scalars.end()); + for (auto *IOP: Entry.Scalars) + Expr.insert(IOP->Op); // Ensure the roots of the vectorizable tree don't form a cycle. They must // have a single external user that is not in the vectorizable tree. for (auto *Root : TreeRoot) - if (!Root->hasOneUse() || Expr.count(*Root->user_begin())) + if (!Root->Op->hasOneUse() || Expr.count(*Root->Op->user_begin())) return; // Conservatively determine if we can actually truncate the roots of the @@ -4427,7 +4869,7 @@ SmallVector ToDemote; SmallVector Roots; for (auto *Root : TreeRoot) - if (!collectValuesToDemote(Root, Expr, ToDemote, Roots)) + if (!collectValuesToDemote(Root->Op, Expr, ToDemote, Roots)) return; // The maximum bit width required to represent all the values that can be @@ -4438,7 +4880,7 @@ // We first check if all the bits of the roots are demanded. If they're not, // we can truncate the roots to this narrower type. for (auto *Root : TreeRoot) { - auto Mask = DB->getDemandedBits(cast(Root)); + auto Mask = DB->getDemandedBits(Root->OpInst); MaxBitWidth = std::max( Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth); } @@ -4457,17 +4899,17 @@ // We start by looking at each entry that can be demoted. We compute the // maximum bit width required to store the scalar by using ValueTracking to // compute the number of high-order bits we can truncate. - if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) && - llvm::all_of(TreeRoot, [](Value *R) { - assert(R->hasOneUse() && "Root should have only one use!"); - return isa(R->user_back()); + if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->Op->getType()) && + llvm::all_of(TreeRoot, [](InstructionOrPseudo *R) { + assert(R->Op->hasOneUse() && "Root should have only one use!"); + return isa(R->Op->user_back()); })) { MaxBitWidth = 8u; // Determine if the sign bit of all the roots is known to be zero. If not, // IsKnownPositive is set to False. - IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) { - KnownBits Known = computeKnownBits(R, *DL); + IsKnownPositive = llvm::all_of(TreeRoot, [&](InstructionOrPseudo *R) { + KnownBits Known = computeKnownBits(R->Op, *DL); return Known.isNonNegative(); }); @@ -4865,9 +5307,21 @@ // Check that all of the parts are scalar instructions of the same type, // we permit an alternate opcode via InstructionsState. 
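+ // The InstructionOrPseudo wrappers built below are stack-local and are used + // only to query getSameOpcode(); any lane whose opcode is neither the main + // nor the alternate opcode of S makes the whole list non-vectorizable here.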
- InstructionsState S = getSameOpcode(VL); + SmallVector ArrayIL; + SmallVector IL; + + for (Value *V: VL) + ArrayIL.push_back(InstructionOrPseudo(VL[0], V)); + for (unsigned idx=0; idx < ArrayIL.size(); idx++) + IL.push_back(&ArrayIL[idx]); + InstructionsState S = getSameOpcode(IL); if (!S.getOpcode()) return false; + for (Value *V : VL) { + auto *I = dyn_cast(V); + if (isOneOf(S, I) != I) + return false; + } Instruction *I0 = cast(S.OpValue); unsigned Sz = R.getVectorElementSize(I0); Index: test/Transforms/SLPVectorizer/X86/PR39774.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR39774.ll +++ test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -3,185 +3,93 @@ ; RUN: opt -slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake -slp-threshold=-8 -slp-min-tree-size=6 | FileCheck %s --check-prefixes=ALL,FORCE_REDUCTION define void @Test(i32) { -; CHECK-LABEL: @Test( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> , [[SHUFFLE]] -; CHECK-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef -; CHECK-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]] -; CHECK-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]] -; CHECK-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]] -; CHECK-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]] -; CHECK-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], undef -; CHECK-NEXT: [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]] -; CHECK-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]] -; CHECK-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]] -; CHECK-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], undef -; CHECK-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]] -; CHECK-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]] -; CHECK-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]] -; CHECK-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]] -; CHECK-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]] -; CHECK-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], undef -; CHECK-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], undef -; CHECK-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]] -; CHECK-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]] -; CHECK-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]] -; CHECK-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]] -; CHECK-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]] -; CHECK-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]] -; CHECK-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]] -; CHECK-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]] -; CHECK-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]] -; CHECK-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]] -; CHECK-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]] -; CHECK-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]] -; CHECK-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], undef -; CHECK-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]] -; CHECK-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]] -; CHECK-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]] -; CHECK-NEXT: [[VAL_40:%.*]] = and i32 [[VAL_38]], undef -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP3]], 
[[RDX_SHUF]] -; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> -; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 -; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP4]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP0]] -; CHECK-NEXT: [[OP_EXTRA30:%.*]] = and i32 [[OP_EXTRA29]], [[TMP0]] -; CHECK-NEXT: [[VAL_42:%.*]] = and i32 [[VAL_40]], undef -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[OP_EXTRA30]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP2]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 14910, i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> undef, i32 [[TMP12]], i32 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1 -; CHECK-NEXT: br label [[LOOP]] -; -; FORCE_REDUCTION-LABEL: @Test( -; FORCE_REDUCTION-NEXT: entry: -; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] -; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = 
shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> -; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> , [[SHUFFLE]] -; FORCE_REDUCTION-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef -; FORCE_REDUCTION-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]] -; FORCE_REDUCTION-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], undef -; FORCE_REDUCTION-NEXT: [[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], undef -; FORCE_REDUCTION-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], undef -; FORCE_REDUCTION-NEXT: [[VAL_20:%.*]] = add i32 [[TMP2]], 1496 -; FORCE_REDUCTION-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], [[VAL_20]] -; FORCE_REDUCTION-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_34:%.*]] = add i32 [[TMP2]], 8555 -; FORCE_REDUCTION-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], [[VAL_34]] -; FORCE_REDUCTION-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> -; FORCE_REDUCTION-NEXT: [[BIN_RDX:%.*]] = and <4 x i32> [[TMP3]], [[RDX_SHUF]] -; FORCE_REDUCTION-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> -; FORCE_REDUCTION-NEXT: [[BIN_RDX2:%.*]] = and <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]] -; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], [[VAL_20]] -; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = and i32 [[TMP5]], [[VAL_34]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP6]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA3:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA4:%.*]] = and i32 [[OP_EXTRA3]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA4]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA6:%.*]] = and i32 
[[OP_EXTRA5]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA15:%.*]] = and i32 [[OP_EXTRA14]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP2]] -; FORCE_REDUCTION-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]] -; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 -; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA29]], [[VAL_39]] -; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 -; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_41]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 14910, i32 1 -; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> -; FORCE_REDUCTION-NEXT: br label [[LOOP]] +; ALL-LABEL: @Test( +; ALL-NEXT: entry: +; ALL-NEXT: br label [[LOOP:%.*]] +; ALL: loop: +; ALL-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP11:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; ALL-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> +; ALL-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 +; ALL-NEXT: [[TMP3:%.*]] = add <8 x i32> , [[SHUFFLE]] +; ALL-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef +; ALL-NEXT: [[VAL_2:%.*]] = and i32 [[VAL_1]], [[TMP0:%.*]] +; ALL-NEXT: [[VAL_3:%.*]] = and i32 [[VAL_2]], [[TMP0]] +; ALL-NEXT: [[VAL_4:%.*]] = and i32 [[VAL_3]], [[TMP0]] +; ALL-NEXT: [[VAL_5:%.*]] = and i32 [[VAL_4]], [[TMP0]] +; ALL-NEXT: [[VAL_7:%.*]] = and i32 [[VAL_5]], undef +; ALL-NEXT: 
[[VAL_8:%.*]] = and i32 [[VAL_7]], [[TMP0]] +; ALL-NEXT: [[VAL_9:%.*]] = and i32 [[VAL_8]], [[TMP0]] +; ALL-NEXT: [[VAL_10:%.*]] = and i32 [[VAL_9]], [[TMP0]] +; ALL-NEXT: [[VAL_12:%.*]] = and i32 [[VAL_10]], undef +; ALL-NEXT: [[VAL_13:%.*]] = and i32 [[VAL_12]], [[TMP0]] +; ALL-NEXT: [[VAL_14:%.*]] = and i32 [[VAL_13]], [[TMP0]] +; ALL-NEXT: [[VAL_15:%.*]] = and i32 [[VAL_14]], [[TMP0]] +; ALL-NEXT: [[VAL_16:%.*]] = and i32 [[VAL_15]], [[TMP0]] +; ALL-NEXT: [[VAL_17:%.*]] = and i32 [[VAL_16]], [[TMP0]] +; ALL-NEXT: [[VAL_19:%.*]] = and i32 [[VAL_17]], undef +; ALL-NEXT: [[VAL_21:%.*]] = and i32 [[VAL_19]], undef +; ALL-NEXT: [[VAL_22:%.*]] = and i32 [[VAL_21]], [[TMP0]] +; ALL-NEXT: [[VAL_23:%.*]] = and i32 [[VAL_22]], [[TMP0]] +; ALL-NEXT: [[VAL_24:%.*]] = and i32 [[VAL_23]], [[TMP0]] +; ALL-NEXT: [[VAL_25:%.*]] = and i32 [[VAL_24]], [[TMP0]] +; ALL-NEXT: [[VAL_26:%.*]] = and i32 [[VAL_25]], [[TMP0]] +; ALL-NEXT: [[VAL_27:%.*]] = and i32 [[VAL_26]], [[TMP0]] +; ALL-NEXT: [[VAL_28:%.*]] = and i32 [[VAL_27]], [[TMP0]] +; ALL-NEXT: [[VAL_29:%.*]] = and i32 [[VAL_28]], [[TMP0]] +; ALL-NEXT: [[VAL_30:%.*]] = and i32 [[VAL_29]], [[TMP0]] +; ALL-NEXT: [[VAL_31:%.*]] = and i32 [[VAL_30]], [[TMP0]] +; ALL-NEXT: [[VAL_32:%.*]] = and i32 [[VAL_31]], [[TMP0]] +; ALL-NEXT: [[VAL_33:%.*]] = and i32 [[VAL_32]], [[TMP0]] +; ALL-NEXT: [[VAL_35:%.*]] = and i32 [[VAL_33]], undef +; ALL-NEXT: [[VAL_36:%.*]] = and i32 [[VAL_35]], [[TMP0]] +; ALL-NEXT: [[VAL_37:%.*]] = and i32 [[VAL_36]], [[TMP0]] +; ALL-NEXT: [[VAL_38:%.*]] = and i32 [[VAL_37]], [[TMP0]] +; ALL-NEXT: [[VAL_40:%.*]] = and i32 [[VAL_38]], undef +; ALL-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0 +; ALL-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[TMP2]], i32 1 +; ALL-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP3]], i32 7 +; ALL-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP6]], i32 0 +; ALL-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 14910, i32 1 +; ALL-NEXT: [[TMP9:%.*]] = and <2 x i32> [[TMP5]], [[TMP8]] +; ALL-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP5]], [[TMP8]] +; ALL-NEXT: [[TMP11]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> +; ALL-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> undef, <8 x i32> +; ALL-NEXT: [[BIN_RDX:%.*]] = and <8 x i32> [[TMP3]], [[RDX_SHUF]] +; ALL-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> +; ALL-NEXT: [[BIN_RDX2:%.*]] = and <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]] +; ALL-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> +; ALL-NEXT: [[BIN_RDX4:%.*]] = and <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] +; ALL-NEXT: [[TMP12:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0 +; ALL-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP12]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA5:%.*]] = and i32 [[OP_EXTRA]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA6:%.*]] = and i32 [[OP_EXTRA5]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA7:%.*]] = and i32 [[OP_EXTRA6]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA8:%.*]] = and i32 [[OP_EXTRA7]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA9:%.*]] = and i32 [[OP_EXTRA8]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA10:%.*]] = and i32 [[OP_EXTRA9]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA11:%.*]] = and i32 [[OP_EXTRA10]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA12:%.*]] = and i32 [[OP_EXTRA11]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA13:%.*]] = and i32 [[OP_EXTRA12]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA14:%.*]] = and i32 [[OP_EXTRA13]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA15:%.*]] = and 
i32 [[OP_EXTRA14]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA16:%.*]] = and i32 [[OP_EXTRA15]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA17:%.*]] = and i32 [[OP_EXTRA16]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA18:%.*]] = and i32 [[OP_EXTRA17]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA19:%.*]] = and i32 [[OP_EXTRA18]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA20:%.*]] = and i32 [[OP_EXTRA19]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA21:%.*]] = and i32 [[OP_EXTRA20]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA22:%.*]] = and i32 [[OP_EXTRA21]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA23:%.*]] = and i32 [[OP_EXTRA22]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA24:%.*]] = and i32 [[OP_EXTRA23]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA25:%.*]] = and i32 [[OP_EXTRA24]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA26:%.*]] = and i32 [[OP_EXTRA25]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA27:%.*]] = and i32 [[OP_EXTRA26]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA28:%.*]] = and i32 [[OP_EXTRA27]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA29:%.*]] = and i32 [[OP_EXTRA28]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA30:%.*]] = and i32 [[OP_EXTRA29]], [[TMP0]] +; ALL-NEXT: [[OP_EXTRA31:%.*]] = and i32 [[OP_EXTRA30]], [[TMP2]] +; ALL-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 +; ALL-NEXT: br label [[LOOP]] ; entry: br label %loop Index: test/Transforms/SLPVectorizer/X86/PR40310.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR40310.ll +++ test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -7,7 +7,7 @@ ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[PARAM:%.*]], i32 1 ; CHECK-NEXT: br label [[BCI_15:%.*]] ; CHECK: bci_15: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP11:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 @@ -28,6 +28,13 @@ ; CHECK-NEXT: [[V38:%.*]] = and i32 undef, [[V36]] ; CHECK-NEXT: [[V40:%.*]] = and i32 undef, [[V38]] ; CHECK-NEXT: [[V42:%.*]] = and i32 undef, [[V40]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <16 x i32> [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP6]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> , i32 [[V42]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = and <2 x i32> [[TMP7]], [[TMP8]] +; CHECK-NEXT: [[TMP11]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX:%.*]] = and <16 x i32> [[TMP4]], [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> @@ -36,12 +43,9 @@ ; CHECK-NEXT: [[BIN_RDX4:%.*]] = and <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]] ; CHECK-NEXT: [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> ; CHECK-NEXT: [[BIN_RDX6:%.*]] = and <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]] -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 -; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]] -; CHECK-NEXT: [[V43:%.*]] = and i32 undef, [[V42]] -; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 -; 
CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[V44]], i32 0 -; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 +; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP12]], [[TMP2]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 ; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]] ; CHECK: loopexit: ; CHECK-NEXT: ret void Index: test/Transforms/SLPVectorizer/X86/pr35497.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/pr35497.ll +++ test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -12,20 +12,20 @@ define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 { ; CHECK-LABEL: @_ZN1C10SwitchModeEv( ; CHECK-NEXT: for.body.lr.ph.i: -; CHECK-NEXT: [[OR_1:%.*]] = or i64 undef, 1 -; CHECK-NEXT: store i64 [[OR_1]], i64* undef, align 8 +; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> undef, i64 [[BAR5]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: store i64 [[TMP2]], i64* undef, align 8 ; CHECK-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 -; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP1]], [[TMP4]] ; CHECK-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8 ; CHECK-NEXT: ret void ; for.body.lr.ph.i: Index: test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -43,22 +43,16 @@ ; CHECK-LABEL: @add1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1 ; 
CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -86,22 +80,16 @@ ; CHECK-LABEL: @sub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -205,22 +193,18 @@ ; CHECK-LABEL: @addsub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 
4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -248,22 +232,18 @@ ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -291,22 +271,16 @@ ; CHECK-LABEL: @mul( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; 
CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9 -; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -334,22 +308,16 @@ ; CHECK-LABEL: @shl0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -453,22 +421,16 @@ ; CHECK-LABEL: @add1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: 
[[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -496,22 +458,16 @@ ; CHECK-LABEL: @sub0f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -615,22 +571,18 @@ ; CHECK-LABEL: @addsub0f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], 
align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -658,22 +610,18 @@ ; CHECK-LABEL: @addsub1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -701,22 +649,16 @@ ; CHECK-LABEL: @mulf( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = 
load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 -; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -825,22 +767,16 @@ ; CHECK-LABEL: @sub0fn( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: