Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -299,6 +299,22 @@
            : TargetTransformInfo::SK_PermuteSingleSrc;
 }
 
+/// Checks if the \p Opcode can be considered as an operand of a (possibly)
+/// binary operation \p I.
+/// \returns The code of the binary operation of instruction \p I if the
+/// instruction with \p Opcode can be considered as an operand of \p I with the
+/// default value.
+static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
+  if (I->getOpcode() != Instruction::PHI &&
+      I->getOpcode() != Instruction::SRem &&
+      I->getOpcode() != Instruction::URem &&
+      I->getOpcode() != Instruction::FRem &&
+      (I->getType()->isIntegerTy() ||
+       (isa<FPMathOperator>(I) && cast<FPMathOperator>(I)->isFast())))
+    return I->getOpcode();
+  return 0;
+}
+
 namespace {
 
 /// Main data required for vectorization of instructions.
@@ -320,14 +336,15 @@
   }
 
   /// Some of the instructions in the list have alternate opcodes.
-  bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+  bool isAltShuffle() const { return (getOpcode() != 0 && getAltOpcode() != 0 &&
+                                      getOpcode() != getAltOpcode()); }
 
   bool isOpcodeOrAlt(Instruction *I) const {
     unsigned CheckedOpcode = I->getOpcode();
     return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
   }
 
-  InstructionsState() = delete;
+  InstructionsState() = default;
   InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
       : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
 };
@@ -353,41 +370,92 @@
   if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
     return InstructionsState(VL[BaseIndex], nullptr, nullptr);
 
+  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
   bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
   bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
-  unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
+  bool IsNonAlt = false;
   unsigned AltOpcode = Opcode;
+  unsigned OpcodeNum = 0;
+  unsigned AltOpcodeNum = 0;
+  unsigned NonAltNum = 0;
+  unsigned NonAltIndex = 0;
   unsigned AltIndex = BaseIndex;
 
-  // Check for one alternate opcode from another BinaryOperator.
-  // TODO - generalize to support all operators (types, calls etc.).
+  // Check for an alternate opcode pattern.
for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { - unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); - if (IsBinOp && isa(VL[Cnt])) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) - continue; - if (Opcode == AltOpcode) { - AltOpcode = InstOpcode; - AltIndex = Cnt; - continue; - } - } else if (IsCastOp && isa(VL[Cnt])) { + auto *I = cast(VL[Cnt]); + unsigned InstOpcode = I->getOpcode(); + if (IsCastOp && isa(VL[Cnt])) { Type *Ty0 = cast(VL[BaseIndex])->getOperand(0)->getType(); Type *Ty1 = cast(VL[Cnt])->getOperand(0)->getType(); if (Ty0 == Ty1) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) + if (InstOpcode == Opcode) { + OpcodeNum++; + continue; + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; continue; + } if (Opcode == AltOpcode) { AltOpcode = InstOpcode; AltIndex = Cnt; + AltOpcodeNum++; continue; } } - } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } + if (InstOpcode == Opcode) { + OpcodeNum++; continue; - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; + continue; + } + if (InstOpcode != Opcode && InstOpcode != AltOpcode) { + if (IsBinOp && AltOpcode == Opcode && isa(I)) { + AltOpcode = InstOpcode; + AltOpcodeNum++; + AltIndex = Cnt; + continue; + } + if (Opcode != Instruction::PHI && + (tryToRepresentAsInstArg(Opcode, I) || + (IsBinOp && InstOpcode != Instruction::PHI && + tryToRepresentAsInstArg(InstOpcode, + cast(VL[BaseIndex]))))) { + if (!IsNonAlt) { + NonAltIndex = Cnt; + IsNonAlt = true; + } + NonAltNum++; + continue; + } + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } } + if (IsNonAlt && VL.size() > 2 && (OpcodeNum + AltOpcodeNum) <= NonAltNum) { + BaseIndex = NonAltIndex; + AltIndex = BaseIndex; + Opcode = cast(VL[BaseIndex])->getOpcode(); + AltOpcode = Opcode; + IsBinOp = isa(VL[BaseIndex]); + for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { + auto *I = cast(VL[Cnt]); + unsigned InstOpcode = I->getOpcode(); + if (Opcode == AltOpcode && IsBinOp && isa(I)) { + AltOpcode = InstOpcode; + AltIndex = Cnt; + } + } + } + + if (IsNonAlt && !IsBinOp) + return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), cast(VL[AltIndex])); } @@ -701,10 +769,14 @@ /// The TreeEntry index containing the user of this entry. We can actually /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; + + /// Info about instruction in this tree entry. + InstructionsState State; }; /// Create a new VectorizableTree entry. 
void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, + const InstructionsState &S, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { VectorizableTree.emplace_back(VectorizableTree); @@ -716,11 +788,20 @@ ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; if (Vectorized) { + Last->State = S; for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = idx; + assert(!getTreeEntry(VL[i], S.getOpcode()) && "Scalar already in tree!"); + ScalarToTreeEntry[VL[i]][S.getOpcode()] = idx; } } else { + for (Value *V: VL) { + if (Instruction *I = dyn_cast(V)) { + Last->State.MainOp = I; + Last->State.AltOp = I; + break; + } + } + Last->State.OpValue = VL[0]; MustGather.insert(VL.begin(), VL.end()); } @@ -735,13 +816,29 @@ TreeEntry *getTreeEntry(Value *V) { auto I = ScalarToTreeEntry.find(V); - if (I != ScalarToTreeEntry.end()) - return &VectorizableTree[I->second]; + if (I != ScalarToTreeEntry.end()) { + auto &STT = I->second; + for (auto STTI : STT) { + if (isOneOf(VectorizableTree[STTI.second].State, V) == V) + return &VectorizableTree[STTI.second]; + } + } + return nullptr; + } + + TreeEntry *getTreeEntry(Value *V, unsigned Opcode) { + auto I = ScalarToTreeEntry.find(V); + if (I != ScalarToTreeEntry.end()) { + auto &STT = I->second; + auto STTI = STT.find(Opcode); + if (STTI != STT.end()) + return &VectorizableTree[STTI->second]; + } return nullptr; } /// Maps a specific scalar to its tree entry. - SmallDenseMap ScalarToTreeEntry; + SmallDenseMap> ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -831,19 +928,6 @@ // dependencies are not calculated yet. enum { InvalidDeps = -1 }; - ScheduleData() = default; - - void init(int BlockSchedulingRegionID, Value *OpVal) { - FirstInBundle = this; - NextInBundle = nullptr; - NextLoadStore = nullptr; - IsScheduled = false; - SchedulingRegionID = BlockSchedulingRegionID; - UnscheduledDepsInBundle = UnscheduledDeps; - clearDependencies(); - OpValue = OpVal; - } - /// Returns true if the dependency information has been calculated. bool hasValidDependencies() const { return Dependencies != InvalidDeps; } @@ -885,24 +969,39 @@ MemoryDependencies.clear(); } + /// Get an instruction behind this ScheduleData instance. + virtual Instruction *getInst() const = 0; + + /// Returns true if the instance is a pseudo instruction one. + virtual bool isPseudo() const = 0; + void dump(raw_ostream &os) const { if (!isSchedulingEntity()) { - os << "/ " << *Inst; + os << "/ "; + if (isPseudo()) + os << "*"; + os << *getInst(); } else if (NextInBundle) { - os << '[' << *Inst; + os << '['; + if (isPseudo()) + os << "*"; + os << *getInst(); ScheduleData *SD = NextInBundle; while (SD) { - os << ';' << *SD->Inst; - SD = SD->NextInBundle; + os << ';' ; + if (SD->isPseudo()) + os << "*"; + os << *SD->getInst(); + SD = SD->NextInBundle; } os << ']'; } else { - os << *Inst; + if (isPseudo()) + os << "*"; + os << *getInst(); } } - Instruction *Inst = nullptr; - /// Points to the head in an instruction bundle (and always to this for /// single instructions). ScheduleData *FirstInBundle = nullptr; @@ -916,8 +1015,8 @@ ScheduleData *NextLoadStore = nullptr; /// The dependent memory instructions. - /// This list is derived on demand in calculateDependencies(). - SmallVector MemoryDependencies; + /// This set is derived on demand in calculateDependencies(). 
+ SmallPtrSet MemoryDependencies; /// This ScheduleData is in the current scheduling region if this matches /// the current SchedulingRegionID of BlockScheduling. @@ -946,7 +1045,64 @@ /// dry-run). bool IsScheduled = false; - /// Opcode of the current instruction in the schedule data. + /// Opcode that represents instructions to be vectorized. + unsigned Opcode = 0; + }; + + struct InstScheduleData : public ScheduleData { + + InstScheduleData() = default; + + Instruction *Inst = nullptr; + + void init(int BlockSchedulingRegionID) { + FirstInBundle = this; + NextInBundle = nullptr; + NextLoadStore = nullptr; + IsScheduled = false; + SchedulingRegionID = BlockSchedulingRegionID; + UnscheduledDepsInBundle = UnscheduledDeps; + clearDependencies(); + } + + Instruction *getInst() const { + return Inst; + } + + bool isPseudo() const { + return false; + } + + }; + + struct PseudoScheduleData : public ScheduleData { + + PseudoScheduleData() = default; + + InstScheduleData *ISD; + + void init(int BlockSchedulingRegionID, InstScheduleData *OpISD, + Value *OpVal, unsigned OpCode) { + FirstInBundle = this; + NextInBundle = nullptr; + NextLoadStore = OpISD->NextLoadStore; + IsScheduled = false; + SchedulingRegionID = BlockSchedulingRegionID; + UnscheduledDepsInBundle = UnscheduledDeps; + clearDependencies(); + OpValue = OpVal; + ISD = OpISD; + Opcode = OpCode; + } + + Instruction *getInst() const { + return ISD->Inst; + } + + bool isPseudo() const { + return true; + } + Value *OpValue = nullptr; }; @@ -964,7 +1120,8 @@ /// Contains all scheduling data for a basic block. struct BlockScheduling { BlockScheduling(BasicBlock *BB) - : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} + : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), + PseudoChunkSize(BB->size()), PseudoChunkPos(PseudoChunkSize) {} void clear() { ReadyInsts.clear(); @@ -972,6 +1129,7 @@ ScheduleEnd = nullptr; FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; + PseudoInstScheduleDataMap.clear(); // Reduce the maximum schedule region size by the size of the // previous scheduling run. @@ -985,21 +1143,23 @@ ++SchedulingRegionID; } - ScheduleData *getScheduleData(Value *V) { - ScheduleData *SD = ScheduleDataMap[V]; + InstScheduleData *getInstScheduleData(Value *V) { + InstScheduleData *SD = InstScheduleDataMap[V]; if (SD && SD->SchedulingRegionID == SchedulingRegionID) return SD; return nullptr; } - ScheduleData *getScheduleData(Value *V, Value *Key) { - if (V == Key) - return getScheduleData(V); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) { - ScheduleData *SD = I->second[Key]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) - return SD; + ScheduleData *getScheduleData(Value *V, unsigned Opcode) { + ScheduleData *SD = getInstScheduleData(V); + if (SD && SD->Opcode == Opcode) + return SD; + auto I = PseudoInstScheduleDataMap.find(V); + if (I != PseudoInstScheduleDataMap.end()) { + PseudoScheduleData *PSD = I->second[Opcode]; + if (PSD && PSD->SchedulingRegionID == SchedulingRegionID && + PSD->Opcode == Opcode) + return PSD; } return nullptr; } @@ -1016,13 +1176,11 @@ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; while (BundleMember) { - if (BundleMember->Inst != BundleMember->OpValue) { - BundleMember = BundleMember->NextInBundle; - continue; - } + assert(BundleMember->Opcode == Opcode && "Corrupt bundle member"); // Handle the def-use chain dependencies. 
- for (Use &U : BundleMember->Inst->operands()) { + for (Use &U : BundleMember->getInst()->operands()) { auto *I = dyn_cast(U.get()); if (!I) continue; @@ -1060,13 +1218,23 @@ void doForAllOpcodes(Value *V, function_ref Action) { - if (ScheduleData *SD = getScheduleData(V)) - Action(SD); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) - for (auto &P : I->second) - if (P.second->SchedulingRegionID == SchedulingRegionID) - Action(P.second); + bool Found = false; + auto I = PseudoInstScheduleDataMap.find(V); + if (I != PseudoInstScheduleDataMap.end()) { + for (auto &P : I->second) { + ScheduleData *SD = P.second; + if (SD && SD->isPartOfBundle() && + SD->SchedulingRegionID == SchedulingRegionID) { + Found = true; + Action(SD); + } + } + } + if (ScheduleData *SD = getInstScheduleData(V)) { + if (!Found || SD->isPartOfBundle()) { + Action(SD); + } + } } /// Put all instructions into the ReadyList which are ready for scheduling. @@ -1090,20 +1258,22 @@ const InstructionsState &S); /// Un-bundles a group of instructions. - void cancelScheduling(ArrayRef VL, Value *OpValue); + void cancelScheduling(Value *OpValue, unsigned Opcode); /// Allocates schedule data chunk. - ScheduleData *allocateScheduleDataChunks(); + InstScheduleData *allocateInstScheduleDataChunks(); + + PseudoScheduleData *allocatePseudoInstDataChunks(); /// Extends the scheduling region so that V is inside the region. /// \returns true if the region size is within the limit. bool extendSchedulingRegion(Value *V, const InstructionsState &S); - /// Initialize the ScheduleData structures for new instructions in the + /// Initialize the InstScheduleData structures for new instructions in the /// scheduling region. void initScheduleData(Instruction *FromI, Instruction *ToI, - ScheduleData *PrevLoadStore, - ScheduleData *NextLoadStore); + InstScheduleData *PrevLoadStore, + InstScheduleData *NextLoadStore); /// Updates the dependency information of a bundle and of all instructions/ /// bundles which depend on the original bundle. @@ -1115,24 +1285,30 @@ BasicBlock *BB; - /// Simple memory allocation for ScheduleData. - std::vector> ScheduleDataChunks; + /// Simple memory allocation for InstScheduleData. + std::vector> InstScheduleDataChunks; + + std::vector> PseudoScheduleDataChunks; - /// The size of a ScheduleData array in ScheduleDataChunks. + /// The size of a InstScheduleData array in InstScheduleDataChunks. int ChunkSize; /// The allocator position in the current chunk, which is the last entry - /// of ScheduleDataChunks. + /// of InstScheduleDataChunks. int ChunkPos; - /// Attaches ScheduleData to Instruction. + int PseudoChunkSize; + + int PseudoChunkPos; + + /// Attaches InstScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. - /// ScheduleData structures are recycled. - DenseMap ScheduleDataMap; + /// InstScheduleData structures are recycled. + DenseMap InstScheduleDataMap; - /// Attaches ScheduleData to Instruction with the leading key. - DenseMap> - ExtraScheduleDataMap; + /// Attaches InstScheduleData to Instruction with the leading key. + DenseMap> + PseudoInstScheduleDataMap; struct ReadyList : SmallVector { void insert(ScheduleData *SD) { push_back(SD); } @@ -1149,11 +1325,11 @@ /// The first memory accessing instruction in the scheduling region /// (can be null). 
- ScheduleData *FirstLoadStoreInRegion = nullptr; + InstScheduleData *FirstLoadStoreInRegion = nullptr; /// The last memory accessing instruction in the scheduling region /// (can be null). - ScheduleData *LastLoadStoreInRegion = nullptr; + InstScheduleData *LastLoadStoreInRegion = nullptr; /// The current size of the scheduling region. int ScheduleRegionSize = 0; @@ -1162,9 +1338,9 @@ int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; /// The ID of the scheduling region. For a new vectorization iteration this - /// is incremented which "removes" all ScheduleData from the region. + /// is incremented which "removes" all InstScheduleData from the region. // Make sure that the initial SchedulingRegionID is greater than the - // initial SchedulingRegionID in ScheduleData (which is 0). + // initial SchedulingRegionID in InstScheduleData (which is 0). int SchedulingRegionID = 1; }; @@ -1345,6 +1521,8 @@ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; int FoundLane = Lane; + if (!Entry->State.isOpcodeOrAlt(cast(Scalar))) + continue; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), @@ -1392,6 +1570,34 @@ } } +static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) { + switch(Opcode) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + return ConstantInt::getNullValue(Ty); + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + return ConstantInt::get(Ty, /*V=*/1); + case Instruction::FAdd: + case Instruction::FSub: + return ConstantFP::get(Ty, /*V=*/0.0); + case Instruction::FMul: + case Instruction::FDiv: + return ConstantFP::get(Ty, /*V=*/1.0); + case Instruction::And: + return ConstantInt::getAllOnesValue(Ty); + default: + break; + } + llvm_unreachable("unknown binop for default constant value"); +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, int UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -1399,28 +1605,28 @@ InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. 
\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } @@ -1432,7 +1638,7 @@ if (EphValues.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1442,7 +1648,7 @@ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -1458,10 +1664,10 @@ auto *I = dyn_cast(VL[i]); if (!I) continue; - if (getTreeEntry(I)) { + if (getTreeEntry(VL[i]) || getTreeEntry(VL[i], S.getOpcode())) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1471,7 +1677,7 @@ for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1485,7 +1691,7 @@ // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } @@ -1505,7 +1711,7 @@ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } VL = UniqueValues; @@ -1519,10 +1725,10 @@ if (!BS.tryScheduleBundle(VL, this, S)) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - assert((!BS.getScheduleData(VL0) || - !BS.getScheduleData(VL0)->isPartOfBundle()) && + assert((!BS.getScheduleData(VL0, S.getOpcode()) || + !BS.getScheduleData(VL0, S.getOpcode())->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1542,13 +1748,13 @@ LLVM_DEBUG( dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1569,7 +1775,7 @@ if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies); return; } @@ -1586,13 +1792,15 @@ auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, 
/*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); - BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, S, + ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); return; } case Instruction::Load: { @@ -1606,8 +1814,8 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1619,8 +1827,8 @@ for (Value *V : VL) { auto *L = cast(V); if (!L->isSimple()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1650,14 +1858,14 @@ if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies, I->getFirst()); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } @@ -1666,8 +1874,8 @@ } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -1686,14 +1894,14 @@ for (unsigned i = 0; i < VL.size(); ++i) { Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1715,15 +1923,15 @@ CmpInst *Cmp = cast(VL[i]); if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i 
= 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1755,7 +1963,7 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1771,10 +1979,18 @@ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (I->getOpcode() == S.getOpcode()) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Operands.push_back(VecOp); + } + if (allSameType(Operands)) + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; @@ -1783,8 +1999,8 @@ for (unsigned j = 0; j < VL.size(); ++j) { if (cast(VL[j])->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } @@ -1797,8 +2013,8 @@ if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } @@ -1809,13 +2025,13 @@ if (!isa(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1831,13 +2047,13 @@ // Check if the stores are consecutive or of we need to swizzle them. 
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1854,8 +2070,8 @@ // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1868,8 +2084,8 @@ if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1879,8 +2095,8 @@ if (hasVectorInstrinsicScalarOpd(ID, 1)) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << A1I << "!=" << A1J << "\n"); return; @@ -1891,23 +2107,31 @@ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) { - CallInst *CI2 = dyn_cast(j); - Operands.push_back(CI2->getArgOperand(i)); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (S.isOpcodeOrAlt(I)) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = getDefaultConstantForOpcode(S.getOpcode(), I->getType()); + Operands.push_back(Operand); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + if (allSameType(Operands)) + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1915,12 +2139,12 @@ // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. 
if (!S.isAltShuffle()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. @@ -1935,16 +2159,25 @@ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (S.isOpcodeOrAlt(I)) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = getDefaultConstantForOpcode(S.getOpcode(), I->getType()); + Operands.push_back(Operand); + } buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; default: - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getOpcode()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2102,11 +2335,10 @@ } return ReuseShuffleCost + getGatherCost(VL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + assert(E->State.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + auto *VL0 = cast(E->State.OpValue); + unsigned ShuffleOrOp = E->State.isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : E->State.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2192,7 +2424,7 @@ case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(E->State.getOpcode(), ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } @@ -2205,7 +2437,8 @@ // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(E->State.getOpcode(), VecTy, + SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2213,14 +2446,16 @@ case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. 
- int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, + int ScalarEltCost = TTI->getCmpSelInstrCost(E->State.getOpcode(), + ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(E->State.getOpcode(), VecTy, + MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Add: @@ -2246,7 +2481,7 @@ TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; + TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueProperties Op1VP = TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = @@ -2257,35 +2492,40 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. - ConstantInt *CInt0 = nullptr; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - const Instruction *I = cast(VL[i]); - ConstantInt *CInt = dyn_cast(I->getOperand(1)); - if (!CInt) { - Op2VK = TargetTransformInfo::OK_AnyValue; - Op2VP = TargetTransformInfo::OP_None; - break; - } - if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_None; - if (i == 0) { - CInt0 = CInt; - continue; + if (auto *CInt = dyn_cast(VL0->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + const unsigned Opcode = E->State.getOpcode(); + for (auto *V : VL) { + auto *I = cast(V); + if (I == VL0 || Opcode != I->getOpcode()) + continue; + if (!isa(I->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_AnyValue; + Op2VP = TargetTransformInfo::OP_None; + break; + } + ConstantInt *CInt_cur = cast(I->getOperand(1)); + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && + CInt != cast(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + if (Op2VP == TargetTransformInfo::OP_PowerOf2 && + !CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_None; + if (CInt != CInt_cur) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } - if (CInt0 != CInt) - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } SmallVector Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->State.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost(E->State.getOpcode(), VecTy, + Op1VK, Op2VK, Op1VP, Op2VP, + Operands); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -2366,11 +2606,11 @@ return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - 
Instruction::isCast(S.getAltOpcode()))) && + assert(E->State.isAltShuffle() && + ((Instruction::isBinaryOp(E->State.getOpcode()) && + Instruction::isBinaryOp(E->State.getAltOpcode())) || + (Instruction::isCast(E->State.getOpcode()) && + Instruction::isCast(E->State.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { @@ -2387,23 +2627,22 @@ } for (Value *i : VL) { Instruction *I = cast(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(E->State.getOpcode())) { + VecCost = TTI->getArithmeticInstrCost(E->State.getOpcode(), VecTy); + VecCost += TTI->getArithmeticInstrCost(E->State.getAltOpcode(), VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); + Type *Src0SclTy = E->State.MainOp->getOperand(0)->getType(); + Type *Src1SclTy = E->State.AltOp->getOperand(0)->getType(); VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + VecCost = TTI->getCastInstrCost(E->State.getOpcode(), VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(E->State.getAltOpcode(), VecTy, Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -2469,7 +2708,7 @@ Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast(N.Scalars[0]); + Instruction *Inst = dyn_cast(N.State.OpValue); if (!Inst) continue; @@ -2654,9 +2893,13 @@ // Push left and right operands of binary operation into Left and Right for (Value *V : VL) { auto *I = cast(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); + if (S.isOpcodeOrAlt(I)) { + Left.push_back(I->getOperand(0)); + Right.push_back(I->getOperand(1)); + } else { + Left.push_back(I); + Right.push_back(getDefaultConstantForOpcode(S.getOpcode(), I->getType())); + } } // Reorder if we have a commutative operation and consecutive access @@ -2705,8 +2948,13 @@ int i, unsigned Opcode, Instruction &I, ArrayRef Left, ArrayRef Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { - VLeft = I.getOperand(0); - VRight = I.getOperand(1); + if (I.getOpcode() == Opcode) { + VLeft = I.getOperand(0); + VRight = I.getOperand(1); + } else { + VLeft = &I; + VRight = getDefaultConstantForOpcode(Opcode, I.getType()); + } // If we have "SplatRight", try to see if commuting is needed to preserve it. if (SplatRight) { if (VRight == Right[i - 1]) @@ -2770,8 +3018,15 @@ // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. 
auto *I = cast(VL[0]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); + Value *VLeft; + Value *VRight; + if (I->getOpcode() == Opcode) { + VLeft = I->getOperand(0); + VRight = I->getOperand(1); + } else { + VLeft = I; + VRight = getDefaultConstantForOpcode(Opcode, I->getType()); + } if (!isa(VRight) && isa(VLeft)) // Favor having instruction to the right. FIXME: why? std::swap(VLeft, VRight); @@ -2869,17 +3124,14 @@ // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; - // Find the last instruction. The common case should be that BB has been - // scheduled, and the last instruction is VL.back(). So we start with - // VL.back() and iterate over schedule data until we reach the end of the - // bundle. The end of the bundle is marked by null ScheduleData. + // Find the last instruction. If the bundle is not scheduled then + // the first in the bundle is the last one in BB, because we discover + // bundles in backward walk. if (BlocksSchedules.count(BB)) { auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlocksSchedules[BB]->getInstScheduleData(isOneOf(S, VL.back())); if (Bundle && Bundle->isPartOfBundle()) - for (; Bundle; Bundle = Bundle->NextInBundle) - if (Bundle->OpValue == Bundle->Inst) - LastInst = Bundle->Inst; + LastInst = Bundle->FirstInBundle->getInst(); } // LastInst can still be null at this point if there's either not an entry @@ -2953,7 +3205,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { InstructionsState S = getSameOpcode(VL); if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { + if (TreeEntry *E = getTreeEntry(S.OpValue, S.getOpcode())) { if (E->isSame(VL)) { Value *V = vectorizeTree(E); if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { @@ -3026,12 +3278,12 @@ IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " + << *E->State.OpValue << ".\n"); return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast(S.OpValue); + auto *VL0 = cast(E->State.OpValue); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3040,7 +3292,7 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3054,8 +3306,8 @@ return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = E->State.isAltShuffle() ? 
+ (unsigned) Instruction::ShuffleVector : E->State.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -3117,7 +3369,7 @@ E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3152,7 +3404,7 @@ E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3181,7 +3433,7 @@ for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *InVec = vectorizeTree(INVL); @@ -3208,7 +3460,7 @@ RHSV.push_back(cast(V)->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *L = vectorizeTree(LHSV); Value *R = vectorizeTree(RHSV); @@ -3220,7 +3472,7 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->State.getOpcode() == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3242,7 +3494,7 @@ FalseVec.push_back(cast(V)->getOperand(2)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *Cond = vectorizeTree(CondVec); Value *True = vectorizeTree(TrueVec); @@ -3282,16 +3534,22 @@ case Instruction::Xor: { ValueList LHSVL, RHSVL; if (isa(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, - RHSVL); + reorderInputsAccordingToOpcode(E->State.getOpcode(), E->Scalars, + LHSVL, RHSVL); else for (Value *V : E->Scalars) { auto *I = cast(V); - LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); + if (I->getOpcode() == E->State.getOpcode()) { + LHSVL.push_back(I->getOperand(0)); + RHSVL.push_back(I->getOperand(1)); + } else { + LHSVL.push_back(V); + RHSVL.push_back( + getDefaultConstantForOpcode(E->State.getOpcode(), I->getType())); + } } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); @@ -3302,7 +3560,7 @@ } Value *V = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(VL0->getOpcode()), LHS, RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); @@ -3321,10 +3579,12 @@ // sink them all the way down past store instructions. 
bool IsReorder = !E->ReorderIndices.empty(); if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); + InstructionsState S = getSameOpcode(E->Scalars, + E->ReorderIndices.front()); VL0 = cast(S.OpValue); - } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, S); + } else + setInsertPointAfterBundle(E->Scalars, E->State); LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3371,7 +3631,7 @@ for (Value *V : E->Scalars) ScalarStoreValues.push_back(cast(V)->getValueOperand()); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *VecValue = vectorizeTree(ScalarStoreValues); Value *ScalarPtr = SI->getPointerOperand(); @@ -3398,7 +3658,7 @@ return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); ValueList Op0VL; for (Value *V : E->Scalars) @@ -3433,7 +3693,7 @@ } case Instruction::Call: { CallInst *CI = cast(VL0); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; @@ -3486,24 +3746,24 @@ } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->State.isAltShuffle() && + ((Instruction::isBinaryOp(E->State.getOpcode()) && + Instruction::isBinaryOp(E->State.getAltOpcode())) || + (Instruction::isCast(E->State.getOpcode()) && + Instruction::isCast(E->State.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); - setInsertPointAfterBundle(E->Scalars, S); + if (Instruction::isBinaryOp(E->State.getOpcode())) { + reorderAltShuffleOperands(E->State, E->Scalars, LHSVL, RHSVL); + setInsertPointAfterBundle(E->Scalars, E->State); LHS = vectorizeTree(LHSVL); RHS = vectorizeTree(RHSVL); } else { ValueList INVL; for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); LHS = vectorizeTree(INVL); } @@ -3513,16 +3773,16 @@ } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->State.getOpcode())) { V0 = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(E->State.getOpcode()), LHS, RHS); V1 = Builder.CreateBinOp( - static_cast(S.getAltOpcode()), LHS, RHS); + static_cast(E->State.getAltOpcode()), LHS, RHS); } else { V0 = Builder.CreateCast( - static_cast(S.getOpcode()), LHS, VecTy); + static_cast(E->State.getOpcode()), LHS, VecTy); V1 = Builder.CreateCast( - static_cast(S.getAltOpcode()), LHS, VecTy); + static_cast(E->State.getAltOpcode()), LHS, VecTy); } // Create shuffle to take alternate operations from the vector. 
@@ -3533,8 +3793,7 @@ SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + if (OpInst->getOpcode() == E->State.getAltOpcode()) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -3544,8 +3803,10 @@ } Value *ShuffleMask = ConstantVector::get(Mask); - propagateIRFlags(V0, OpScalars); - propagateIRFlags(V1, AltScalars); + InstructionsState S = getSameOpcode(OpScalars); + propagateIRFlags(V0, OpScalars, S.OpValue); + S = getSameOpcode(AltScalars); + propagateIRFlags(V1, AltScalars, S.OpValue); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); if (Instruction *I = dyn_cast(V)) @@ -3583,7 +3844,7 @@ // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].State.OpValue; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); @@ -3698,6 +3959,9 @@ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (!Entry->State.isOpcodeOrAlt(cast(Scalar))) + continue; + Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { #ifndef NDEBUG @@ -3828,9 +4092,14 @@ } for (Value *V : VL) { - ScheduleData *BundleMember = getScheduleData(V); + ScheduleData *BundleMember = getInstScheduleData(V); + if (BundleMember->isPartOfBundle()) + BundleMember = getScheduleData(V, S.getOpcode()); + if (BundleMember->isPartOfBundle()) + return false; assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); + assert(!BundleMember->isPartOfBundle() && "Already part of another bundle"); if (BundleMember->IsScheduled) { // A bundle member was scheduled as single instruction before and now // needs to be scheduled as part of the bundle. We just get rid of the @@ -3847,6 +4116,7 @@ Bundle = BundleMember; } BundleMember->UnscheduledDepsInBundle = 0; + BundleMember->Opcode = S.getOpcode(); Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps; // Group the instructions to a bundle. @@ -3890,18 +4160,27 @@ } } if (!Bundle->isReady()) { - cancelScheduling(VL, S.OpValue); + cancelScheduling(S.OpValue, S.getOpcode()); + // We have to clear all dependencies, since all values + // were calculated for the vectorized bundle. + for (auto *I = ScheduleStart; I != ScheduleEnd; + I = I->getNextNode()) { + doForAllOpcodes(I, [](ScheduleData *SD) { + SD->clearDependencies(); + }); + } + resetSchedule(); return false; } return true; } -void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, - Value *OpValue) { +void BoUpSLP::BlockScheduling::cancelScheduling(Value *OpValue, + unsigned Opcode) { if (isa(OpValue)) return; - - ScheduleData *Bundle = getScheduleData(OpValue); + ScheduleData *Bundle = getScheduleData(OpValue, Opcode)->FirstInBundle; + assert(Bundle && "Counld not find bundle"); LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); @@ -3911,44 +4190,66 @@ // Un-bundle: make single instructions out of the bundle. 
ScheduleData *BundleMember = Bundle; while (BundleMember) { - assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); + assert(BundleMember->FirstInBundle == Bundle && "Corrupt bundle links"); + assert(BundleMember->Opcode == Opcode && "Corrupt bundle"); BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; - if (BundleMember->UnscheduledDepsInBundle == 0) { - ReadyInsts.insert(BundleMember); + if (BundleMember->isPseudo()) { + PseudoInstScheduleDataMap[BundleMember->getInst()].erase( + BundleMember->Opcode); + BundleMember->Opcode = 0; + } else { + BundleMember->Opcode = 0; + if (BundleMember->UnscheduledDepsInBundle == 0) { + ReadyInsts.insert(BundleMember); + } } BundleMember = Next; } } -BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { - // Allocate a new ScheduleData for the instruction. +BoUpSLP::InstScheduleData * +BoUpSLP::BlockScheduling::allocateInstScheduleDataChunks() { + // Allocate a new InstScheduleData for the instruction. if (ChunkPos >= ChunkSize) { - ScheduleDataChunks.push_back(llvm::make_unique(ChunkSize)); + InstScheduleDataChunks.push_back( + llvm::make_unique(ChunkSize)); ChunkPos = 0; } - return &(ScheduleDataChunks.back()[ChunkPos++]); + return &(InstScheduleDataChunks.back()[ChunkPos++]); +} + +BoUpSLP::PseudoScheduleData * +BoUpSLP::BlockScheduling::allocatePseudoInstDataChunks() { + // Allocate a new PseudoScheduleData for the instruction. + if (PseudoChunkPos >= PseudoChunkSize) { + PseudoScheduleDataChunks.push_back( + llvm::make_unique(PseudoChunkSize)); + PseudoChunkPos = 0; + } + return &(PseudoScheduleDataChunks.back()[PseudoChunkPos++]); } bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, const InstructionsState &S) { - if (getScheduleData(V, isOneOf(S, V))) + if (getScheduleData(V, S.getOpcode())) return true; Instruction *I = dyn_cast(V); assert(I && "bundle member must be an instruction"); assert(!isa(I) && "phi nodes don't need to be scheduled"); auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { - ScheduleData *ISD = getScheduleData(I); + InstScheduleData *ISD = getInstScheduleData(I); if (!ISD) return false; assert(isInSchedulingRegion(ISD) && - "ScheduleData not in scheduling region"); - ScheduleData *SD = allocateScheduleDataChunks(); - SD->Inst = I; - SD->init(SchedulingRegionID, S.OpValue); - ExtraScheduleDataMap[I][S.OpValue] = SD; + "InstScheduleData not in scheduling region"); + if (ISD->isPartOfBundle()) { + PseudoScheduleData *PSD = allocatePseudoInstDataChunks(); + PSD->init(SchedulingRegionID, ISD, S.OpValue, S.getOpcode()); + PseudoInstScheduleDataMap[I][S.getOpcode()] = PSD; + } return true; }; if (CheckSheduleForI(I)) @@ -3958,8 +4259,7 @@ initScheduleData(I, I->getNextNode(), nullptr, nullptr); ScheduleStart = I; ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); return true; @@ -3981,8 +4281,7 @@ if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckSheduleForI(I); LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); return true; @@ -3994,8 +4293,7 @@ initScheduleData(ScheduleEnd, 
I->getNextNode(), LastLoadStoreInRegion, nullptr); ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); @@ -4009,21 +4307,20 @@ return true; } -void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, - Instruction *ToI, - ScheduleData *PrevLoadStore, - ScheduleData *NextLoadStore) { - ScheduleData *CurrentLoadStore = PrevLoadStore; +void BoUpSLP::BlockScheduling::initScheduleData( + Instruction *FromI, Instruction *ToI, InstScheduleData *PrevLoadStore, + InstScheduleData *NextLoadStore) { + InstScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { - ScheduleData *SD = ScheduleDataMap[I]; + InstScheduleData *SD = InstScheduleDataMap[I]; if (!SD) { - SD = allocateScheduleDataChunks(); - ScheduleDataMap[I] = SD; + SD = allocateInstScheduleDataChunks(); + InstScheduleDataMap[I] = SD; SD->Inst = I; } assert(!isInSchedulingRegion(SD) && - "new ScheduleData already in scheduling region"); - SD->init(SchedulingRegionID, I); + "new InstScheduleData already in scheduling region"); + SD->init(SchedulingRegionID); if (I->mayReadOrWriteMemory() && (!isa(I) || @@ -4058,8 +4355,11 @@ WorkList.pop_back(); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; while (BundleMember) { assert(isInSchedulingRegion(BundleMember)); + assert(BundleMember->Opcode == Opcode && "Corrupt bundle member"); + if (!BundleMember->hasValidDependencies()) { LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember @@ -4068,44 +4368,31 @@ BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. - if (BundleMember->OpValue != BundleMember->Inst) { - ScheduleData *UseSD = getScheduleData(BundleMember->Inst); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - for (User *U : BundleMember->Inst->users()) { - if (isa(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and - // eventually disable vectorization. + for (User *U : BundleMember->getInst()->users()) { + if (isa(U)) { + doForAllOpcodes(U, [&BundleMember, &WorkList](ScheduleData *UseSD) { BundleMember->Dependencies++; - BundleMember->incrementUnscheduledDeps(1); - } + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + }); + } else { + // I'm not sure if this can ever happen. But we need to be safe. + // This lets the instruction/bundle never be scheduled and + // eventually disable vectorization. + BundleMember->Dependencies++; + BundleMember->incrementUnscheduledDeps(1); } } // Handle the memory dependencies. 
         ScheduleData *DepDest = BundleMember->NextLoadStore;
         if (DepDest) {
-          Instruction *SrcInst = BundleMember->Inst;
+          Instruction *SrcInst = BundleMember->getInst();
           MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
-          bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+          bool SrcMayWrite = SrcInst->mayWriteToMemory();
           unsigned numAliased = 0;
           unsigned DistToSrc = 1;
@@ -4120,24 +4407,31 @@
             // It's important for the loop break condition (see below) to
             // check this limit even between two read-only instructions.
             if (DistToSrc >= MaxMemDepDistance ||
-                ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+                ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
                  (numAliased >= AliasedCheckLimit ||
-                  SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+                  SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {

               // We increment the counter only if the locations are aliased
               // (instead of counting all alias checks). This gives a better
               // balance between reduced runtime and accurate dependencies.
               numAliased++;

-              DepDest->MemoryDependencies.push_back(BundleMember);
-              BundleMember->Dependencies++;
-              ScheduleData *DestBundle = DepDest->FirstInBundle;
-              if (!DestBundle->IsScheduled) {
-                BundleMember->incrementUnscheduledDeps(1);
-              }
-              if (!DestBundle->hasValidDependencies()) {
-                WorkList.push_back(DestBundle);
-              }
+              // We don't want any duplicates in the set, to keep the
+              // dependencies correct.
+              doForAllOpcodes(DepDest->getInst(), [&BundleMember, &WorkList](
+                                                      ScheduleData *DepDest) {
+                if (DepDest->MemoryDependencies.count(BundleMember) == 0) {
+                  DepDest->MemoryDependencies.insert(BundleMember);
+                  BundleMember->Dependencies++;
+                  ScheduleData *DestBundle = DepDest->FirstInBundle;
+                  if (!DestBundle->IsScheduled) {
+                    BundleMember->incrementUnscheduledDeps(1);
+                  }
+                  if (!DestBundle->hasValidDependencies()) {
+                    WorkList.push_back(DestBundle);
+                  }
+                }
+              });
             }
             DepDest = DepDest->NextLoadStore;
@@ -4164,7 +4458,7 @@
     }
     if (InsertInReadyList && SD->isReady()) {
       ReadyInsts.push_back(SD);
-      LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
+      LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->getInst()
                         << "\n");
     }
   }
@@ -4176,7 +4470,7 @@
   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
     doForAllOpcodes(I, [&](ScheduleData *SD) {
       assert(isInSchedulingRegion(SD) &&
-             "ScheduleData not in scheduling region");
+             "InstScheduleData not in scheduling region");
       SD->IsScheduled = false;
       SD->resetUnscheduledDeps();
     });
@@ -4210,7 +4504,7 @@
        I = I->getNextNode()) {
     BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
       assert(SD->isPartOfBundle() ==
-                 (getTreeEntry(SD->Inst) != nullptr) &&
+                 (getTreeEntry(SD->getInst(), SD->Opcode) != nullptr) &&
              "scheduler and vectorizer bundle mismatch");
       SD->FirstInBundle->SchedulingPriority = Idx++;
       if (SD->isSchedulingEntity()) {
@@ -4231,20 +4525,31 @@
     // Move the scheduled instruction(s) to their dedicated places, if not
     // there yet.
     ScheduleData *BundleMember = picked;
+    unsigned Opcode = BundleMember->Opcode;
     while (BundleMember) {
-      Instruction *pickedInst = BundleMember->Inst;
-      if (LastScheduledInst->getNextNode() != pickedInst) {
-        BS->BB->getInstList().remove(pickedInst);
+      assert(Opcode == BundleMember->Opcode && "Corrupt bundle member");
+      Instruction *PickedInst = BundleMember->getInst();
+      if (LastScheduledInst->getNextNode() != PickedInst) {
+        BS->BB->getInstList().remove(PickedInst);
         BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
-                                     pickedInst);
+                                     PickedInst);
       }
-      LastScheduledInst = pickedInst;
+      LastScheduledInst = PickedInst;
       BundleMember = BundleMember->NextInBundle;
     }
-    BS->schedule(picked, ReadyInsts);
     NumToSchedule--;
   }
+#ifndef NDEBUG
+  if (NumToSchedule != 0) {
+    for (BasicBlock::iterator I = BS->BB->begin(), E = BS->BB->end(); I != E;
+         ++I) {
+      BS->doForAllOpcodes(&*I, [](ScheduleData *SD) {
+        if (SD->isSchedulingEntity() && SD->UnscheduledDepsInBundle != 0)
+          LLVM_DEBUG(dbgs() << "SLP: Failed to schedule: " << *SD << ".\n");
+      });
+    }
+  }
+#endif
   assert(NumToSchedule == 0 && "could not schedule all instructions");

   // Avoid duplicate scheduling of the block.
@@ -4865,6 +5170,10 @@
   InstructionsState S = getSameOpcode(VL);
   if (!S.getOpcode())
     return false;
+  for (Value *V : VL) {
+    if (isOneOf(S, V) != V)
+      return false;
+  }

   Instruction *I0 = cast<Instruction>(S.OpValue);
   unsigned Sz = R.getVectorElementSize(I0);
Index: test/Transforms/SLPVectorizer/X86/cancel_scheduling.ll
===================================================================
--- /dev/null
+++ test/Transforms/SLPVectorizer/X86/cancel_scheduling.ll
@@ -0,0 +1,215 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test case shows bundle scheduling failing after cancelScheduling() is
+; called from tryScheduleBundle() without cleaning up all dependencies. The
+; dependency values must be cleared, since everything was already calculated
+; before the bundle was cancelled.
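As an aside for readers tracking the scheduling changes above, the sketch below is a minimal, self-contained illustration of the cleanup the new "Corrupt bundle" asserts guard. It is not code from this patch; MiniScheduleData and cancelBundle are hypothetical stand-ins for the BlockScheduling bookkeeping. The idea: when a bundle is cancelled, every member is unlinked, its private dependency count restored, and its opcode tag reset, so a later tryScheduleBundle() call cannot observe stale state, which appears to be exactly what the test below was reduced to reproduce.

#include <cassert>

// Hypothetical, simplified stand-in for the per-instruction scheduling node.
struct MiniScheduleData {
  MiniScheduleData *FirstInBundle = nullptr;
  MiniScheduleData *NextInBundle = nullptr;
  unsigned Opcode = 0;             // opcode tag the bundle was formed under
  int UnscheduledDeps = 0;         // dependencies of this member alone
  int UnscheduledDepsInBundle = 0; // dependencies of the whole bundle
};

// Cancelling a bundle: unlink every member, restore its own dependency
// count, and drop the opcode tag so nothing leaks into the next attempt.
void cancelBundle(MiniScheduleData *Bundle) {
  for (MiniScheduleData *Member = Bundle; Member;) {
    assert(Member->FirstInBundle == Bundle && "Corrupt bundle links");
    MiniScheduleData *Next = Member->NextInBundle;
    Member->FirstInBundle = Member; // each member is its own bundle again
    Member->NextInBundle = nullptr;
    Member->UnscheduledDepsInBundle = Member->UnscheduledDeps;
    Member->Opcode = 0;             // forget the cancelled bundle's opcode
    Member = Next;
  }
}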
+ +define dso_local void @fn1() local_unnamed_addr #0 { +; CHECK-LABEL: @fn1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 2 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 0 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 3 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 17 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX5]], align 2 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 16 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 15 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX7]], align 2 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 12 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX8]], align 2 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 11 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX9]], align 2 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 10 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX10]], align 2 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 9 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX11]], align 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 8 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX12]], align 2 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 7 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 6 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 5 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[ARRAYIDX2]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> , <8 x i16>* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* inttoptr (i64 1 to i8*), align 1 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 18 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i8 [[TMP1]], 2 +; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 19 +; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP1]], 2 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 20 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* inttoptr (i64 2 to i8*), align 2 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i8 [[TMP4]], 4 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 21 +; CHECK-NEXT: [[TMP6:%.*]] = lshr i8 [[TMP4]], 1 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 22 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 23 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* inttoptr (i64 3 to i8*), align 1 +; CHECK-NEXT: [[TMP8:%.*]] = lshr i8 [[TMP7]], 3 +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 24 +; CHECK-NEXT: [[ARRAYIDX54:%.*]] = getelementptr inbounds [4 x 
i16], [4 x i16]* undef, i64 0, i64 25 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* inttoptr (i64 4 to i8*), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = lshr i8 [[TMP9]], 4 +; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 26 +; CHECK-NEXT: [[TMP11:%.*]] = lshr i8 [[TMP9]], 1 +; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 27 +; CHECK-NEXT: [[ARRAYIDX69:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 28 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* inttoptr (i64 5 to i8*), align 1 +; CHECK-NEXT: [[TMP13:%.*]] = lshr i8 [[TMP12]], 3 +; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 29 +; CHECK-NEXT: [[ARRAYIDX79:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 30 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* inttoptr (i64 6 to i8*), align 2 +; CHECK-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 31 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i8 [[TMP14]], 2 +; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 32 +; CHECK-NEXT: [[TMP16:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = or i8 [[TMP3]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i8> [[TMP18]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> [[TMP19]], i8 [[TMP17]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i8> [[TMP20]], i8 [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP6]], i32 4 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[TMP4]], i32 5 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[TMP8]], i32 6 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[TMP7]], i32 7 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP25]], i8 [[TMP10]], i32 8 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP11]], i32 9 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP27]], i8 [[TMP9]], i32 10 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[TMP13]], i32 11 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[TMP12]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[TMP14]], i32 13 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[TMP15]], i32 14 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[TMP16]], i32 15 +; CHECK-NEXT: [[TMP34:%.*]] = ashr <16 x i8> [[TMP33]], +; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i8> [[TMP33]], +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[TMP36]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = zext <16 x i8> [[TMP36]] to <16 x i16> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP37]], <16 x i16> [[TMP38]], <16 x i32> +; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 33 +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i16* [[ARRAYIDX17]] to <16 x i16>* +; CHECK-NEXT: store <16 x i16> [[TMP39]], <16 x i16>* [[TMP40]], align 2 +; CHECK-NEXT: ret void +; +entry: + %arrayidx = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 2 + store i16 2, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds [4 x 
i16], [4 x i16]* undef, i64 0, i64 1 + store i16 2, i16* %arrayidx1, align 2 + %arrayidx2 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 0 + store i16 2, i16* %arrayidx2, align 2 + %arrayidx3 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 4 + store i16 0, i16* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 3 + store i16 0, i16* %arrayidx4, align 2 + %arrayidx5 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 17 + store i16 7, i16* %arrayidx5, align 2 + %arrayidx6 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 16 + store i16 7, i16* %arrayidx6, align 2 + %arrayidx7 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 15 + store i16 7, i16* %arrayidx7, align 2 + %arrayidx8 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 12 + store i16 7, i16* %arrayidx8, align 2 + %arrayidx9 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 11 + store i16 7, i16* %arrayidx9, align 2 + %arrayidx10 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 10 + store i16 7, i16* %arrayidx10, align 2 + %arrayidx11 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 9 + store i16 7, i16* %arrayidx11, align 2 + %arrayidx12 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 8 + store i16 7, i16* %arrayidx12, align 2 + %arrayidx13 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 7 + store i16 7, i16* %arrayidx13, align 2 + %arrayidx14 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 6 + store i16 7, i16* %arrayidx14, align 2 + %arrayidx15 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 5 + store i16 7, i16* %arrayidx15, align 2 + %0 = load i8, i8* inttoptr (i64 1 to i8*), align 1 + %1 = ashr i8 %0, 7 + %conv16 = sext i8 %1 to i16 + %arrayidx17 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 18 + store i16 %conv16, i16* %arrayidx17, align 2 + %2 = lshr i8 %0, 2 + %3 = and i8 %2, 7 + %conv20 = zext i8 %3 to i16 + %arrayidx21 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 19 + store i16 %conv20, i16* %arrayidx21, align 2 + %4 = and i8 %0, 2 + %arrayidx26 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 20 + %5 = or i8 %4, 1 + %conv29 = zext i8 %5 to i16 + store i16 %conv29, i16* %arrayidx26, align 2 + %6 = load i8, i8* inttoptr (i64 2 to i8*), align 2 + %7 = lshr i8 %6, 4 + %8 = and i8 %7, 7 + %conv33 = zext i8 %8 to i16 + %arrayidx34 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 21 + store i16 %conv33, i16* %arrayidx34, align 2 + %9 = lshr i8 %6, 1 + %10 = and i8 %9, 7 + %conv38 = zext i8 %10 to i16 + %arrayidx39 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 22 + store i16 %conv38, i16* %arrayidx39, align 2 + %11 = and i8 %6, 2 + %conv43 = zext i8 %11 to i16 + %arrayidx44 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 23 + store i16 %conv43, i16* %arrayidx44, align 2 + %12 = load i8, i8* inttoptr (i64 3 to i8*), align 1 + %13 = lshr i8 %12, 3 + %14 = and i8 %13, 7 + %conv48 = zext i8 %14 to i16 + %arrayidx49 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 24 + store i16 %conv48, i16* %arrayidx49, align 2 + %15 = and i8 %12, 7 + %conv53 = zext i8 %15 to i16 + %arrayidx54 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 25 + store i16 %conv53, i16* %arrayidx54, align 2 + %16 = load i8, i8* inttoptr (i64 4 to i8*), align 4 + %17 = lshr i8 
%16, 4 + %18 = and i8 %17, 7 + %conv58 = zext i8 %18 to i16 + %arrayidx59 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 26 + store i16 %conv58, i16* %arrayidx59, align 2 + %19 = lshr i8 %16, 1 + %20 = and i8 %19, 7 + %conv63 = zext i8 %20 to i16 + %arrayidx64 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 27 + store i16 %conv63, i16* %arrayidx64, align 2 + %21 = and i8 %16, 2 + %conv68 = zext i8 %21 to i16 + %arrayidx69 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 28 + store i16 %conv68, i16* %arrayidx69, align 2 + %22 = load i8, i8* inttoptr (i64 5 to i8*), align 1 + %23 = lshr i8 %22, 3 + %24 = and i8 %23, 7 + %conv73 = zext i8 %24 to i16 + %arrayidx74 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 29 + store i16 %conv73, i16* %arrayidx74, align 2 + %25 = and i8 %22, 7 + %conv78 = zext i8 %25 to i16 + %arrayidx79 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 30 + store i16 %conv78, i16* %arrayidx79, align 2 + %26 = load i8, i8* inttoptr (i64 6 to i8*), align 2 + %27 = and i8 %26, 7 + %conv82 = zext i8 %27 to i16 + %arrayidx83 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 31 + store i16 %conv82, i16* %arrayidx83, align 2 + %28 = lshr i8 %26, 2 + %29 = and i8 %28, 7 + %conv87 = zext i8 %29 to i16 + %arrayidx88 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 32 + store i16 %conv87, i16* %arrayidx88, align 2 + %30 = shl i8 %26, 1 + %31 = and i8 %30, 6 + %conv91 = zext i8 %31 to i16 + %arrayidx92 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 33 + store i16 %conv91, i16* %arrayidx92, align 2 + ret void +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/SLPVectorizer/X86/memory-dep.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/memory-dep.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.anon.1.2.3.4.87 = type { [6 x [6 x i16]], [6 x [6 x i32]], [0 x [4 x [4 x i32]]] } + +@f = external dso_local local_unnamed_addr global %struct.anon.1.2.3.4.87, align 4 + +; Function Attrs: norecurse nounwind uwtable +define dso_local void @itrans() local_unnamed_addr #0 { +; CHECK-LABEL: @itrans( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 undef, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 2), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 
3), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 undef, i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 undef, i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> , [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[TMP6]], undef +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> undef, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i1> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> undef, [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i1> [[TMP12]] to <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast (i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 0) to <4 x i32>*), align 4 +; CHECK-NEXT: ret void +; +entry: + %add8 = add nsw i32 undef, undef + store i32 undef, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 + %add15 = add nsw i32 undef, undef + %add26 = add nsw i32 %add8, 2 + %sub27 = sub i32 %add26, undef + %add33 = add nsw i32 %sub27, undef + %shl = shl i32 %add33, 6 + %cmp.i = icmp slt i32 %shl, 1 + %conv.i = zext i1 %cmp.i to i32 + %cmp1.i = icmp slt i32 undef, %conv.i + %conv2.i = zext i1 %cmp1.i to i32 + store i32 %conv2.i, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 0), align 4 + %add26.1 = add nsw i32 %add15, 2 + %sub27.1 = sub i32 %add26.1, undef + %add33.1 = add nsw i32 %sub27.1, undef + %shl.1 = shl i32 %add33.1, 6 + %cmp.i.1 = icmp slt i32 %shl.1, 1 + %conv.i.1 = zext i1 %cmp.i.1 to i32 + %cmp1.i.1 = icmp slt i32 undef, %conv.i.1 + %conv2.i.1 = zext i1 %cmp1.i.1 to i32 + store i32 %conv2.i.1, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 1), align 4 + %0 = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 2), align 4 + %add26.2 = add nsw i32 %0, 2 + %sub27.2 = sub i32 %add26.2, undef + %add33.2 = add nsw i32 %sub27.2, undef + %shl.2 = shl i32 %add33.2, 6 + %cmp.i.2 = icmp slt i32 %shl.2, 1 + %conv.i.2 = zext i1 %cmp.i.2 to i32 + %cmp1.i.2 = icmp slt i32 undef, %conv.i.2 + %conv2.i.2 = zext i1 %cmp1.i.2 to i32 + store i32 %conv2.i.2, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 2), align 4 + %1 = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 + %add26.3 = add nsw i32 %1, 2 + %sub27.3 = sub i32 %add26.3, undef + %add33.3 = add nsw i32 %sub27.3, undef + %shl.3 = shl i32 %add33.3, 6 + %cmp.i.3 = icmp slt i32 %shl.3, 1 + %conv.i.3 = zext i1 %cmp.i.3 to i32 + %cmp1.i.3 = icmp slt i32 undef, %conv.i.3 + %conv2.i.3 = zext i1 %cmp1.i.3 to i32 + store i32 %conv2.i.3, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 3), align 4 + ret void +} Index: test/Transforms/SLPVectorizer/X86/pr35497.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/pr35497.ll +++ test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -12,20 +12,20 @@ 
define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 { ; CHECK-LABEL: @_ZN1C10SwitchModeEv( ; CHECK-NEXT: for.body.lr.ph.i: -; CHECK-NEXT: [[OR_1:%.*]] = or i64 undef, 1 -; CHECK-NEXT: store i64 [[OR_1]], i64* undef, align 8 +; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> undef, i64 [[BAR5]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: store i64 [[TMP2]], i64* undef, align 8 ; CHECK-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 -; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP1]], [[TMP4]] ; CHECK-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8 ; CHECK-NEXT: ret void ; for.body.lr.ph.i: Index: test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -43,22 +43,16 @@ ; CHECK-LABEL: @add1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] 
= bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -86,22 +80,16 @@ ; CHECK-LABEL: @sub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -205,22 +193,18 @@ ; CHECK-LABEL: @addsub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x 
i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -248,22 +232,18 @@ ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -291,22 +271,16 @@ ; CHECK-LABEL: @mul( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9 -; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* 
[[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -334,22 +308,16 @@ ; CHECK-LABEL: @shl0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -453,22 +421,16 @@ ; CHECK-LABEL: @add1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -496,22 +458,16 @@ ; CHECK-LABEL: @sub0f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* 
[[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -615,22 +571,18 @@ ; CHECK-LABEL: @addsub0f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -658,22 +610,18 @@ ; 
CHECK-LABEL: @addsub1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -701,22 +649,16 @@ ; CHECK-LABEL: @mulf( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 -; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 
x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -825,22 +767,16 @@ ; CHECK-LABEL: @sub0fn( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry:
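To close, here is a hedged illustration of what the vect_copyable_in_binops.ll updates above are about. The function below is my own example, not part of the patch, and the name add1_like is hypothetical. One lane of the group is a plain copy; treating that copy as the surrounding binary operation applied with a neutral constant (an add of 0 here) gives all four lanes the same opcode, so the SLP vectorizer can emit a single vector load, one vector add against a constant such as <0, 1, 2, 3> (the exact vector constants are elided in the autogenerated CHECK lines above), and a single vector store.

// Hypothetical scalar source resembling the @add1 test: lane 0 is a plain
// copy, lanes 1-3 add different constants. Modelling the copy as
// "src[0] + 0" makes all four lanes the same binary opcode and lets them
// vectorize together. Pointers are assumed not to alias.
void add1_like(int *dst, const int *src) {
  dst[0] = src[0];
  dst[1] = src[1] + 1;
  dst[2] = src[2] + 2;
  dst[3] = src[3] + 3;
}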