Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -299,12 +299,31 @@
     : TargetTransformInfo::SK_PermuteSingleSrc;
 }
 
+static bool isRemainder(unsigned Opcode) {
+  return (Opcode == Instruction::SRem || Opcode == Instruction::URem ||
+          Opcode == Instruction::FRem);
+}
+
+/// Checks if the \p Opcode can be considered as an operand of a (possibly)
+/// binary operation \p I.
+/// \returns The code of the binary operation of instruction \p I if the
+/// instruction with \p Opcode can be considered as an operand of \p I with the
+/// default value.
+static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
+  if (I->getOpcode() != Instruction::PHI && !isRemainder(I->getOpcode()) &&
+      (I->getType()->isIntegerTy() ||
+       (isa<FPMathOperator>(I) && cast<FPMathOperator>(I)->isFast())))
+    return I->getOpcode();
+  return 0;
+}
+
 namespace {
 
 /// Main data required for vectorization of instructions.
 struct InstructionsState {
   /// The very first instruction in the list with the main opcode.
   Value *OpValue = nullptr;
+  Value *Parent = nullptr;
 
   /// The main/alternate instruction.
   Instruction *MainOp = nullptr;
@@ -315,21 +334,28 @@
     return MainOp ? MainOp->getOpcode() : 0;
   }
 
+  std::pair<Value *, unsigned> getKey() const {
+    assert(Parent && "Incorrect parent!");
+    return std::make_pair(Parent, getOpcode());
+  }
+
   unsigned getAltOpcode() const {
     return AltOp ? AltOp->getOpcode() : 0;
   }
 
   /// Some of the instructions in the list have alternate opcodes.
-  bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+  bool isAltShuffle() const { return (getOpcode() != 0 && getAltOpcode() != 0 &&
+                                      getOpcode() != getAltOpcode()); }
 
   bool isOpcodeOrAlt(Instruction *I) const {
     unsigned CheckedOpcode = I->getOpcode();
     return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
   }
 
-  InstructionsState() = delete;
-  InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
-      : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
+  InstructionsState() = default;
+  InstructionsState(Value *OpValue, Value *Parent,
+                    Instruction *MainOp, Instruction *AltOp)
+      : OpValue(OpValue), Parent(Parent), MainOp(MainOp), AltOp(AltOp) {}
 };
 
 } // end anonymous namespace
@@ -337,58 +363,112 @@
 /// Chooses the correct key for scheduling data. If \p Op has the same (or
 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
 /// OpValue.
-static Value *isOneOf(const InstructionsState &S, Value *Op) {
-  auto *I = dyn_cast<Instruction>(Op);
+static Value *isOneOf(const InstructionsState &S, Instruction *I) {
   if (I && S.isOpcodeOrAlt(I))
-    return Op;
+    return I;
   return S.OpValue;
 }
 
 /// \returns analysis of the Instructions in \p VL described in
 /// InstructionsState, the Opcode that we suppose the whole list
 /// could be vectorized even if its structure is diverse.
-static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+static InstructionsState getSameOpcode(Value *Parent, ArrayRef<Value *> VL,
                                        unsigned BaseIndex = 0) {
+  assert(Parent && "Incorrect parent!");
   // Make sure these are all Instructions.
if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState(VL[BaseIndex], Parent, nullptr, nullptr); + unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); bool IsCastOp = isa(VL[BaseIndex]); bool IsBinOp = isa(VL[BaseIndex]); - unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); + bool IsNonAlt = false; unsigned AltOpcode = Opcode; + unsigned OpcodeNum = 0; + unsigned AltOpcodeNum = 0; + unsigned NonAltNum = 0; + unsigned NonAltIndex = 0; unsigned AltIndex = BaseIndex; - // Check for one alternate opcode from another BinaryOperator. - // TODO - generalize to support all operators (types, calls etc.). + // Check for an alternate opcode pattern. for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { - unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); - if (IsBinOp && isa(VL[Cnt])) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) - continue; - if (Opcode == AltOpcode) { - AltOpcode = InstOpcode; - AltIndex = Cnt; - continue; - } - } else if (IsCastOp && isa(VL[Cnt])) { + auto *I = cast(VL[Cnt]); + unsigned InstOpcode = I->getOpcode(); + if (IsCastOp && isa(VL[Cnt])) { Type *Ty0 = cast(VL[BaseIndex])->getOperand(0)->getType(); Type *Ty1 = cast(VL[Cnt])->getOperand(0)->getType(); if (Ty0 == Ty1) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) + if (InstOpcode == Opcode) { + OpcodeNum++; + continue; + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; continue; + } if (Opcode == AltOpcode) { AltOpcode = InstOpcode; AltIndex = Cnt; + AltOpcodeNum++; continue; } } - } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) + return InstructionsState(VL[BaseIndex], Parent, nullptr, nullptr); + } + if (InstOpcode == Opcode) { + OpcodeNum++; continue; - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; + continue; + } + if (InstOpcode != Opcode && InstOpcode != AltOpcode) { + if (IsBinOp && AltOpcode == Opcode && isa(I)) { + AltOpcode = InstOpcode; + AltOpcodeNum++; + AltIndex = Cnt; + continue; + } + if (Opcode != Instruction::PHI && + (tryToRepresentAsInstArg(Opcode, I) || + (IsBinOp && InstOpcode != Instruction::PHI && + tryToRepresentAsInstArg(InstOpcode, + cast(VL[BaseIndex]))))) { + if (!IsNonAlt) { + NonAltIndex = Cnt; + IsNonAlt = true; + } + NonAltNum++; + continue; + } + return InstructionsState(VL[BaseIndex], Parent, nullptr, nullptr); + } } - return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), + if (IsNonAlt && VL.size() > 2 && (OpcodeNum + AltOpcodeNum) <= NonAltNum) { + BaseIndex = NonAltIndex; + AltIndex = BaseIndex; + Opcode = cast(VL[BaseIndex])->getOpcode(); + AltOpcode = Opcode; + IsBinOp = isa(VL[BaseIndex]); + for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { + auto *I = cast(VL[Cnt]); + unsigned InstOpcode = I->getOpcode(); + if (Opcode == AltOpcode && IsBinOp && isa(I)) { + AltOpcode = InstOpcode; + AltIndex = Cnt; + } + } + } + + if (IsNonAlt && (!IsBinOp || + isRemainder(Opcode) || + isRemainder(AltOpcode))) + return InstructionsState(VL[BaseIndex], Parent, nullptr, nullptr); + + return InstructionsState(VL[BaseIndex], Parent, + cast(VL[BaseIndex]), cast(VL[AltIndex])); } @@ -613,7 +693,8 @@ int getEntryCost(TreeEntry *E); /// This is the recursive part of buildTree. 
- void buildTree_rec(ArrayRef Roots, unsigned Depth, int); + void buildTree_rec(Value *Parent, ArrayRef Roots, unsigned Depth, + int); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a @@ -627,7 +708,7 @@ Value *vectorizeTree(TreeEntry *E); /// Vectorize a single entry in the tree, starting in \p VL. - Value *vectorizeTree(ArrayRef VL); + Value *vectorizeTree(ArrayRef VL, Value *Parent); /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. @@ -701,10 +782,14 @@ /// The TreeEntry index containing the user of this entry. We can actually /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; + + /// Info about instruction in this tree entry. + InstructionsState State; }; /// Create a new VectorizableTree entry. void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, + const InstructionsState &S, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { VectorizableTree.emplace_back(VectorizableTree); @@ -716,11 +801,22 @@ ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; if (Vectorized) { - for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = idx; + Last->State = S; + for (Value *V: VL) { + auto *I = cast(V); + assert(!getTreeEntry(I, S.getKey()) && "Scalar already in tree!"); + ScalarToTreeEntry[I][S.getKey()] = idx; } } else { + for (Value *V: VL) { + if (Instruction *I = dyn_cast(V)) { + Last->State.MainOp = I; + Last->State.AltOp = I; + break; + } + } + Last->State.OpValue = VL[0]; + Last->State.Parent = VL[0]; MustGather.insert(VL.begin(), VL.end()); } @@ -733,15 +829,36 @@ /// Holds all of the tree entries. std::vector VectorizableTree; - TreeEntry *getTreeEntry(Value *V) { - auto I = ScalarToTreeEntry.find(V); - if (I != ScalarToTreeEntry.end()) - return &VectorizableTree[I->second]; + TreeEntry *getTreeEntry(Instruction *I) { + if (!I) + return nullptr; + auto It = ScalarToTreeEntry.find(I); + if (It != ScalarToTreeEntry.end()) { + auto &STT = It->second; + for (auto STTI : STT) { + if (isOneOf(VectorizableTree[STTI.second].State, I) == I) + return &VectorizableTree[STTI.second]; + } + } + return nullptr; + } + + TreeEntry *getTreeEntry(Instruction *I, std::pair Key) { + if (!I) + return nullptr; + auto It = ScalarToTreeEntry.find(I); + if (It != ScalarToTreeEntry.end()) { + auto &STT = It->second; + auto STTI = STT.find(Key); + if (STTI != STT.end()) + return &VectorizableTree[STTI->second]; + } return nullptr; } /// Maps a specific scalar to its tree entry. - SmallDenseMap ScalarToTreeEntry; + SmallDenseMap, int>> + ScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -831,19 +948,6 @@ // dependencies are not calculated yet. enum { InvalidDeps = -1 }; - ScheduleData() = default; - - void init(int BlockSchedulingRegionID, Value *OpVal) { - FirstInBundle = this; - NextInBundle = nullptr; - NextLoadStore = nullptr; - IsScheduled = false; - SchedulingRegionID = BlockSchedulingRegionID; - UnscheduledDepsInBundle = UnscheduledDeps; - clearDependencies(); - OpValue = OpVal; - } - /// Returns true if the dependency information has been calculated. 
bool hasValidDependencies() const { return Dependencies != InvalidDeps; } @@ -885,24 +989,39 @@ MemoryDependencies.clear(); } + /// Get an instruction behind this ScheduleData instance. + virtual Instruction *getInst() const = 0; + + /// Returns true if the instance is a pseudo instruction one. + virtual bool isPseudo() const = 0; + void dump(raw_ostream &os) const { if (!isSchedulingEntity()) { - os << "/ " << *Inst; + os << "/ "; + if (isPseudo()) + os << "*"; + os << *getInst(); } else if (NextInBundle) { - os << '[' << *Inst; + os << '['; + if (isPseudo()) + os << "*"; + os << *getInst(); ScheduleData *SD = NextInBundle; while (SD) { - os << ';' << *SD->Inst; - SD = SD->NextInBundle; + os << ';' ; + if (SD->isPseudo()) + os << "*"; + os << *SD->getInst(); + SD = SD->NextInBundle; } os << ']'; } else { - os << *Inst; + if (isPseudo()) + os << "*"; + os << *getInst(); } } - Instruction *Inst = nullptr; - /// Points to the head in an instruction bundle (and always to this for /// single instructions). ScheduleData *FirstInBundle = nullptr; @@ -946,8 +1065,66 @@ /// dry-run). bool IsScheduled = false; - /// Opcode of the current instruction in the schedule data. - Value *OpValue = nullptr; + /// Opcode that represents instructions to be vectorized. + unsigned Opcode = 0; + + Value *Parent = nullptr; + }; + + struct InstScheduleData : public ScheduleData { + + InstScheduleData() = default; + + Instruction *Inst = nullptr; + + void init(int BlockSchedulingRegionID) { + FirstInBundle = this; + NextInBundle = nullptr; + NextLoadStore = nullptr; + IsScheduled = false; + SchedulingRegionID = BlockSchedulingRegionID; + UnscheduledDepsInBundle = UnscheduledDeps; + clearDependencies(); + } + + Instruction *getInst() const { + return Inst; + } + + bool isPseudo() const { + return false; + } + + }; + + struct PseudoScheduleData : public ScheduleData { + + PseudoScheduleData() = default; + + InstScheduleData *ISD; + + void init(int BlockSchedulingRegionID, InstScheduleData *OpISD, + Value *OpParent, unsigned OpCode) { + FirstInBundle = this; + NextInBundle = nullptr; + NextLoadStore = OpISD->NextLoadStore; + IsScheduled = false; + SchedulingRegionID = BlockSchedulingRegionID; + UnscheduledDepsInBundle = UnscheduledDeps; + clearDependencies(); + ISD = OpISD; + Opcode = OpCode; + Parent = OpParent; + } + + Instruction *getInst() const { + return ISD->Inst; + } + + bool isPseudo() const { + return true; + } + }; #ifndef NDEBUG @@ -964,7 +1141,8 @@ /// Contains all scheduling data for a basic block. struct BlockScheduling { BlockScheduling(BasicBlock *BB) - : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {} + : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize), + PseudoChunkSize(BB->size()), PseudoChunkPos(PseudoChunkSize) {} void clear() { ReadyInsts.clear(); @@ -972,6 +1150,7 @@ ScheduleEnd = nullptr; FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; + PseudoInstScheduleDataMap.clear(); // Reduce the maximum schedule region size by the size of the // previous scheduling run. 
@@ -985,21 +1164,24 @@ ++SchedulingRegionID; } - ScheduleData *getScheduleData(Value *V) { - ScheduleData *SD = ScheduleDataMap[V]; + InstScheduleData *getInstScheduleData(Instruction *I) { + InstScheduleData *SD = InstScheduleDataMap[I]; if (SD && SD->SchedulingRegionID == SchedulingRegionID) return SD; return nullptr; } - ScheduleData *getScheduleData(Value *V, Value *Key) { - if (V == Key) - return getScheduleData(V); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) { - ScheduleData *SD = I->second[Key]; - if (SD && SD->SchedulingRegionID == SchedulingRegionID) - return SD; + ScheduleData *getScheduleData(Instruction *I, + std::pair Key) { + ScheduleData *SD = getInstScheduleData(I); + if (SD && SD->Parent == Key.first && SD->Opcode == Key.second) + return SD; + auto It = PseudoInstScheduleDataMap.find(I); + if (It != PseudoInstScheduleDataMap.end()) { + PseudoScheduleData *PSD = It->second[Key]; + if (PSD && PSD->SchedulingRegionID == SchedulingRegionID && + PSD->Parent == Key.first && PSD->Opcode == Key.second) + return PSD; } return nullptr; } @@ -1016,13 +1198,13 @@ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - if (BundleMember->Inst != BundleMember->OpValue) { - BundleMember = BundleMember->NextInBundle; - continue; - } + assert(BundleMember->Opcode == Opcode && + BundleMember->Parent == Parent && "Corrupt bundle member"); // Handle the def-use chain dependencies. - for (Use &U : BundleMember->Inst->operands()) { + for (Use &U : BundleMember->getInst()->operands()) { auto *I = dyn_cast(U.get()); if (!I) continue; @@ -1058,15 +1240,21 @@ } } - void doForAllOpcodes(Value *V, + void doForAllOpcodes(Instruction *I, function_ref Action) { - if (ScheduleData *SD = getScheduleData(V)) + auto It = PseudoInstScheduleDataMap.find(I); + if (It != PseudoInstScheduleDataMap.end()) { + for (auto &P : It->second) { + ScheduleData *SD = P.second; + if (SD && SD->isPartOfBundle() && + SD->SchedulingRegionID == SchedulingRegionID) { + Action(SD); + } + } + } + if (ScheduleData *SD = getInstScheduleData(I)) { Action(SD); - auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) - for (auto &P : I->second) - if (P.second->SchedulingRegionID == SchedulingRegionID) - Action(P.second); + } } /// Put all instructions into the ReadyList which are ready for scheduling. @@ -1090,20 +1278,22 @@ const InstructionsState &S); /// Un-bundles a group of instructions. - void cancelScheduling(ArrayRef VL, Value *OpValue); + void cancelScheduling(Value *OpValue, std::pair Key); /// Allocates schedule data chunk. - ScheduleData *allocateScheduleDataChunks(); + InstScheduleData *allocateInstScheduleDataChunks(); - /// Extends the scheduling region so that V is inside the region. + PseudoScheduleData *allocatePseudoInstDataChunks(); + + /// Extends the scheduling region so that I is inside the region. /// \returns true if the region size is within the limit. - bool extendSchedulingRegion(Value *V, const InstructionsState &S); + bool extendSchedulingRegion(Instruction *I, const InstructionsState &S); - /// Initialize the ScheduleData structures for new instructions in the + /// Initialize the InstScheduleData structures for new instructions in the /// scheduling region. 
void initScheduleData(Instruction *FromI, Instruction *ToI, - ScheduleData *PrevLoadStore, - ScheduleData *NextLoadStore); + InstScheduleData *PrevLoadStore, + InstScheduleData *NextLoadStore); /// Updates the dependency information of a bundle and of all instructions/ /// bundles which depend on the original bundle. @@ -1113,26 +1303,39 @@ /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); + /// Reorder bundles from PseudoScheduleData data after scheduling, + /// if an Instruction is present in PseudoScheduleData that means this + /// Instruction is prenet in multiply bundles and FirstInBundle is not last + /// one scheduled for all copies of instuction in InstScheduleData and + /// PseudoScheduleData. + void reorderBundles(); + BasicBlock *BB; - /// Simple memory allocation for ScheduleData. - std::vector> ScheduleDataChunks; + /// Simple memory allocation for InstScheduleData. + std::vector> InstScheduleDataChunks; + + std::vector> PseudoScheduleDataChunks; - /// The size of a ScheduleData array in ScheduleDataChunks. + /// The size of a InstScheduleData array in InstScheduleDataChunks. int ChunkSize; /// The allocator position in the current chunk, which is the last entry - /// of ScheduleDataChunks. + /// of InstScheduleDataChunks. int ChunkPos; - /// Attaches ScheduleData to Instruction. + int PseudoChunkSize; + + int PseudoChunkPos; + + /// Attaches InstScheduleData to Instruction. /// Note that the mapping survives during all vectorization iterations, i.e. - /// ScheduleData structures are recycled. - DenseMap ScheduleDataMap; + /// InstScheduleData structures are recycled. + DenseMap InstScheduleDataMap; - /// Attaches ScheduleData to Instruction with the leading key. - DenseMap> - ExtraScheduleDataMap; + DenseMap, PseudoScheduleData *>> + PseudoInstScheduleDataMap; struct ReadyList : SmallVector { void insert(ScheduleData *SD) { push_back(SD); } @@ -1149,11 +1352,11 @@ /// The first memory accessing instruction in the scheduling region /// (can be null). - ScheduleData *FirstLoadStoreInRegion = nullptr; + InstScheduleData *FirstLoadStoreInRegion = nullptr; /// The last memory accessing instruction in the scheduling region /// (can be null). - ScheduleData *LastLoadStoreInRegion = nullptr; + InstScheduleData *LastLoadStoreInRegion = nullptr; /// The current size of the scheduling region. int ScheduleRegionSize = 0; @@ -1162,9 +1365,9 @@ int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget; /// The ID of the scheduling region. For a new vectorization iteration this - /// is incremented which "removes" all ScheduleData from the region. + /// is incremented which "removes" all InstScheduleData from the region. // Make sure that the initial SchedulingRegionID is greater than the - // initial SchedulingRegionID in ScheduleData (which is 0). + // initial SchedulingRegionID in InstScheduleData (which is 0). int SchedulingRegionID = 1; }; @@ -1331,7 +1534,7 @@ UserIgnoreList = UserIgnoreLst; if (!allSameType(Roots)) return; - buildTree_rec(Roots, 0, -1); + buildTree_rec(Roots[0], Roots, 0, -1); // Collect the values that we need to extract from the tree. 
for (TreeEntry &EIdx : VectorizableTree) { @@ -1345,6 +1548,8 @@ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; int FoundLane = Lane; + if (!Entry->State.isOpcodeOrAlt(cast(Scalar))) + continue; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), @@ -1366,7 +1571,7 @@ continue; // Skip in-tree scalars that become vectors - if (TreeEntry *UseEntry = getTreeEntry(U)) { + if (TreeEntry *UseEntry = getTreeEntry(cast(U))) { Value *UseScalar = UseEntry->Scalars[0]; // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will @@ -1392,35 +1597,63 @@ } } -void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, +static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) { + switch(Opcode) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + return ConstantInt::getNullValue(Ty); + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + return ConstantInt::get(Ty, /*V=*/1); + case Instruction::FAdd: + case Instruction::FSub: + return ConstantFP::get(Ty, /*V=*/0.0); + case Instruction::FMul: + case Instruction::FDiv: + return ConstantFP::get(Ty, /*V=*/1.0); + case Instruction::And: + return ConstantInt::getAllOnesValue(Ty); + default: + break; + } + llvm_unreachable("unknown binop for default constant value"); +} + +void BoUpSLP::buildTree_rec(Value *Parent, ArrayRef VL, unsigned Depth, int UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); - InstructionsState S = getSameOpcode(VL); + InstructionsState S = getSameOpcode(Parent, VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } @@ -1432,17 +1665,17 @@ if (EphValues.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } // Check if this is a duplicate of another entry. - if (TreeEntry *E = getTreeEntry(S.OpValue)) { + if (TreeEntry *E = getTreeEntry(cast(S.OpValue))) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // Record the reuse of the tree node. 
FIXME, currently this is only used to @@ -1455,13 +1688,11 @@ // Check that none of the instructions in the bundle are already in the tree. for (unsigned i = 0, e = VL.size(); i != e; ++i) { - auto *I = dyn_cast(VL[i]); - if (!I) - continue; - if (getTreeEntry(I)) { + auto *I = cast(VL[i]); + if (getTreeEntry(I) || getTreeEntry(I, S.getKey())) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1471,7 +1702,7 @@ for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1485,7 +1716,7 @@ // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } @@ -1505,7 +1736,7 @@ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } VL = UniqueValues; @@ -1519,10 +1750,10 @@ if (!BS.tryScheduleBundle(VL, this, S)) { LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - assert((!BS.getScheduleData(VL0) || - !BS.getScheduleData(VL0)->isPartOfBundle()) && + assert((!BS.getScheduleData(VL0, S.getKey()) || + !BS.getScheduleData(VL0, S.getKey())->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1542,13 +1773,13 @@ if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1558,7 +1789,7 @@ Operands.push_back(cast(j)->getIncomingValueForBlock( PH->getIncomingBlock(i))); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1569,7 +1800,7 @@ if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies); return; } @@ -1586,13 +1817,15 @@ auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - 
newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); - BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, S, + ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); return; } case Instruction::Load: { @@ -1606,8 +1839,8 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1619,8 +1852,8 @@ for (Value *V : VL) { auto *L = cast(V); if (!L->isSimple()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1650,14 +1883,14 @@ if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies, I->getFirst()); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } @@ -1666,8 +1899,8 @@ } LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -1686,14 +1919,14 @@ for (unsigned i = 0; i < VL.size(); ++i) { Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1702,7 +1935,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1715,15 +1948,15 @@ CmpInst *Cmp = cast(VL[i]); if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1732,7 
+1965,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1755,7 +1988,7 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1763,18 +1996,26 @@ if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; reorderInputsAccordingToOpcode(S.getOpcode(), VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Left, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (I->getOpcode() == S.getOpcode()) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Operands.push_back(VecOp); + } + if (allSameType(Operands)) + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; @@ -1783,8 +2024,8 @@ for (unsigned j = 0; j < VL.size(); ++j) { if (cast(VL[j])->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } @@ -1797,8 +2038,8 @@ if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } @@ -1809,13 +2050,13 @@ if (!isa(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1823,7 +2064,7 @@ for (Value *j : VL) Operands.push_back(cast(j)->getOperand(i)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1831,20 +2072,20 @@ // Check if the stores are consecutive or of we need to swizzle them. 
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; for (Value *j : VL) Operands.push_back(cast(j)->getOperand(0)); - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); return; } case Instruction::Call: { @@ -1854,8 +2095,8 @@ // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1868,8 +2109,8 @@ if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1879,8 +2120,8 @@ if (hasVectorInstrinsicScalarOpd(ID, 1)) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << A1I << "!=" << A1J << "\n"); return; @@ -1891,23 +2132,32 @@ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) { - CallInst *CI2 = dyn_cast(j); - Operands.push_back(CI2->getArgOperand(i)); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (S.isOpcodeOrAlt(I)) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = getDefaultConstantForOpcode(S.getOpcode(), + I->getType()); + Operands.push_back(Operand); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + if (allSameType(Operands)) + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; } @@ -1915,36 +2165,45 @@ // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. 
if (!S.isAltShuffle()) { - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; reorderAltShuffleOperands(S, VL, Left, Right); - buildTree_rec(Left, Depth + 1, UserTreeIdx); - buildTree_rec(Right, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Left, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Right, Depth + 1, UserTreeIdx); return; } for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (S.isOpcodeOrAlt(I)) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = getDefaultConstantForOpcode(S.getOpcode(), I->getType()); + Operands.push_back(Operand); + } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + buildTree_rec(VL0, Operands, Depth + 1, UserTreeIdx); } return; default: - BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + BS.cancelScheduling(VL0, S.getKey()); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1980,7 +2239,7 @@ Instruction *E0 = cast(OpValue); assert(E0->getOpcode() == Instruction::ExtractElement || E0->getOpcode() == Instruction::ExtractValue); - assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode"); + assert(E0->getOpcode() == getSameOpcode(VL[0], VL).getOpcode() && "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. Value *Vec = E0->getOperand(0); @@ -2045,7 +2304,7 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const { return I->hasOneUse() || std::all_of(I->user_begin(), I->user_end(), [this](User *U) { - return ScalarToTreeEntry.count(U) > 0; + return ScalarToTreeEntry.count(dyn_cast(U)) > 0; }); } @@ -2079,7 +2338,7 @@ return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && + if (getSameOpcode(VL[0], VL).getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL)) { Optional ShuffleKind = isShuffle(VL); if (ShuffleKind.hasValue()) { @@ -2089,10 +2348,11 @@ // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. 
- if (areAllUsersVectorized(cast(V)) && - !ScalarToTreeEntry.count(V)) { + auto *I = cast(V); + if (areAllUsersVectorized(I) && + !ScalarToTreeEntry.count(I)) { auto *IO = cast( - cast(V)->getIndexOperand()); + cast(I)->getIndexOperand()); Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, IO->getZExtValue()); } @@ -2102,11 +2362,11 @@ } return ReuseShuffleCost + getGatherCost(VL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + assert(E->State.getOpcode() && allSameType(VL) && allSameBlock(VL) && + "Invalid VL"); + auto *VL0 = cast(E->State.OpValue); + unsigned ShuffleOrOp = E->State.isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : E->State.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2192,7 +2452,7 @@ case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(E->State.getOpcode(), ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } @@ -2205,7 +2465,8 @@ // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(E->State.getOpcode(), VecTy, + SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2213,14 +2474,16 @@ case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. - int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, + int ScalarEltCost = TTI->getCmpSelInstrCost(E->State.getOpcode(), + ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(E->State.getOpcode(), VecTy, + MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Add: @@ -2246,7 +2509,7 @@ TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; + TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueProperties Op1VP = TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = @@ -2257,35 +2520,40 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. 
- ConstantInt *CInt0 = nullptr; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - const Instruction *I = cast(VL[i]); - ConstantInt *CInt = dyn_cast(I->getOperand(1)); - if (!CInt) { - Op2VK = TargetTransformInfo::OK_AnyValue; - Op2VP = TargetTransformInfo::OP_None; - break; - } - if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_None; - if (i == 0) { - CInt0 = CInt; - continue; + if (auto *CInt = dyn_cast(VL0->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + const unsigned Opcode = E->State.getOpcode(); + for (auto *V : VL) { + auto *I = cast(V); + if (I == VL0 || Opcode != I->getOpcode()) + continue; + if (!isa(I->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_AnyValue; + Op2VP = TargetTransformInfo::OP_None; + break; + } + ConstantInt *CInt_cur = cast(I->getOperand(1)); + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && + CInt != cast(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + if (Op2VP == TargetTransformInfo::OP_PowerOf2 && + !CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_None; + if (CInt != CInt_cur) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } - if (CInt0 != CInt) - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } SmallVector Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->State.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost(E->State.getOpcode(), VecTy, + Op1VK, Op2VK, Op1VP, Op2VP, + Operands); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -2366,11 +2634,11 @@ return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->State.isAltShuffle() && + ((Instruction::isBinaryOp(E->State.getOpcode()) && + Instruction::isBinaryOp(E->State.getAltOpcode())) || + (Instruction::isCast(E->State.getOpcode()) && + Instruction::isCast(E->State.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { @@ -2387,23 +2655,23 @@ } for (Value *i : VL) { Instruction *I = cast(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. 
int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(E->State.getOpcode())) { + VecCost = TTI->getArithmeticInstrCost(E->State.getOpcode(), VecTy); + VecCost += TTI->getArithmeticInstrCost(E->State.getAltOpcode(), VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); + Type *Src0SclTy = E->State.MainOp->getOperand(0)->getType(); + Type *Src1SclTy = E->State.AltOp->getOperand(0)->getType(); VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + VecCost = TTI->getCastInstrCost(E->State.getOpcode(), VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(E->State.getAltOpcode(), VecTy, + Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -2469,7 +2737,7 @@ Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast(N.Scalars[0]); + Instruction *Inst = dyn_cast(N.State.OpValue); if (!Inst) continue; @@ -2481,8 +2749,9 @@ // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && getTreeEntry(&*J)) - LiveValues.insert(cast(&*J)); + auto *I = dyn_cast(&*J); + if (I && getTreeEntry(I)) + LiveValues.insert(I); } LLVM_DEBUG({ @@ -2654,9 +2923,13 @@ // Push left and right operands of binary operation into Left and Right for (Value *V : VL) { auto *I = cast(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); + if (S.isOpcodeOrAlt(I)) { + Left.push_back(I->getOperand(0)); + Right.push_back(I->getOperand(1)); + } else { + Left.push_back(I); + Right.push_back(getDefaultConstantForOpcode(S.getOpcode(), I->getType())); + } } // Reorder if we have a commutative operation and consecutive access @@ -2705,8 +2978,13 @@ int i, unsigned Opcode, Instruction &I, ArrayRef Left, ArrayRef Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { - VLeft = I.getOperand(0); - VRight = I.getOperand(1); + if (I.getOpcode() == Opcode) { + VLeft = I.getOperand(0); + VRight = I.getOperand(1); + } else { + VLeft = &I; + VRight = getDefaultConstantForOpcode(Opcode, I.getType()); + } // If we have "SplatRight", try to see if commuting is needed to preserve it. if (SplatRight) { if (VRight == Right[i - 1]) @@ -2770,8 +3048,15 @@ // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. auto *I = cast(VL[0]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); + Value *VLeft; + Value *VRight; + if (I->getOpcode() == Opcode) { + VLeft = I->getOperand(0); + VRight = I->getOperand(1); + } else { + VLeft = I; + VRight = getDefaultConstantForOpcode(Opcode, I->getType()); + } if (!isa(VRight) && isa(VLeft)) // Favor having instruction to the right. FIXME: why? std::swap(VLeft, VRight); @@ -2869,17 +3154,15 @@ // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; - // Find the last instruction. 
The common case should be that BB has been - // scheduled, and the last instruction is VL.back(). So we start with - // VL.back() and iterate over schedule data until we reach the end of the - // bundle. The end of the bundle is marked by null ScheduleData. + // Find the last instruction. If the bundle is not scheduled then + // the first in the bundle is the last one in BB, because we discover + // bundles in backward walk. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + BlockScheduling *BS = BlocksSchedules[BB].get(); + auto *Bundle = BS->getScheduleData(cast(S.OpValue), + S.getKey()); if (Bundle && Bundle->isPartOfBundle()) - for (; Bundle; Bundle = Bundle->NextInBundle) - if (Bundle->OpValue == Bundle->Inst) - LastInst = Bundle->Inst; + LastInst = Bundle->FirstInBundle->getInst(); } // LastInst can still be null at this point if there's either not an entry @@ -2926,7 +3209,7 @@ CSEBlocks.insert(Insrt->getParent()); // Add to our 'need-to-extract' list. - if (TreeEntry *E = getTreeEntry(VL[i])) { + if (TreeEntry *E = getTreeEntry(dyn_cast(VL[i]))) { // Find which lane we need to extract. int FoundLane = -1; for (unsigned Lane = 0, LE = E->Scalars.size(); Lane != LE; ++Lane) { @@ -2950,10 +3233,11 @@ return Vec; } -Value *BoUpSLP::vectorizeTree(ArrayRef VL) { - InstructionsState S = getSameOpcode(VL); +Value *BoUpSLP::vectorizeTree(ArrayRef VL, Value *Parent) { + InstructionsState S = getSameOpcode(Parent, VL); if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { + TreeEntry *E = getTreeEntry(dyn_cast(S.OpValue)); + if (E && E->State.getOpcode() == S.getOpcode()) { if (E->isSame(VL)) { Value *V = vectorizeTree(E); if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { @@ -3026,12 +3310,12 @@ IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " + << *E->State.OpValue << ".\n"); return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast(S.OpValue); + auto *VL0 = cast(E->State.OpValue); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3040,7 +3324,7 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3054,8 +3338,8 @@ return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = E->State.isAltShuffle() ? 
+ (unsigned) Instruction::ShuffleVector : E->State.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -3088,7 +3372,7 @@ Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); - Value *Vec = vectorizeTree(Operands); + Value *Vec = vectorizeTree(Operands, E->State.OpValue); NewPhi->addIncoming(Vec, IBB); } @@ -3117,7 +3401,7 @@ E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3152,7 +3436,7 @@ E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3181,9 +3465,9 @@ for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); - Value *InVec = vectorizeTree(INVL); + Value *InVec = vectorizeTree(INVL, E->State.OpValue); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3208,10 +3492,10 @@ RHSV.push_back(cast(V)->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); - Value *L = vectorizeTree(LHSV); - Value *R = vectorizeTree(RHSV); + Value *L = vectorizeTree(LHSV, E->State.OpValue); + Value *R = vectorizeTree(RHSV, E->State.OpValue); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3220,7 +3504,7 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->State.getOpcode() == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3242,11 +3526,11 @@ FalseVec.push_back(cast(V)->getOperand(2)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); - Value *Cond = vectorizeTree(CondVec); - Value *True = vectorizeTree(TrueVec); - Value *False = vectorizeTree(FalseVec); + Value *Cond = vectorizeTree(CondVec, E->State.OpValue); + Value *True = vectorizeTree(TrueVec, E->State.OpValue); + Value *False = vectorizeTree(FalseVec, E->State.OpValue); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n"); @@ -3282,19 +3566,26 @@ case Instruction::Xor: { ValueList LHSVL, RHSVL; if (isa(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, - RHSVL); + reorderInputsAccordingToOpcode(E->State.getOpcode(), E->Scalars, + LHSVL, RHSVL); else for (Value *V : E->Scalars) { auto *I = cast(V); - LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); + if (I->getOpcode() == E->State.getOpcode()) { + LHSVL.push_back(I->getOperand(0)); + RHSVL.push_back(I->getOperand(1)); + } else { + LHSVL.push_back(V); + RHSVL.push_back( + getDefaultConstantForOpcode(E->State.getOpcode(), + I->getType())); + } } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); - Value *LHS = vectorizeTree(LHSVL); - Value *RHS = vectorizeTree(RHSVL); + Value *LHS = vectorizeTree(LHSVL, E->State.OpValue); + Value *RHS = vectorizeTree(RHSVL, E->State.OpValue); if (E->VectorizedValue) { LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " 
<< *VL0 << ".\n"); @@ -3302,7 +3593,7 @@ } Value *V = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(VL0->getOpcode()), LHS, RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); @@ -3321,10 +3612,12 @@ // sink them all the way down past store instructions. bool IsReorder = !E->ReorderIndices.empty(); if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); + InstructionsState S = getSameOpcode(E->State.OpValue, E->Scalars, + E->ReorderIndices.front()); VL0 = cast(S.OpValue); - } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, S); + } else + setInsertPointAfterBundle(E->Scalars, E->State); LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3337,7 +3630,7 @@ // ExternalUses list to make sure that an extract will be generated in the // future. Value *PO = LI->getPointerOperand(); - if (getTreeEntry(PO)) + if (getTreeEntry(dyn_cast(PO))) ExternalUses.push_back(ExternalUser(PO, cast(VecPtr), 0)); unsigned Alignment = LI->getAlignment(); @@ -3371,9 +3664,9 @@ for (Value *V : E->Scalars) ScalarStoreValues.push_back(cast(V)->getValueOperand()); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); - Value *VecValue = vectorizeTree(ScalarStoreValues); + Value *VecValue = vectorizeTree(ScalarStoreValues, E->State.OpValue); Value *ScalarPtr = SI->getPointerOperand(); Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); @@ -3381,7 +3674,7 @@ // The pointer operand uses an in-tree scalar, so add the new BitCast to // ExternalUses to make sure that an extract will be generated in the // future. - if (getTreeEntry(ScalarPtr)) + if (getTreeEntry(dyn_cast(ScalarPtr))) ExternalUses.push_back(ExternalUser(ScalarPtr, cast(VecPtr), 0)); if (!Alignment) @@ -3398,13 +3691,13 @@ return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); ValueList Op0VL; for (Value *V : E->Scalars) Op0VL.push_back(cast(V)->getOperand(0)); - Value *Op0 = vectorizeTree(Op0VL); + Value *Op0 = vectorizeTree(Op0VL, E->State.OpValue); std::vector OpVecs; for (int j = 1, e = cast(VL0)->getNumOperands(); j < e; @@ -3413,7 +3706,7 @@ for (Value *V : E->Scalars) OpVL.push_back(cast(V)->getOperand(j)); - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(OpVL, E->State.OpValue); OpVecs.push_back(OpVec); } @@ -3433,7 +3726,7 @@ } case Instruction::Call: { CallInst *CI = cast(VL0); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; @@ -3456,7 +3749,7 @@ OpVL.push_back(CEI->getArgOperand(j)); } - Value *OpVec = vectorizeTree(OpVL); + Value *OpVec = vectorizeTree(OpVL, E->State.OpValue); LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n"); OpVecs.push_back(OpVec); } @@ -3472,7 +3765,7 @@ // The scalar argument uses an in-tree scalar so we add the new vectorized // call to ExternalUses list to make sure that an extract will be // generated in the future. 
- if (ScalarArg && getTreeEntry(ScalarArg)) + if (ScalarArg && getTreeEntry(dyn_cast(ScalarArg))) ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); propagateIRFlags(V, E->Scalars, VL0); @@ -3486,25 +3779,25 @@ } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->State.isAltShuffle() && + ((Instruction::isBinaryOp(E->State.getOpcode()) && + Instruction::isBinaryOp(E->State.getAltOpcode())) || + (Instruction::isCast(E->State.getOpcode()) && + Instruction::isCast(E->State.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); - setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(LHSVL); - RHS = vectorizeTree(RHSVL); + if (Instruction::isBinaryOp(E->State.getOpcode())) { + reorderAltShuffleOperands(E->State, E->Scalars, LHSVL, RHSVL); + setInsertPointAfterBundle(E->Scalars, E->State); + LHS = vectorizeTree(LHSVL, E->State.OpValue); + RHS = vectorizeTree(RHSVL, E->State.OpValue); } else { ValueList INVL; for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); - LHS = vectorizeTree(INVL); + setInsertPointAfterBundle(E->Scalars, E->State); + LHS = vectorizeTree(INVL, E->State.OpValue); } if (E->VectorizedValue) { @@ -3513,16 +3806,20 @@ } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->State.getOpcode())) { V0 = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(E->State.getOpcode()), LHS, + RHS); V1 = Builder.CreateBinOp( - static_cast(S.getAltOpcode()), LHS, RHS); + static_cast(E->State.getAltOpcode()), LHS, + RHS); } else { V0 = Builder.CreateCast( - static_cast(S.getOpcode()), LHS, VecTy); + static_cast(E->State.getOpcode()), LHS, + VecTy); V1 = Builder.CreateCast( - static_cast(S.getAltOpcode()), LHS, VecTy); + static_cast(E->State.getAltOpcode()), LHS, + VecTy); } // Create shuffle to take alternate operations from the vector. @@ -3533,8 +3830,7 @@ SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + if (OpInst->getOpcode() == E->State.getAltOpcode()) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -3544,8 +3840,10 @@ } Value *ShuffleMask = ConstantVector::get(Mask); - propagateIRFlags(V0, OpScalars); - propagateIRFlags(V1, AltScalars); + InstructionsState S = getSameOpcode(E->State.OpValue, OpScalars); + propagateIRFlags(V0, OpScalars, S.OpValue); + S = getSameOpcode(E->State.OpValue, AltScalars); + propagateIRFlags(V1, AltScalars, S.OpValue); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); if (Instruction *I = dyn_cast(V)) @@ -3583,7 +3881,7 @@ // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. 
-  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+  auto *ScalarRoot = VectorizableTree[0].State.OpValue;
   if (MinBWs.count(ScalarRoot)) {
     if (auto *I = dyn_cast(VectorRoot))
       Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
@@ -3616,7 +3914,7 @@
     // has multiple uses of the same value.
     if (User && !is_contained(Scalar->users(), User))
       continue;
-    TreeEntry *E = getTreeEntry(Scalar);
+    TreeEntry *E = getTreeEntry(dyn_cast(Scalar));
     assert(E && "Invalid scalar");
     assert(!E->NeedToGather && "Extracting from a gather list");
@@ -3698,6 +3996,9 @@
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
+      if (!Entry->State.isOpcodeOrAlt(cast(Scalar)))
+        continue;
+
       Type *Ty = Scalar->getType();
       if (!Ty->isVoidTy()) {
 #ifndef NDEBUG
@@ -3705,7 +4006,8 @@
           LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
           // It is legal to replace users in the ignorelist by undef.
-          assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
+          assert((getTreeEntry(dyn_cast(U)) ||
+                  is_contained(UserIgnoreList, U)) &&
                  "Replacing out-of-tree value with undef");
         }
 #endif
@@ -3823,14 +4125,22 @@
   // Make sure that the scheduling region contains all
   // instructions of the bundle.
   for (Value *V : VL) {
-    if (!extendSchedulingRegion(V, S))
+    auto *I = dyn_cast(V);
+    assert(I && "bundle member must be an instruction");
+    if (!extendSchedulingRegion(I, S))
       return false;
   }
   for (Value *V : VL) {
-    ScheduleData *BundleMember = getScheduleData(V);
+    auto *I = cast(V);
+    ScheduleData *BundleMember = getInstScheduleData(I);
+    if (BundleMember->isPartOfBundle())
+      BundleMember = getScheduleData(I, S.getKey());
+    if (BundleMember->isPartOfBundle())
+      return false;
     assert(BundleMember &&
            "no ScheduleData for bundle member (maybe not in same basic block)");
+    assert(!BundleMember->isPartOfBundle() && "Already part of another bundle");
     if (BundleMember->IsScheduled) {
       // A bundle member was scheduled as single instruction before and now
       // needs to be scheduled as part of the bundle. We just get rid of the
@@ -3847,6 +4157,8 @@
       Bundle = BundleMember;
     }
     BundleMember->UnscheduledDepsInBundle = 0;
+    BundleMember->Opcode = S.getOpcode();
+    BundleMember->Parent = S.Parent;
     Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
     // Group the instructions to a bundle.
@@ -3890,18 +4202,30 @@
     }
   }
   if (!Bundle->isReady()) {
-    cancelScheduling(VL, S.OpValue);
+    cancelScheduling(S.OpValue, S.getKey());
+    // We have to clear all dependencies, since all values
+    // were calculated for the vectorized bundle.
+    for (auto *I = ScheduleStart; I != ScheduleEnd;
+         I = I->getNextNode()) {
+      doForAllOpcodes(I, [](ScheduleData *SD) {
+        SD->clearDependencies();
+      });
+    }
+    resetSchedule();
     return false;
   }
   return true;
 }

-void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL,
-                                                Value *OpValue) {
+void BoUpSLP::BlockScheduling::cancelScheduling(Value *OpValue,
+                                                std::pair Key) {
   if (isa(OpValue))
     return;
-
-  ScheduleData *Bundle = getScheduleData(OpValue);
+  auto *I = dyn_cast(OpValue);
+  if (!I)
+    return;
+  ScheduleData *Bundle = getScheduleData(I, Key)->FirstInBundle;
+  assert(Bundle && "Could not find bundle");
   LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
   assert(!Bundle->IsScheduled &&
          "Can't cancel bundle which is already scheduled");
@@ -3911,44 +4235,66 @@
   // Un-bundle: make single instructions out of the bundle.
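+  // Each member becomes a single-instruction bundle again: pseudo copies are
+  // dropped from the per-(Parent, Opcode) map, while real instructions with
+  // no remaining unscheduled dependencies go back to the ready list.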
ScheduleData *BundleMember = Bundle; while (BundleMember) { - assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links"); + assert(BundleMember->FirstInBundle == Bundle && "Corrupt bundle links"); + assert(BundleMember->Parent == Key.first && + BundleMember->Opcode == Key.second && "Corrupt bundle"); BundleMember->FirstInBundle = BundleMember; ScheduleData *Next = BundleMember->NextInBundle; BundleMember->NextInBundle = nullptr; BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps; - if (BundleMember->UnscheduledDepsInBundle == 0) { - ReadyInsts.insert(BundleMember); + if (BundleMember->isPseudo()) { + PseudoInstScheduleDataMap[BundleMember->getInst()].erase(Key); + BundleMember->Opcode = 0; + BundleMember->Parent = nullptr; + } else { + BundleMember->Opcode = 0; + BundleMember->Parent = nullptr; + if (BundleMember->UnscheduledDepsInBundle == 0) { + ReadyInsts.insert(BundleMember); + } } BundleMember = Next; } } -BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { - // Allocate a new ScheduleData for the instruction. +BoUpSLP::InstScheduleData * +BoUpSLP::BlockScheduling::allocateInstScheduleDataChunks() { + // Allocate a new InstScheduleData for the instruction. if (ChunkPos >= ChunkSize) { - ScheduleDataChunks.push_back(llvm::make_unique(ChunkSize)); + InstScheduleDataChunks.push_back( + llvm::make_unique(ChunkSize)); ChunkPos = 0; } - return &(ScheduleDataChunks.back()[ChunkPos++]); + return &(InstScheduleDataChunks.back()[ChunkPos++]); } -bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, +BoUpSLP::PseudoScheduleData * +BoUpSLP::BlockScheduling::allocatePseudoInstDataChunks() { + // Allocate a new PseudoScheduleData for the instruction. + if (PseudoChunkPos >= PseudoChunkSize) { + PseudoScheduleDataChunks.push_back( + llvm::make_unique(PseudoChunkSize)); + PseudoChunkPos = 0; + } + return &(PseudoScheduleDataChunks.back()[PseudoChunkPos++]); +} + +bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Instruction *I, const InstructionsState &S) { - if (getScheduleData(V, isOneOf(S, V))) + if (getScheduleData(I, S.getKey())) return true; - Instruction *I = dyn_cast(V); - assert(I && "bundle member must be an instruction"); assert(!isa(I) && "phi nodes don't need to be scheduled"); auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool { - ScheduleData *ISD = getScheduleData(I); + InstScheduleData *ISD = getInstScheduleData(I); if (!ISD) return false; assert(isInSchedulingRegion(ISD) && - "ScheduleData not in scheduling region"); - ScheduleData *SD = allocateScheduleDataChunks(); - SD->Inst = I; - SD->init(SchedulingRegionID, S.OpValue); - ExtraScheduleDataMap[I][S.OpValue] = SD; + "InstScheduleData not in scheduling region"); + if (ISD->isPartOfBundle()) { + PseudoScheduleData *PSD = allocatePseudoInstDataChunks(); + PSD->init(SchedulingRegionID, ISD, S.Parent, S.getOpcode()); + PseudoInstScheduleDataMap[I][S.getKey()] = PSD; + } return true; }; if (CheckSheduleForI(I)) @@ -3958,8 +4304,7 @@ initScheduleData(I, I->getNextNode(), nullptr, nullptr); ScheduleStart = I; ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a terminator?"); LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); return true; @@ -3981,8 +4326,7 @@ if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckSheduleForI(I); 
LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); return true; @@ -3994,8 +4338,7 @@ initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, nullptr); ScheduleEnd = I->getNextNode(); - if (isOneOf(S, I) != I) - CheckSheduleForI(I); + CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a terminator?"); LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); @@ -4009,21 +4352,20 @@ return true; } -void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, - Instruction *ToI, - ScheduleData *PrevLoadStore, - ScheduleData *NextLoadStore) { - ScheduleData *CurrentLoadStore = PrevLoadStore; +void BoUpSLP::BlockScheduling::initScheduleData( + Instruction *FromI, Instruction *ToI, InstScheduleData *PrevLoadStore, + InstScheduleData *NextLoadStore) { + InstScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { - ScheduleData *SD = ScheduleDataMap[I]; + InstScheduleData *SD = InstScheduleDataMap[I]; if (!SD) { - SD = allocateScheduleDataChunks(); - ScheduleDataMap[I] = SD; + SD = allocateInstScheduleDataChunks(); + InstScheduleDataMap[I] = SD; SD->Inst = I; } assert(!isInSchedulingRegion(SD) && - "new ScheduleData already in scheduling region"); - SD->init(SchedulingRegionID, I); + "new InstScheduleData already in scheduling region"); + SD->init(SchedulingRegionID); if (I->mayReadOrWriteMemory() && (!isa(I) || @@ -4058,8 +4400,13 @@ WorkList.pop_back(); ScheduleData *BundleMember = SD; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { assert(isInSchedulingRegion(BundleMember)); + assert(BundleMember->Opcode == Opcode && + BundleMember->Parent == Parent && "Corrupt bundle member"); + if (!BundleMember->hasValidDependencies()) { LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember @@ -4068,44 +4415,31 @@ BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. - if (BundleMember->OpValue != BundleMember->Inst) { - ScheduleData *UseSD = getScheduleData(BundleMember->Inst); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - for (User *U : BundleMember->Inst->users()) { - if (isa(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) - BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); - } - } else { - // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and - // eventually disable vectorization. + for (User *U : BundleMember->getInst()->users()) { + if (auto *I = dyn_cast(U)) { + doForAllOpcodes(I, [&BundleMember, &WorkList](ScheduleData *UseSD) { BundleMember->Dependencies++; - BundleMember->incrementUnscheduledDeps(1); - } + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + }); + } else { + // I'm not sure if this can ever happen. But we need to be safe. 
+        // This lets the instruction/bundle never be scheduled and
+        // eventually disable vectorization.
+        BundleMember->Dependencies++;
+        BundleMember->incrementUnscheduledDeps(1);
+      }
      }
      // Handle the memory dependencies.
      ScheduleData *DepDest = BundleMember->NextLoadStore;
      if (DepDest) {
-        Instruction *SrcInst = BundleMember->Inst;
+        Instruction *SrcInst = BundleMember->getInst();
         MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
-        bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+        bool SrcMayWrite = SrcInst->mayWriteToMemory();
         unsigned numAliased = 0;
         unsigned DistToSrc = 1;
@@ -4120,24 +4454,29 @@
           // It's important for the loop break condition (see below) to
           // check this limit even between two read-only instructions.
           if (DistToSrc >= MaxMemDepDistance ||
-              ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+              ((SrcMayWrite || DepDest->getInst()->mayWriteToMemory()) &&
                (numAliased >= AliasedCheckLimit ||
-                SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+                SLP->isAliased(SrcLoc, SrcInst, DepDest->getInst())))) {
             // We increment the counter only if the locations are aliased
             // (instead of counting all alias checks). This gives a better
             // balance between reduced runtime and accurate dependencies.
             numAliased++;
-            DepDest->MemoryDependencies.push_back(BundleMember);
-            BundleMember->Dependencies++;
-            ScheduleData *DestBundle = DepDest->FirstInBundle;
-            if (!DestBundle->IsScheduled) {
-              BundleMember->incrementUnscheduledDeps(1);
-            }
-            if (!DestBundle->hasValidDependencies()) {
-              WorkList.push_back(DestBundle);
-            }
+            // We don't want any duplicates in the set, to keep the
+            // dependencies correct.
+            doForAllOpcodes(DepDest->getInst(), [&BundleMember, &WorkList](
+                                                    ScheduleData *DepDest) {
+              DepDest->MemoryDependencies.push_back(BundleMember);
+              BundleMember->Dependencies++;
+              ScheduleData *DestBundle = DepDest->FirstInBundle;
+              if (!DestBundle->IsScheduled) {
+                BundleMember->incrementUnscheduledDeps(1);
+              }
+              if (!DestBundle->hasValidDependencies()) {
+                WorkList.push_back(DestBundle);
+              }
+            });
           }
           DepDest = DepDest->NextLoadStore;
@@ -4164,7 +4503,7 @@
     }
     if (InsertInReadyList && SD->isReady()) {
       ReadyInsts.push_back(SD);
-      LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
+      LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->getInst()
                         << "\n");
     }
   }
@@ -4176,7 +4515,7 @@
   for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
     doForAllOpcodes(I, [&](ScheduleData *SD) {
       assert(isInSchedulingRegion(SD) &&
-             "ScheduleData not in scheduling region");
+             "InstScheduleData not in scheduling region");
       SD->IsScheduled = false;
       SD->resetUnscheduledDeps();
     });
@@ -4184,6 +4523,56 @@
   ReadyInsts.clear();
 }

+void BoUpSLP::BlockScheduling::reorderBundles() {
+  SmallPtrSet Bundles;
+  DenseMap ReorderMap;
+  for (auto I = PseudoInstScheduleDataMap.begin(),
+            E = PseudoInstScheduleDataMap.end();
+       I != E; ++I) {
+    doForAllOpcodes(I->first, [&Bundles](ScheduleData *SD) {
+      if (SD->isPartOfBundle())
+        Bundles.insert(SD->FirstInBundle);
+    });
+  }
+  // Walk backward in the BB to discover the last instruction
+  // for a bundle.
+  for (auto I = BB->rbegin(), E = BB->rend(); (I != E && Bundles.size() > 0);
+       ++I) {
+    doForAllOpcodes(&*I, [&ReorderMap, &Bundles](ScheduleData *SD) {
+      if (SD->isPartOfBundle() && Bundles.count(SD->FirstInBundle) != 0) {
+        ReorderMap[SD->FirstInBundle] = SD;
+        Bundles.erase(SD->FirstInBundle);
+      }
+    });
+  }
+  // Swap the last scheduled instruction with the first one in the bundle.
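+  // The member that was placed last in the block becomes the new bundle head:
+  // every member's FirstInBundle is redirected to it and the NextInBundle
+  // links are rethreaded around the two swapped positions.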
+ for (auto I = ReorderMap.begin(), E = ReorderMap.end(); I != E; ++I) { + ScheduleData *FirstSD = I->first; + ScheduleData *LastSD = I->second; + SmallVector Bundle; + unsigned LastPos = 0; + // The first instruction in the bundle is already the last one scheduled. + if (FirstSD == LastSD) + continue; + ScheduleData *SD = FirstSD; + while (SD) { + if (SD == LastSD) + LastPos = Bundle.size(); + Bundle.push_back(SD); + SD = SD->NextInBundle; + } + std::swap(Bundle[0], Bundle[LastPos]); + for (ScheduleData *SD : Bundle) + SD->FirstInBundle = Bundle[0]; + Bundle[0]->NextInBundle = Bundle[1]; + Bundle[LastPos - 1]->NextInBundle = Bundle[LastPos]; + if (LastPos == Bundle.size() - 1) + Bundle[LastPos]->NextInBundle = nullptr; + else + Bundle[LastPos]->NextInBundle = Bundle[LastPos + 1]; + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -4210,7 +4599,9 @@ I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst) != nullptr) && + (getTreeEntry(SD->getInst(), + std::make_pair(SD->Parent, SD->Opcode)) != + nullptr) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { @@ -4231,21 +4622,36 @@ // Move the scheduled instruction(s) to their dedicated places, if not // there yet. ScheduleData *BundleMember = picked; + unsigned Opcode = BundleMember->Opcode; + Value *Parent = BundleMember->Parent; while (BundleMember) { - Instruction *pickedInst = BundleMember->Inst; - if (LastScheduledInst->getNextNode() != pickedInst) { - BS->BB->getInstList().remove(pickedInst); + assert(Opcode == BundleMember->Opcode && + Parent == BundleMember->Parent && "Corrupt bundle member"); + Instruction *PickedInst = BundleMember->getInst(); + if (LastScheduledInst->getNextNode() != PickedInst) { + BS->BB->getInstList().remove(PickedInst); BS->BB->getInstList().insert(LastScheduledInst->getIterator(), - pickedInst); + PickedInst); } - LastScheduledInst = pickedInst; + LastScheduledInst = PickedInst; BundleMember = BundleMember->NextInBundle; } - BS->schedule(picked, ReadyInsts); NumToSchedule--; } +#ifndef NDEBUG + if (NumToSchedule != 0) { + for (BasicBlock::iterator I = BS->BB->begin(), E = BS->BB->end(); I != E; + ++I) { + BS->doForAllOpcodes(&*I, [](ScheduleData *SD) { + if (SD->isSchedulingEntity() && SD->UnscheduledDepsInBundle != 0) + LLVM_DEBUG(dbgs() << "SLP: Failed to schedule: " << *SD << ".\n"); + }); + } + } +#endif assert(NumToSchedule == 0 && "could not schedule all instructions"); + BS->reorderBundles(); // Avoid duplicate scheduling of the block. BS->ScheduleStart = nullptr; @@ -4862,9 +5268,14 @@ // Check that all of the parts are scalar instructions of the same type, // we permit an alternate opcode via InstructionsState. 
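+  // The state is computed once for the whole list; any element that matches
+  // neither the main nor the alternate opcode makes the list non-vectorizable
+  // below.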
- InstructionsState S = getSameOpcode(VL); + InstructionsState S = getSameOpcode(VL[0], VL); if (!S.getOpcode()) return false; + for (Value *V : VL) { + auto *I = dyn_cast(V); + if (isOneOf(S, I) != I) + return false; + } Instruction *I0 = cast(S.OpValue); unsigned Sz = R.getVectorElementSize(I0); Index: test/Transforms/SLPVectorizer/X86/cancel_scheduling.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/cancel_scheduling.ll @@ -0,0 +1,215 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -S | FileCheck %s +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; This testcase shows the failure of scheduling bundles after calling +; cancelScheduling() in tryScheduleBundle() and not cleaning all +; dependencies. The dependency values are supposed to be cleared, +; since everything was calculated before we cancel the bundle. + +define dso_local void @fn1() local_unnamed_addr #0 { +; CHECK-LABEL: @fn1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 2 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 0 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 3 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 17 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX5]], align 2 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 16 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX6]], align 2 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 15 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX7]], align 2 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 12 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX8]], align 2 +; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 11 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX9]], align 2 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 10 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX10]], align 2 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 9 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX11]], align 2 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 8 +; CHECK-NEXT: store i16 7, i16* [[ARRAYIDX12]], align 2 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 7 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 6 +; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 5 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[ARRAYIDX2]] to <8 x i16>* +; CHECK-NEXT: store <8 x i16> , <8 x i16>* [[TMP0]], align 2 +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* inttoptr (i64 1 to i8*), align 1 +; CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 18 +; CHECK-NEXT: [[TMP2:%.*]] = lshr i8 [[TMP1]], 2 +; 
CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 19 +; CHECK-NEXT: [[TMP3:%.*]] = and i8 [[TMP1]], 2 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 20 +; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* inttoptr (i64 2 to i8*), align 2 +; CHECK-NEXT: [[TMP5:%.*]] = lshr i8 [[TMP4]], 4 +; CHECK-NEXT: [[ARRAYIDX34:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 21 +; CHECK-NEXT: [[TMP6:%.*]] = lshr i8 [[TMP4]], 1 +; CHECK-NEXT: [[ARRAYIDX39:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 22 +; CHECK-NEXT: [[ARRAYIDX44:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 23 +; CHECK-NEXT: [[TMP7:%.*]] = load i8, i8* inttoptr (i64 3 to i8*), align 1 +; CHECK-NEXT: [[TMP8:%.*]] = lshr i8 [[TMP7]], 3 +; CHECK-NEXT: [[ARRAYIDX49:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 24 +; CHECK-NEXT: [[ARRAYIDX54:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 25 +; CHECK-NEXT: [[TMP9:%.*]] = load i8, i8* inttoptr (i64 4 to i8*), align 4 +; CHECK-NEXT: [[TMP10:%.*]] = lshr i8 [[TMP9]], 4 +; CHECK-NEXT: [[ARRAYIDX59:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 26 +; CHECK-NEXT: [[TMP11:%.*]] = lshr i8 [[TMP9]], 1 +; CHECK-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 27 +; CHECK-NEXT: [[ARRAYIDX69:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 28 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, i8* inttoptr (i64 5 to i8*), align 1 +; CHECK-NEXT: [[TMP13:%.*]] = lshr i8 [[TMP12]], 3 +; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 29 +; CHECK-NEXT: [[ARRAYIDX79:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 30 +; CHECK-NEXT: [[TMP14:%.*]] = load i8, i8* inttoptr (i64 6 to i8*), align 2 +; CHECK-NEXT: [[ARRAYIDX83:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 31 +; CHECK-NEXT: [[TMP15:%.*]] = lshr i8 [[TMP14]], 2 +; CHECK-NEXT: [[ARRAYIDX88:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 32 +; CHECK-NEXT: [[TMP16:%.*]] = shl i8 [[TMP14]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = or i8 [[TMP3]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i8> undef, i8 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i8> [[TMP18]], i8 [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> [[TMP19]], i8 [[TMP17]], i32 2 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i8> [[TMP20]], i8 [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[TMP6]], i32 4 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[TMP4]], i32 5 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[TMP8]], i32 6 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[TMP7]], i32 7 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP25]], i8 [[TMP10]], i32 8 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[TMP11]], i32 9 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP27]], i8 [[TMP9]], i32 10 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[TMP13]], i32 11 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[TMP12]], i32 12 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[TMP14]], i32 13 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement 
<16 x i8> [[TMP31]], i8 [[TMP15]], i32 14 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[TMP16]], i32 15 +; CHECK-NEXT: [[TMP34:%.*]] = ashr <16 x i8> [[TMP33]], +; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i8> [[TMP33]], +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i8> [[TMP34]], <16 x i8> [[TMP35]], <16 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[TMP36]] to <16 x i16> +; CHECK-NEXT: [[TMP38:%.*]] = zext <16 x i8> [[TMP36]] to <16 x i16> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP37]], <16 x i16> [[TMP38]], <16 x i32> +; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 33 +; CHECK-NEXT: [[TMP40:%.*]] = bitcast i16* [[ARRAYIDX17]] to <16 x i16>* +; CHECK-NEXT: store <16 x i16> [[TMP39]], <16 x i16>* [[TMP40]], align 2 +; CHECK-NEXT: ret void +; +entry: + %arrayidx = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 2 + store i16 2, i16* %arrayidx, align 2 + %arrayidx1 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 1 + store i16 2, i16* %arrayidx1, align 2 + %arrayidx2 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 0 + store i16 2, i16* %arrayidx2, align 2 + %arrayidx3 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 4 + store i16 0, i16* %arrayidx3, align 2 + %arrayidx4 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 3 + store i16 0, i16* %arrayidx4, align 2 + %arrayidx5 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 17 + store i16 7, i16* %arrayidx5, align 2 + %arrayidx6 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 16 + store i16 7, i16* %arrayidx6, align 2 + %arrayidx7 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 15 + store i16 7, i16* %arrayidx7, align 2 + %arrayidx8 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 12 + store i16 7, i16* %arrayidx8, align 2 + %arrayidx9 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 11 + store i16 7, i16* %arrayidx9, align 2 + %arrayidx10 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 10 + store i16 7, i16* %arrayidx10, align 2 + %arrayidx11 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 9 + store i16 7, i16* %arrayidx11, align 2 + %arrayidx12 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 8 + store i16 7, i16* %arrayidx12, align 2 + %arrayidx13 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 7 + store i16 7, i16* %arrayidx13, align 2 + %arrayidx14 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 6 + store i16 7, i16* %arrayidx14, align 2 + %arrayidx15 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 5 + store i16 7, i16* %arrayidx15, align 2 + %0 = load i8, i8* inttoptr (i64 1 to i8*), align 1 + %1 = ashr i8 %0, 7 + %conv16 = sext i8 %1 to i16 + %arrayidx17 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 18 + store i16 %conv16, i16* %arrayidx17, align 2 + %2 = lshr i8 %0, 2 + %3 = and i8 %2, 7 + %conv20 = zext i8 %3 to i16 + %arrayidx21 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 19 + store i16 %conv20, i16* %arrayidx21, align 2 + %4 = and i8 %0, 2 + %arrayidx26 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 20 + %5 = or i8 %4, 1 + %conv29 = zext i8 %5 to i16 + store i16 %conv29, i16* %arrayidx26, align 2 + %6 = load i8, i8* inttoptr (i64 2 to i8*), align 2 + %7 = lshr i8 %6, 4 + %8 = and i8 %7, 7 + %conv33 = zext i8 %8 to 
i16 + %arrayidx34 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 21 + store i16 %conv33, i16* %arrayidx34, align 2 + %9 = lshr i8 %6, 1 + %10 = and i8 %9, 7 + %conv38 = zext i8 %10 to i16 + %arrayidx39 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 22 + store i16 %conv38, i16* %arrayidx39, align 2 + %11 = and i8 %6, 2 + %conv43 = zext i8 %11 to i16 + %arrayidx44 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 23 + store i16 %conv43, i16* %arrayidx44, align 2 + %12 = load i8, i8* inttoptr (i64 3 to i8*), align 1 + %13 = lshr i8 %12, 3 + %14 = and i8 %13, 7 + %conv48 = zext i8 %14 to i16 + %arrayidx49 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 24 + store i16 %conv48, i16* %arrayidx49, align 2 + %15 = and i8 %12, 7 + %conv53 = zext i8 %15 to i16 + %arrayidx54 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 25 + store i16 %conv53, i16* %arrayidx54, align 2 + %16 = load i8, i8* inttoptr (i64 4 to i8*), align 4 + %17 = lshr i8 %16, 4 + %18 = and i8 %17, 7 + %conv58 = zext i8 %18 to i16 + %arrayidx59 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 26 + store i16 %conv58, i16* %arrayidx59, align 2 + %19 = lshr i8 %16, 1 + %20 = and i8 %19, 7 + %conv63 = zext i8 %20 to i16 + %arrayidx64 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 27 + store i16 %conv63, i16* %arrayidx64, align 2 + %21 = and i8 %16, 2 + %conv68 = zext i8 %21 to i16 + %arrayidx69 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 28 + store i16 %conv68, i16* %arrayidx69, align 2 + %22 = load i8, i8* inttoptr (i64 5 to i8*), align 1 + %23 = lshr i8 %22, 3 + %24 = and i8 %23, 7 + %conv73 = zext i8 %24 to i16 + %arrayidx74 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 29 + store i16 %conv73, i16* %arrayidx74, align 2 + %25 = and i8 %22, 7 + %conv78 = zext i8 %25 to i16 + %arrayidx79 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 30 + store i16 %conv78, i16* %arrayidx79, align 2 + %26 = load i8, i8* inttoptr (i64 6 to i8*), align 2 + %27 = and i8 %26, 7 + %conv82 = zext i8 %27 to i16 + %arrayidx83 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 31 + store i16 %conv82, i16* %arrayidx83, align 2 + %28 = lshr i8 %26, 2 + %29 = and i8 %28, 7 + %conv87 = zext i8 %29 to i16 + %arrayidx88 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 32 + store i16 %conv87, i16* %arrayidx88, align 2 + %30 = shl i8 %26, 1 + %31 = and i8 %30, 6 + %conv91 = zext i8 %31 to i16 + %arrayidx92 = getelementptr inbounds [4 x i16], [4 x i16]* undef, i64 0, i64 33 + store i16 %conv91, i16* %arrayidx92, align 2 + ret void +} + +attributes #0 = { nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="broadwell" "target-features"="+adx,+aes,+avx,+avx2,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+invpcid,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prfchw,+rdrnd,+rdseed,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/Transforms/SLPVectorizer/X86/insert-after-multiple-bundle.ll =================================================================== --- /dev/null +++ 
test/Transforms/SLPVectorizer/X86/insert-after-multiple-bundle.ll @@ -0,0 +1,89 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -slp-vectorizer -slp-vectorizer -mcpu=bdver1 < %s | FileCheck %s + +; Function Attrs: nounwind uwtable +define dso_local void @fn1() local_unnamed_addr #0 { +; CHECK-LABEL: @fn1( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[TMP0:%.*]] = phi <4 x i32> [ undef, [[ENTRY:%.*]] ], [ [[TMP7:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* undef, align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> , i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], zeroinitializer +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shl nsw <4 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP5:%.*]] = sub nsw <4 x i32> undef, [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = icmp slt <4 x i32> [[TMP0]], [[TMP5]] +; CHECK-NEXT: [[TMP7]] = select <4 x i1> [[TMP6]], <4 x i32> [[TMP5]], <4 x i32> [[TMP0]] +; CHECK-NEXT: br label [[FOR_BODY]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %j.064 = phi i32 [ undef, %entry ], [ %spec.select, %for.body ] + %k.063 = phi i32 [ undef, %entry ], [ %k.1, %for.body ] + %l.062 = phi i32 [ undef, %entry ], [ %spec.select58, %for.body ] + %m.061 = phi i32 [ undef, %entry ], [ %m.1, %for.body ] + %conv = zext i8 undef to i32 + %mul = shl nuw nsw i32 %conv, 1 + %sub = sub nsw i32 undef, %mul + %mul4 = shl nuw nsw i32 %conv, 2 + %sub5 = sub nsw i32 undef, %mul4 + %conv8 = zext i8 undef to i32 + %0 = load i32, i32* undef, align 4 + %add = add nsw i32 %0, %conv8 + %mul11 = shl nsw i32 %add, 1 + %sub12 = sub nsw i32 undef, %mul11 + %mul19 = shl nsw i32 %add, 2 + %sub20 = sub nsw i32 undef, %mul19 + %cmp = icmp slt i32 %j.064, %sub + %spec.select = select i1 %cmp, i32 %sub, i32 %j.064 + %cmp22 = icmp slt i32 %k.063, %sub5 + %k.1 = select i1 %cmp22, i32 %sub5, i32 %k.063 + %cmp26 = icmp slt i32 %l.062, %sub12 + %spec.select58 = select i1 %cmp26, i32 %sub12, i32 %l.062 + %cmp30 = icmp slt i32 %m.061, %sub20 + %m.1 = select i1 %cmp30, i32 %sub20, i32 %m.061 + br label %for.body +} + +; Function Attrs: nounwind uwtable +define dso_local void @axis_to_quat() local_unnamed_addr #0 { +; CHECK-LABEL: @axis_to_quat( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CONV:%.*]] = fptrunc double undef to float +; CHECK-NEXT: [[MUL:%.*]] = fmul fast float undef, [[CONV]] +; CHECK-NEXT: store float [[MUL]], float* undef, align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* undef, i64 1 +; CHECK-NEXT: [[MUL2:%.*]] = fmul fast float undef, [[CONV]] +; CHECK-NEXT: store float [[MUL2]], float* [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* undef, i64 2 +; CHECK-NEXT: [[MUL4:%.*]] = fmul fast float undef, [[CONV]] +; CHECK-NEXT: store float [[MUL4]], float* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = tail call fast double @llvm.cos.f64(double 0x7FF8000000000000) +; CHECK-NEXT: [[CONV6:%.*]] = fptrunc double [[TMP0]] to float +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* undef, i64 3 +; CHECK-NEXT: store float [[CONV6]], float* [[ARRAYIDX7]], align 4 +; CHECK-NEXT: ret void +; +entry: + %conv = fptrunc double undef to float + %mul = fmul fast float undef, %conv + store float %mul, float* undef, align 4 + %arrayidx1 = getelementptr inbounds float, float* 
undef, i64 1 + %mul2 = fmul fast float undef, %conv + store float %mul2, float* %arrayidx1, align 4 + %arrayidx3 = getelementptr inbounds float, float* undef, i64 2 + %mul4 = fmul fast float undef, %conv + store float %mul4, float* %arrayidx3, align 4 + %0 = tail call fast double @llvm.cos.f64(double 0x7FF8000000000000) + %conv6 = fptrunc double %0 to float + %arrayidx7 = getelementptr inbounds float, float* undef, i64 3 + store float %conv6, float* %arrayidx7, align 4 + ret void +} + +; Function Attrs: nounwind readnone speculatable +declare double @llvm.cos.f64(double) #1 Index: test/Transforms/SLPVectorizer/X86/memory-dep.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/memory-dep.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -slp-vectorizer -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.anon.1.2.3.4.87 = type { [6 x [6 x i16]], [6 x [6 x i32]], [0 x [4 x [4 x i32]]] } + +@f = external dso_local local_unnamed_addr global %struct.anon.1.2.3.4.87, align 4 + +; Function Attrs: norecurse nounwind uwtable +define dso_local void @itrans() local_unnamed_addr #0 { +; CHECK-LABEL: @itrans( +; CHECK-NEXT: entry: +; CHECK-NEXT: store i32 undef, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 2), align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 undef, i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 undef, i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> , [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[TMP6]], undef +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> undef, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = shl <4 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = icmp slt <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = zext <4 x i1> [[TMP10]] to <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt <4 x i32> undef, [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = zext <4 x i1> [[TMP12]] to <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* bitcast (i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 0) to <4 x i32>*), align 4 +; CHECK-NEXT: ret void +; +entry: + %add8 = add nsw i32 undef, undef + store i32 undef, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 + %add15 = add nsw i32 undef, undef + %add26 = add nsw i32 %add8, 2 + %sub27 = sub i32 %add26, undef + %add33 = add nsw i32 %sub27, undef + %shl = shl i32 %add33, 6 + %cmp.i = icmp slt i32 %shl, 1 + %conv.i = zext i1 %cmp.i to i32 + %cmp1.i = icmp slt i32 undef, %conv.i + %conv2.i = zext i1 %cmp1.i to i32 + store i32 %conv2.i, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 0), align 4 + %add26.1 = add nsw i32 %add15, 2 + %sub27.1 = 
sub i32 %add26.1, undef + %add33.1 = add nsw i32 %sub27.1, undef + %shl.1 = shl i32 %add33.1, 6 + %cmp.i.1 = icmp slt i32 %shl.1, 1 + %conv.i.1 = zext i1 %cmp.i.1 to i32 + %cmp1.i.1 = icmp slt i32 undef, %conv.i.1 + %conv2.i.1 = zext i1 %cmp1.i.1 to i32 + store i32 %conv2.i.1, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 1), align 4 + %0 = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 2), align 4 + %add26.2 = add nsw i32 %0, 2 + %sub27.2 = sub i32 %add26.2, undef + %add33.2 = add nsw i32 %sub27.2, undef + %shl.2 = shl i32 %add33.2, 6 + %cmp.i.2 = icmp slt i32 %shl.2, 1 + %conv.i.2 = zext i1 %cmp.i.2 to i32 + %cmp1.i.2 = icmp slt i32 undef, %conv.i.2 + %conv2.i.2 = zext i1 %cmp1.i.2 to i32 + store i32 %conv2.i.2, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 2), align 4 + %1 = load i32, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 0, i64 3), align 4 + %add26.3 = add nsw i32 %1, 2 + %sub27.3 = sub i32 %add26.3, undef + %add33.3 = add nsw i32 %sub27.3, undef + %shl.3 = shl i32 %add33.3, 6 + %cmp.i.3 = icmp slt i32 %shl.3, 1 + %conv.i.3 = zext i1 %cmp.i.3 to i32 + %cmp1.i.3 = icmp slt i32 undef, %conv.i.3 + %conv2.i.3 = zext i1 %cmp1.i.3 to i32 + store i32 %conv2.i.3, i32* getelementptr inbounds (%struct.anon.1.2.3.4.87, %struct.anon.1.2.3.4.87* @f, i64 0, i32 1, i64 3, i64 3), align 4 + ret void +} Index: test/Transforms/SLPVectorizer/X86/pr35497.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/pr35497.ll +++ test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -12,20 +12,20 @@ define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 { ; CHECK-LABEL: @_ZN1C10SwitchModeEv( ; CHECK-NEXT: for.body.lr.ph.i: -; CHECK-NEXT: [[OR_1:%.*]] = or i64 undef, 1 -; CHECK-NEXT: store i64 [[OR_1]], i64* undef, align 8 +; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> undef, i64 [[BAR5]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: store i64 [[TMP2]], i64* undef, align 8 ; CHECK-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 -; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP1]], [[TMP4]] ; CHECK-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x 
i64>* -; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8 ; CHECK-NEXT: ret void ; for.body.lr.ph.i: Index: test/Transforms/SLPVectorizer/X86/rem-bundle.ll =================================================================== --- /dev/null +++ test/Transforms/SLPVectorizer/X86/rem-bundle.ll @@ -0,0 +1,36 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -slp-vectorizer -slp-vectorizer -mcpu=bdver1 < %s | FileCheck %s + +%struct.anon.0.1.2.3.20 = type { i32, i32, i32, i32 } + +; This testcase shows the failure of combining any remainder operation +; in a bundle with non-alternative operations. + +@b = external dso_local local_unnamed_addr global i32, align 4 +@c = external dso_local local_unnamed_addr global %struct.anon.0.1.2.3.20, align 4 + +; Function Attrs: norecurse nounwind uwtable +define dso_local void @fn1() local_unnamed_addr #0 { +; CHECK-LABEL: @fn1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* @b, align 4 +; CHECK-NEXT: store i32 [[TMP0]], i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 3), align 4 +; CHECK-NEXT: [[DIV:%.*]] = sdiv i32 undef, 600 +; CHECK-NEXT: store i32 [[DIV]], i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 2), align 4 +; CHECK-NEXT: [[DIV1:%.*]] = sdiv i32 undef, 60 +; CHECK-NEXT: store i32 [[DIV1]], i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 1), align 4 +; CHECK-NEXT: [[REM:%.*]] = srem i32 undef, 60 +; CHECK-NEXT: store i32 [[REM]], i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 0), align 4 +; CHECK-NEXT: ret void +; +entry: + %0 = load i32, i32* @b, align 4 + store i32 %0, i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 3), align 4 + %div = sdiv i32 undef, 600 + store i32 %div, i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 2), align 4 + %div1 = sdiv i32 undef, 60 + store i32 %div1, i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 1), align 4 + %rem = srem i32 undef, 60 + store i32 %rem, i32* getelementptr inbounds (%struct.anon.0.1.2.3.20, %struct.anon.0.1.2.3.20* @c, i64 0, i32 0), align 4 + ret void +} Index: test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -43,22 +43,16 @@ ; CHECK-LABEL: @add1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr 
inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -86,22 +80,16 @@ ; CHECK-LABEL: @sub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -205,22 +193,18 @@ ; CHECK-LABEL: @addsub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], 
align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -248,22 +232,18 @@ ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -291,22 +271,16 @@ ; CHECK-LABEL: @mul( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* 
[[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9 -; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -334,22 +308,16 @@ ; CHECK-LABEL: @shl0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -453,22 +421,16 @@ ; CHECK-LABEL: @add1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; 
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -496,22 +458,16 @@
; CHECK-LABEL: @sub0f(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -615,22 +571,18 @@
; CHECK-LABEL: @addsub0f(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -658,22 +610,18 @@
; CHECK-LABEL: @addsub1f(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -3.000000e+00>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -701,22 +649,16 @@
; CHECK-LABEL: @mulf(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
; CHECK-NEXT: ret void
;
entry:
@@ -825,22 +767,16 @@
; CHECK-LABEL: @sub0fn(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4
; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
-; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
; CHECK-NEXT: ret void
;
entry:
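; NOTE (editorial sketch, not part of the patch): the updated tests above all share the
; same shape. A plausible C-level source for a case like @shl0 is sketched below, where
; one lane is a plain copy that the copyable-element support appears to model as the
; surrounding binary op with its identity operand (here, shl by 0). The function name
; and restrict qualifiers mirror the IR above; the C form itself is an assumption.
;
;   void shl0(int *restrict dst, int *restrict src) {
;     dst[0] = src[0];       /* plain copy: assumed to be modeled as src[0] << 0 */
;     dst[1] = src[1] << 1;
;     dst[2] = src[2] << 2;
;     dst[3] = src[3] << 3;
;   }
;
; With that modeling, all four stores collapse into a single <4 x i32> shl whose constant
; operand is <i32 0, i32 1, i32 2, i32 3>, matching the updated checks.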