Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -162,65 +162,159 @@ case Instruction::Sub: return Instruction::Add; default: - return 0; + return Op; } } -///\returns bool representing if Opcode \p Op can be part -/// of an alternate sequence which can later be merged as -/// a ShuffleVector instruction. -static bool canCombineAsAltInst(unsigned Op) { - return Op == Instruction::FAdd || Op == Instruction::FSub || - Op == Instruction::Sub || Op == Instruction::Add; +/// true if the \p Value is odd, false otherwise. +static bool isOdd(unsigned Value) { + return Value & 1; } -/// \returns ShuffleVector instruction if instructions in \p VL have -/// alternate fadd,fsub / fsub,fadd/add,sub/sub,add sequence. -/// (i.e. e.g. opcodes of fadd,fsub,fadd,fsub...) -static unsigned isAltInst(ArrayRef VL) { - Instruction *I0 = dyn_cast(VL[0]); +/// Checks if the \p CheckedOpcode is equal \p Opcode or \p AltOpcode. +static bool sameOpcodeOrAlt(unsigned Opcode, unsigned AltOpcode, + unsigned CheckedOpcode) { + return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode; +} + +/// Chooses the correct key for scheduling data. If \p Op has the same (or +/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p +/// OpValue. +static Value *chooseKey(Value *OpValue, Value *Op) { + auto *I = dyn_cast(Op); + if (!I) + return OpValue; + auto *OpInst = cast(OpValue); + unsigned OpInstOpcode = OpInst->getOpcode(); + unsigned IOpcode = I->getOpcode(); + if (sameOpcodeOrAlt(OpInstOpcode, getAltOpcode(OpInstOpcode), IOpcode)) + return Op; + return OpValue; +} + +/// Checks if the \p Opcode can be considered as an operand of a (possibly) +/// binary operation \p I. +/// \returns The code of the binary operation of instruction \p I if the +/// instruction with \p Opcode can be considered as an operand of \p I with the +/// default value. +static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) { + assert(!sameOpcodeOrAlt(Opcode, getAltOpcode(Opcode), I->getOpcode())); + if (Opcode != Instruction::PHI && + (I->getType()->isIntegerTy() || I->hasUnsafeAlgebra()) && + isa(I)) + return I->getOpcode(); + return 0; +} + +namespace { +/// Contains data for the instructions going to be vectorized. +struct RawInstructionsData { + /// Main Opcode of the instructions going to be vectorized. + unsigned Opcode = 0; + /// Position of the first instruction with the \a Opcode. + unsigned OpcodePos = 0; + /// Need an additional analysis (if at least one of the instruction is not + /// same instruction kind as an instruction at OpcodePos position in the + /// list). + bool NeedAnalysis = false; + /// The list of instructions have some instructions with alternate opcodes. + bool HasAltOpcodes = false; +}; +} // namespace + +/// Checks the list of the vectorized instructions \p VL and returns info about +/// this list. +static RawInstructionsData getMainOpcode(ArrayRef VL) { + auto *I0 = dyn_cast(VL[0]); + if (!I0) + return {}; + RawInstructionsData Res; unsigned Opcode = I0->getOpcode(); unsigned AltOpcode = getAltOpcode(Opcode); - for (int i = 1, e = VL.size(); i < e; i++) { - Instruction *I = dyn_cast(VL[i]); - if (!I || I->getOpcode() != ((i & 1) ? AltOpcode : Opcode)) - return 0; + unsigned NewOpcodePos = 0; + for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) { + auto *I = dyn_cast(VL[Cnt]); + if (!I) + return {}; + if (!sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) { + if (unsigned NewOpcode = tryToRepresentAsInstArg(Opcode, I)) { + if (!Instruction::isBinaryOp(Opcode) || + !Instruction::isCommutative(Opcode)) { + NewOpcodePos = Cnt; + Opcode = NewOpcode; + AltOpcode = getAltOpcode(Opcode); + Res.NeedAnalysis = true; + } + } else if (tryToRepresentAsInstArg(I->getOpcode(), + cast(VL[NewOpcodePos]))) + Res.NeedAnalysis = true; + else + return {}; + } else if (Opcode != I->getOpcode()) { + Res.HasAltOpcodes = true; + if (Res.NeedAnalysis && isOdd(NewOpcodePos)) + std::swap(Opcode, AltOpcode); + } } - return Instruction::ShuffleVector; + Res.Opcode = Opcode; + Res.OpcodePos = NewOpcodePos; + return Res; } +namespace { +/// Main data required for vectorization of instructions. +struct InstructionsState { + /// The main opcode for the list of instructions. + unsigned Opcode = 0; + /// The very first instruction in the list with the main opcode. + Value *OpValue = nullptr; + /// Some of the instructions in the list have alternate opcodes. + bool IsAltShuffle = false; + InstructionsState() = default; + InstructionsState(unsigned Opcode, Value *OpValue, bool IsAltShuffle) + : Opcode(Opcode), OpValue(OpValue), IsAltShuffle(IsAltShuffle) {} +}; +} // namespace + /// \returns The opcode if all of the Instructions in \p VL have the same /// opcode, or zero. -static unsigned getSameOpcode(ArrayRef VL) { - Instruction *I0 = dyn_cast(VL[0]); - if (!I0) - return 0; - unsigned Opcode = I0->getOpcode(); - for (int i = 1, e = VL.size(); i < e; i++) { - Instruction *I = dyn_cast(VL[i]); - if (!I || Opcode != I->getOpcode()) { - if (canCombineAsAltInst(Opcode) && i == 1) - return isAltInst(VL); - return 0; +static InstructionsState getSameOpcode(ArrayRef VL) { + auto Res = getMainOpcode(VL); + unsigned Opcode = Res.Opcode; + if (!Res.NeedAnalysis && !Res.HasAltOpcodes) + return {Opcode, VL[Res.OpcodePos], /*IsAltShuffle=*/false}; + auto *OpInst = cast(VL[Res.OpcodePos]); + unsigned AltOpcode = getAltOpcode(Opcode); + for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { + auto *I = cast(VL[Cnt]); + unsigned InstOpcode = I->getOpcode(); + if (Res.NeedAnalysis && !sameOpcodeOrAlt(Opcode, AltOpcode, InstOpcode)) + if (tryToRepresentAsInstArg(InstOpcode, OpInst)) + InstOpcode = (Res.HasAltOpcodes && isOdd(Cnt)) ? AltOpcode : Opcode; + if ((Res.HasAltOpcodes && + InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) || + (!Res.HasAltOpcodes && InstOpcode != Opcode)) { + return {0, OpInst, /*IsAltShuffle=*/false}; } } - return Opcode; + return {Opcode, OpInst, Res.HasAltOpcodes}; } /// Get the intersection (logical and) of all of the potential IR flags /// of each scalar operation (VL) that will be converted into a vector (I). /// Flag set: NSW, NUW, exact, and all of fast-math. -static void propagateIRFlags(Value *I, ArrayRef VL) { +static void propagateIRFlags(Value *I, ArrayRef VL, Value *OpValue) { if (auto *VecOp = dyn_cast(I)) { - if (auto *Intersection = dyn_cast(VL[0])) { - // Intersection is initialized to the 0th scalar, - // so start counting from index '1'. - for (int i = 1, e = VL.size(); i < e; ++i) { - if (auto *Scalar = dyn_cast(VL[i])) - Intersection->andIRFlags(Scalar); - } - VecOp->copyIRFlags(Intersection); + auto *Intersection = cast(OpValue); + const unsigned Opcode = Intersection->getOpcode(); + for (auto *V : VL) { + auto *I = dyn_cast(V); + if (!I) + continue; + if (Opcode == I->getOpcode()) + Intersection->andIRFlags(V); } + VecOp->copyIRFlags(Intersection); } } @@ -236,10 +330,10 @@ } /// \returns True if Extract{Value,Element} instruction extracts element Idx. -static bool matchExtractIndex(Instruction *E, unsigned Idx, unsigned Opcode) { - assert(Opcode == Instruction::ExtractElement || - Opcode == Instruction::ExtractValue); - if (Opcode == Instruction::ExtractElement) { +static bool matchExtractIndex(Instruction *E, unsigned Idx) { + assert(E->getOpcode() == Instruction::ExtractElement || + E->getOpcode() == Instruction::ExtractValue); + if (E->getOpcode() == Instruction::ExtractElement) { ConstantInt *CI = dyn_cast(E->getOperand(1)); return CI && CI->getZExtValue() == Idx; } else { @@ -348,6 +442,7 @@ void deleteTree() { VectorizableTree.clear(); ScalarToTreeEntry.clear(); + ExtraScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); NumLoadsWantToKeepOrder = 0; @@ -408,7 +503,7 @@ /// \returns True if the ExtractElement/ExtractValue instructions in VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a vector). - bool canReuseExtract(ArrayRef VL, unsigned Opcode) const; + bool canReuseExtract(ArrayRef VL, Value *OpValue) const; /// Vectorize a single entry in the tree. Value *vectorizeTree(TreeEntry *E); @@ -418,7 +513,7 @@ /// \returns the pointer to the vectorized value if \p VL is already /// vectorized, or NULL. They may happen in cycles. - Value *alreadyVectorized(ArrayRef VL) const; + Value *alreadyVectorized(ArrayRef VL, Value *OpValue) const; /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. @@ -431,7 +526,7 @@ /// \brief Set the Builder insert point to one after the last instruction in /// the bundle - void setInsertPointAfterBundle(ArrayRef VL); + void setInsertPointAfterBundle(ArrayRef VL, Value *OpValue); /// \returns a vector from a collection of scalars in \p VL. Value *Gather(ArrayRef VL, VectorType *Ty); @@ -442,17 +537,16 @@ /// \reorder commutative operands in alt shuffle if they result in /// vectorized code. - void reorderAltShuffleOperands(ArrayRef VL, + void reorderAltShuffleOperands(unsigned Opcode, ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right); /// \reorder commutative operands to get better probability of /// generating vectorized code. - void reorderInputsAccordingToOpcode(ArrayRef VL, + void reorderInputsAccordingToOpcode(unsigned Opcode, ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right); struct TreeEntry { - TreeEntry() : Scalars(), VectorizedValue(nullptr), - NeedToGather(0) {} + TreeEntry() = default; /// \returns true if the scalars in VL are equal to this entry. bool isSame(ArrayRef VL) const { @@ -464,25 +558,44 @@ ValueList Scalars; /// The Scalars are vectorized into this value. It is initialized to Null. - Value *VectorizedValue; + Value *VectorizedValue = nullptr; /// Do we need to gather this sequence ? - bool NeedToGather; + bool NeedToGather = false; + + /// Info about instruction in this tree entry. + InstructionsState State; }; /// Create a new VectorizableTree entry. - TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized) { + TreeEntry *newTreeEntry(ArrayRef VL, bool Vectorized, + const InstructionsState &S) { + assert((!Vectorized || S.Opcode != 0) && + "Vectorized TreeEntry without opcode"); VectorizableTree.emplace_back(); int idx = VectorizableTree.size() - 1; TreeEntry *Last = &VectorizableTree[idx]; Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end()); Last->NeedToGather = !Vectorized; if (Vectorized) { + Last->State = S; + unsigned AltOpcode = getAltOpcode(S.Opcode); for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = idx; + unsigned RealOpcode = + (S.IsAltShuffle && isOdd(i)) ? AltOpcode : S.Opcode; + Value *Key = (cast(VL[i])->getOpcode() == RealOpcode) + ? VL[i] + : S.OpValue; + assert(!getTreeEntry(VL[i], Key) && "Scalar already in tree!"); + if (VL[i] == Key) + ScalarToTreeEntry[Key] = idx; + else + ExtraScalarToTreeEntry[VL[i]][Key] = idx; } } else { + Last->State.Opcode = 0; + Last->State.OpValue = VL[0]; + Last->State.IsAltShuffle = false; MustGather.insert(VL.begin(), VL.end()); } return Last; @@ -492,8 +605,38 @@ /// Holds all of the tree entries. std::vector VectorizableTree; + TreeEntry *getTreeEntry(Value *V) { + auto I = ScalarToTreeEntry.find(V); + if (I != ScalarToTreeEntry.end()) + return &VectorizableTree[I->second]; + return nullptr; + } + + const TreeEntry *getTreeEntry(Value *V) const { + auto I = ScalarToTreeEntry.find(V); + if (I != ScalarToTreeEntry.end()) + return &VectorizableTree[I->second]; + return nullptr; + } + + TreeEntry *getTreeEntry(Value *V, Value *OpValue) { + if (V == OpValue) + return getTreeEntry(V); + auto I = ExtraScalarToTreeEntry.find(V); + if (I != ExtraScalarToTreeEntry.end()) { + auto &STT = I->second; + auto STTI = STT.find(OpValue); + if (STTI != STT.end()) + return &VectorizableTree[STTI->second]; + } + return nullptr; + } + /// Maps a specific scalar to its tree entry. - SmallDenseMap ScalarToTreeEntry; + SmallDenseMap ScalarToTreeEntry; + + /// Maps a specific scalar to its tree entry(s) with leading scalar. + SmallDenseMap> ExtraScalarToTreeEntry; /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -583,9 +726,10 @@ : Inst(nullptr), FirstInBundle(nullptr), NextInBundle(nullptr), NextLoadStore(nullptr), SchedulingRegionID(0), SchedulingPriority(0), Dependencies(InvalidDeps), UnscheduledDeps(InvalidDeps), - UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false) {} + UnscheduledDepsInBundle(InvalidDeps), IsScheduled(false), + OpValue(nullptr) {} - void init(int BlockSchedulingRegionID) { + void init(int BlockSchedulingRegionID, Value *OpValue) { FirstInBundle = this; NextInBundle = nullptr; NextLoadStore = nullptr; @@ -593,6 +737,7 @@ SchedulingRegionID = BlockSchedulingRegionID; UnscheduledDepsInBundle = UnscheduledDeps; clearDependencies(); + this->OpValue = OpValue; } /// Returns true if the dependency information has been calculated. @@ -697,6 +842,9 @@ /// True if this instruction is scheduled (or considered as scheduled in the /// dry-run). bool IsScheduled; + + /// Opcode of the current instruction in the schedule data. + Value *OpValue; }; #ifndef NDEBUG @@ -747,6 +895,18 @@ return nullptr; } + ScheduleData *getScheduleData(Value *V, Value *Key) { + if (V == Key) + return getScheduleData(V); + auto I = ExtraScheduleDataMap.find(V); + if (I != ExtraScheduleDataMap.end()) { + ScheduleData *SD = I->second[Key]; + if (SD && SD->SchedulingRegionID == SchedulingRegionID) + return SD; + } + return nullptr; + } + bool isInSchedulingRegion(ScheduleData *SD) { return SD->SchedulingRegionID == SchedulingRegionID; } @@ -760,59 +920,83 @@ ScheduleData *BundleMember = SD; while (BundleMember) { - // Handle the def-use chain dependencies. - for (Use &U : BundleMember->Inst->operands()) { - ScheduleData *OpDef = getScheduleData(U.get()); - if (OpDef && OpDef->hasValidDependencies() && - OpDef->incrementUnscheduledDeps(-1) == 0) { - // There are no more unscheduled dependencies after decrementing, - // so we can put the dependent instruction into the ready list. - ScheduleData *DepBundle = OpDef->FirstInBundle; - assert(!DepBundle->IsScheduled && - "already scheduled bundle gets ready"); - ReadyList.insert(DepBundle); - DEBUG(dbgs() << "SLP: gets ready (def): " << *DepBundle << "\n"); + if (BundleMember->Inst == BundleMember->OpValue) { + // Handle the def-use chain dependencies. + for (Use &U : BundleMember->Inst->operands()) { + auto *I = dyn_cast(U.get()); + if (!I) + continue; + doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) { + if (OpDef && OpDef->hasValidDependencies() && + OpDef->incrementUnscheduledDeps(-1) == 0) { + // There are no more unscheduled dependencies after + // decrementing, + // so we can put the dependent instruction into the ready list. + ScheduleData *DepBundle = OpDef->FirstInBundle; + assert(!DepBundle->IsScheduled && + "already scheduled bundle gets ready"); + ReadyList.insert(DepBundle); + DEBUG(dbgs() + << "SLP: gets ready (def): " << *DepBundle << "\n"); + } + }); } - } - // Handle the memory dependencies. - for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { - if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { - // There are no more unscheduled dependencies after decrementing, - // so we can put the dependent instruction into the ready list. - ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; - assert(!DepBundle->IsScheduled && - "already scheduled bundle gets ready"); - ReadyList.insert(DepBundle); - DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle << "\n"); + // Handle the memory dependencies. + for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) { + if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) { + // There are no more unscheduled dependencies after decrementing, + // so we can put the dependent instruction into the ready list. + ScheduleData *DepBundle = MemoryDepSD->FirstInBundle; + assert(!DepBundle->IsScheduled && + "already scheduled bundle gets ready"); + ReadyList.insert(DepBundle); + DEBUG(dbgs() << "SLP: gets ready (mem): " << *DepBundle + << "\n"); + } } } BundleMember = BundleMember->NextInBundle; } } + void doForAllOpcodes(Value *V, + function_ref Action) { + if (ScheduleData *SD = getScheduleData(V)) + Action(SD); + auto I = ExtraScheduleDataMap.find(V); + if (I != ExtraScheduleDataMap.end()) + for (auto &P : I->second) + if (P.second->SchedulingRegionID == SchedulingRegionID) + Action(P.second); + } + /// Put all instructions into the ReadyList which are ready for scheduling. template void initialFillReadyList(ReadyListType &ReadyList) { for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - ScheduleData *SD = getScheduleData(I); - if (SD->isSchedulingEntity() && SD->isReady()) { - ReadyList.insert(SD); - DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); - } + doForAllOpcodes(I, [&ReadyList, I](ScheduleData *SD) { + if (SD->isSchedulingEntity() && SD->isReady()) { + ReadyList.insert(SD); + DEBUG(dbgs() << "SLP: initially in ready list: " << *I << "\n"); + } + }); } } /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// actually moved at this stage. - bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP); + bool tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, Value *OpValue); /// Un-bundles a group of instructions. - void cancelScheduling(ArrayRef VL); + void cancelScheduling(ArrayRef VL, Value *OpValue); + + /// Allocates schedule data chunk. + ScheduleData *allocateScheduleDataChunks(); /// Extends the scheduling region so that V is inside the region. /// \returns true if the region size is within the limit. - bool extendSchedulingRegion(Value *V); + bool extendSchedulingRegion(Value *V, Value *OpValue); /// Initialize the ScheduleData structures for new instructions in the /// scheduling region. @@ -845,6 +1029,10 @@ /// ScheduleData structures are recycled. DenseMap ScheduleDataMap; + /// Attaches ScheduleData to Instruction with the leading key. + DenseMap> + ExtraScheduleDataMap; + struct ReadyList : SmallVector { void insert(ScheduleData *SD) { push_back(SD); } }; @@ -932,12 +1120,18 @@ for (TreeEntry &EIdx : VectorizableTree) { TreeEntry *Entry = &EIdx; + // No need to handle users of gathered values. + if (Entry->NeedToGather) + continue; + // For each lane: + const unsigned Opcode = Entry->State.Opcode; + const unsigned AltOpcode = getAltOpcode(Opcode); for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - // No need to handle users of gathered values. - if (Entry->NeedToGather) + if (!sameOpcodeOrAlt(Opcode, AltOpcode, + cast(Scalar)->getOpcode())) continue; for (User *U : Scalar->users()) { @@ -948,9 +1142,7 @@ continue; // Skip in-tree scalars that become vectors - if (ScalarToTreeEntry.count(U)) { - int Idx = ScalarToTreeEntry[U]; - TreeEntry *UseEntry = &VectorizableTree[Idx]; + if (TreeEntry *UseEntry = getTreeEntry(U)) { Value *UseScalar = UseEntry->Scalars[0]; // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will @@ -959,7 +1151,7 @@ !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U << ".\n"); - assert(!VectorizableTree[Idx].NeedToGather && "Bad state"); + assert(!UseEntry->NeedToGather && "Bad state"); continue; } } @@ -976,45 +1168,66 @@ } } +static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) { + switch(Opcode) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Or: + case Instruction::Xor: + return ConstantInt::getNullValue(Ty); + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return ConstantInt::get(Ty, /*V=*/1); + case Instruction::FAdd: + case Instruction::FSub: + return ConstantFP::get(Ty, /*V=*/0.0); + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + return ConstantFP::get(Ty, /*V=*/1.0); + case Instruction::And: + return ConstantInt::getAllOnesValue(Ty); + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + return ConstantInt::getNullValue(Type::getInt32Ty(Ty->getContext())); + default: + break; + } + llvm_unreachable("unknown binop for default constant value"); +} void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth) { - bool isAltShuffle = false; assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); + InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, S); return; } // Don't handle vectors. - if (VL[0]->getType()->isVectorTy()) { + if (S.OpValue->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, S); return; } - if (StoreInst *SI = dyn_cast(VL[0])) + if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, S); return; } - unsigned Opcode = getSameOpcode(VL); - - // Check that this shuffle vector refers to the alternate - // sequence of opcodes. - if (Opcode == Instruction::ShuffleVector) { - Instruction *I0 = dyn_cast(VL[0]); - unsigned Op = I0->getOpcode(); - if (Op != Instruction::ShuffleVector) - isAltShuffle = true; - } // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) { + if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) { DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, S); return; } @@ -1026,33 +1239,37 @@ if (EphValues.count(VL[i])) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, S); return; } } // Check if this is a duplicate of another entry. - if (ScalarToTreeEntry.count(VL[0])) { - int Idx = ScalarToTreeEntry[VL[0]]; - TreeEntry *E = &VectorizableTree[Idx]; + if (TreeEntry *E = getTreeEntry(S.OpValue)) { for (unsigned i = 0, e = VL.size(); i != e; ++i) { DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n"); if (E->Scalars[i] != VL[i]) { DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, S); return; } } - DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n"); + DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue << ".\n"); return; } // Check that none of the instructions in the bundle are already in the tree. + unsigned AltOpcode = getAltOpcode(S.Opcode); for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (ScalarToTreeEntry.count(VL[i])) { + unsigned RealOpcode = (S.IsAltShuffle && isOdd(i)) ? AltOpcode : S.Opcode; + auto *I = dyn_cast(VL[i]); + if (!I) + continue; + Value *Key = (I->getOpcode() == RealOpcode) ? I : S.OpValue; + if (getTreeEntry(I, Key)) { DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, S); return; } } @@ -1062,21 +1279,21 @@ for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, S); return; } } // Check that all of the users of the scalars that we want to vectorize are // schedulable. - Instruction *VL0 = cast(VL[0]); - BasicBlock *BB = cast(VL0)->getParent(); + auto *VL0 = cast(S.OpValue); + BasicBlock *BB = VL0->getParent(); if (!DT->isReachableFromEntry(BB)) { // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, S); return; } @@ -1085,7 +1302,7 @@ for (unsigned j = i+1; j < e; ++j) if (VL[i] == VL[j]) { DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, S); return; } @@ -1095,17 +1312,17 @@ } BlockScheduling &BS = *BSRef.get(); - if (!BS.tryScheduleBundle(VL, this)) { + if (!BS.tryScheduleBundle(VL, this, S.OpValue)) { DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n"); - assert((!BS.getScheduleData(VL[0]) || - !BS.getScheduleData(VL[0])->isPartOfBundle()) && + assert((!BS.getScheduleData(VL0) || + !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false); + newTreeEntry(VL, false, S); return; } DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); - switch (Opcode) { + switch (S.IsAltShuffle ? Instruction::ShuffleVector : S.Opcode) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -1116,13 +1333,13 @@ cast(VL[j])->getIncomingValueForBlock(PH->getIncomingBlock(i))); if (Term) { DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, S); DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1138,13 +1355,13 @@ } case Instruction::ExtractValue: case Instruction::ExtractElement: { - bool Reuse = canReuseExtract(VL, Opcode); + bool Reuse = canReuseExtract(VL, VL0); if (Reuse) { DEBUG(dbgs() << "SLP: Reusing extract sequence.\n"); } else { - BS.cancelScheduling(VL); + BS.cancelScheduling(VL, VL0); } - newTreeEntry(VL, Reuse); + newTreeEntry(VL, Reuse, S); return; } case Instruction::Load: { @@ -1155,12 +1372,12 @@ // loading/storing it as an i8 struct. If we vectorize loads/stores from // such a struct we read/write packed bits disagreeing with the // unvectorized version. - Type *ScalarTy = VL[0]->getType(); + Type *ScalarTy = VL0->getType(); if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1170,8 +1387,8 @@ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) { LoadInst *L = cast(VL[i]); if (!L->isSimple()) { - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1193,7 +1410,7 @@ if (Consecutive) { ++NumLoadsWantToKeepOrder; - newTreeEntry(VL, true); + newTreeEntry(VL, true, S); DEBUG(dbgs() << "SLP: added a vector of loads.\n"); return; } @@ -1207,8 +1424,8 @@ break; } - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); if (ReverseConsecutive) { ++NumLoadsWantToChangeOrder; @@ -1234,13 +1451,13 @@ for (unsigned i = 0; i < VL.size(); ++i) { Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, S); DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1257,19 +1474,19 @@ case Instruction::FCmp: { // Check that all of the compares have the same predicate. CmpInst::Predicate P0 = cast(VL0)->getPredicate(); - Type *ComparedTy = cast(VL[0])->getOperand(0)->getType(); + Type *ComparedTy = VL0->getOperand(0)->getType(); for (unsigned i = 1, e = VL.size(); i < e; ++i) { CmpInst *Cmp = cast(VL[i]); if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, S); DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1301,14 +1518,14 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: { - newTreeEntry(VL, true); + newTreeEntry(VL, true, S); DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to // have the same opcode. if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right); + reorderInputsAccordingToOpcode(S.Opcode, VL, Left, Right); buildTree_rec(Left, Depth + 1); buildTree_rec(Right, Depth + 1); return; @@ -1317,10 +1534,20 @@ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *V : VL) { + auto *I = cast(V); + Value *Operand; + if (I->getOpcode() != S.Opcode) { + assert(Instruction::isBinaryOp(S.Opcode)); + Operand = isOdd(i) + ? getDefaultConstantForOpcode(S.Opcode, I->getType()) + : V; + } else + Operand = I->getOperand(i); + Operands.push_back(Operand); + } - buildTree_rec(Operands, Depth+1); + buildTree_rec(Operands, Depth + 1); } return; } @@ -1329,21 +1556,21 @@ for (unsigned j = 0; j < VL.size(); ++j) { if (cast(VL[j])->getNumOperands() != 2) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); return; } } // We can't combine several GEPs into one vector if they operate on // different types. - Type *Ty0 = cast(VL0)->getOperand(0)->getType(); + Type *Ty0 = VL0->getOperand(0)->getType(); for (unsigned j = 0; j < VL.size(); ++j) { Type *CurTy = cast(VL[j])->getOperand(0)->getType(); if (Ty0 != CurTy) { DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); return; } } @@ -1354,13 +1581,13 @@ if (!isa(Op)) { DEBUG( dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, S); DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1376,13 +1603,13 @@ // Check if the stores are consecutive or of we need to swizzle them. for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true); + newTreeEntry(VL, true, S); DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1394,13 +1621,13 @@ } case Instruction::Call: { // Check if the calls are all to the same vectorizable intrinsic. - CallInst *CI = cast(VL[0]); + CallInst *CI = cast(VL0); // Check if this is an Intrinsic call or something that can be // represented by an intrinsic call Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1413,8 +1640,8 @@ if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1424,8 +1651,8 @@ if (hasVectorInstrinsicScalarOpd(ID, 1)) { Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument "<< A1I<<"!=" << A1J << "\n"); @@ -1437,15 +1664,15 @@ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(), CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true); + newTreeEntry(VL, true, S); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -1460,19 +1687,19 @@ case Instruction::ShuffleVector: { // If this is not an alternate sequence of opcode like add-sub // then do not vectorize this instruction. - if (!isAltShuffle) { - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + if (!S.IsAltShuffle) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true); + newTreeEntry(VL, true, S); DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; - reorderAltShuffleOperands(VL, Left, Right); + reorderAltShuffleOperands(S.Opcode, VL, Left, Right); buildTree_rec(Left, Depth + 1); buildTree_rec(Right, Depth + 1); return; @@ -1481,16 +1708,26 @@ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *V : VL) { + auto *I = cast(V); + Value *Operand; + if (!sameOpcodeOrAlt(S.Opcode, AltOpcode, I->getOpcode())) { + assert(Instruction::isBinaryOp(S.Opcode)); + Operand = isOdd(i) + ? getDefaultConstantForOpcode(S.Opcode, I->getType()) + : V; + } else + Operand = I->getOperand(i); + Operands.push_back(Operand); + } buildTree_rec(Operands, Depth + 1); } return; } default: - BS.cancelScheduling(VL); - newTreeEntry(VL, false); + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, false, S); DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -1521,19 +1758,18 @@ return N; } -bool BoUpSLP::canReuseExtract(ArrayRef VL, unsigned Opcode) const { - assert(Opcode == Instruction::ExtractElement || - Opcode == Instruction::ExtractValue); - assert(Opcode == getSameOpcode(VL) && "Invalid opcode"); +bool BoUpSLP::canReuseExtract(ArrayRef VL, Value *OpValue) const { + Instruction *E0 = cast(OpValue); + assert(E0->getOpcode() == Instruction::ExtractElement || + E0->getOpcode() == Instruction::ExtractValue); + assert(E0->getOpcode() == getSameOpcode(VL).Opcode && "Invalid opcode"); // Check if all of the extracts come from the same vector and from the // correct offset. - Value *VL0 = VL[0]; - Instruction *E0 = cast(VL0); Value *Vec = E0->getOperand(0); // We have to extract from a vector/aggregate with the same number of elements. unsigned NElts; - if (Opcode == Instruction::ExtractValue) { + if (E0->getOpcode() == Instruction::ExtractValue) { const DataLayout &DL = E0->getModule()->getDataLayout(); NElts = canMapToVector(Vec->getType(), DL); if (!NElts) @@ -1550,14 +1786,11 @@ return false; // Check that all of the indices extract from the correct offset. - if (!matchExtractIndex(E0, 0, Opcode)) - return false; - - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - Instruction *E = cast(VL[i]); - if (!matchExtractIndex(E, i, Opcode)) + for (unsigned I = 0, E = VL.size(); I < E; ++I) { + Instruction *Inst = cast(VL[I]); + if (!matchExtractIndex(Inst, I)) return false; - if (E->getOperand(0) != Vec) + if (Inst->getOperand(0) != Vec) return false; } @@ -1567,16 +1800,17 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef VL = E->Scalars; - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) + Type *ScalarTy = E->State.OpValue->getType(); + if (StoreInst *SI = dyn_cast(E->State.OpValue)) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); // If we have computed a smaller type for the expression, update VecTy so // that the costs will be accurate. - if (MinBWs.count(VL[0])) + if (MinBWs.count(E->State.OpValue)) VecTy = VectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + IntegerType::get(F->getContext(), MinBWs[E->State.OpValue].first), + VL.size()); if (E->NeedToGather) { if (allConstant(VL)) @@ -1586,16 +1820,16 @@ } return getGatherCost(E->Scalars); } - unsigned Opcode = getSameOpcode(VL); - assert(Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast(VL[0]); - switch (Opcode) { + assert(E->State.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + Instruction *VL0 = cast(E->State.OpValue); + switch (E->State.IsAltShuffle ? Instruction::ShuffleVector + : E->State.Opcode) { case Instruction::PHI: { return 0; } case Instruction::ExtractValue: case Instruction::ExtractElement: { - if (canReuseExtract(VL, Opcode)) { + if (canReuseExtract(VL, E->State.OpValue)) { int DeadCost = 0; for (unsigned i = 0, e = VL.size(); i < e; ++i) { Instruction *E = cast(VL[i]); @@ -1636,8 +1870,9 @@ // Calculate the cost of this instruction. VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * - TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty()); - int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy); + TTI->getCmpSelInstrCost(E->State.Opcode, ScalarTy, + Builder.getInt1Ty()); + int VecCost = TTI->getCmpSelInstrCost(E->State.Opcode, VecTy, MaskTy); return VecCost - ScalarCost; } case Instruction::Add: @@ -1674,32 +1909,33 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. - ConstantInt *CInt = nullptr; - for (unsigned i = 0; i < VL.size(); ++i) { - const Instruction *I = cast(VL[i]); - if (!isa(I->getOperand(1))) { - Op2VK = TargetTransformInfo::OK_AnyValue; - break; - } - if (i == 0) { - CInt = cast(I->getOperand(1)); - continue; + if (auto *CInt = dyn_cast(VL0->getOperand(1))) { + const unsigned Opcode = E->State.Opcode; + for (auto *V : VL) { + auto *I = cast(V); + if (I == VL0 || Opcode != I->getOpcode()) + continue; + if (!isa(I->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_AnyValue; + break; + } + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && + CInt != cast(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } + // FIXME: Currently cost of model modification for division by power of + // 2 is handled for X86 and AArch64. Add support for other targets. if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && - CInt != cast(I->getOperand(1))) - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; - } - // FIXME: Currently cost of model modification for division by power of - // 2 is handled for X86 and AArch64. Add support for other targets. - if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt && - CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_PowerOf2; + CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_PowerOf2; + } else + Op2VK = TargetTransformInfo::OK_AnyValue; int ScalarCost = VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, - Op2VK, Op1VP, Op2VP); - int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, - Op1VP, Op2VP); + TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy, + Op1VK, Op2VK, Op1VP, Op2VP); + int VecCost = TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK, + Op2VK, Op1VP, Op2VP); return VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -1766,23 +2002,18 @@ TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = TargetTransformInfo::OK_AnyValue; - int ScalarCost = 0; - int VecCost = 0; - for (Value *i : VL) { - Instruction *I = cast(i); - if (!I) - break; - ScalarCost += - TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy, Op1VK, Op2VK); - } + unsigned AltOpcode = getAltOpcode(E->State.Opcode); + int ScalarCost = + TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy, Op1VK, Op2VK) * + VL.size() / 2; + ScalarCost += + TTI->getArithmeticInstrCost(AltOpcode, ScalarTy, Op1VK, Op2VK) * + VL.size() / 2; // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. - Instruction *I0 = cast(VL[0]); - VecCost = - TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy, Op1VK, Op2VK); - Instruction *I1 = cast(VL[1]); - VecCost += - TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy, Op1VK, Op2VK); + int VecCost = + TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK, Op2VK); + VecCost += TTI->getArithmeticInstrCost(AltOpcode, VecTy, Op1VK, Op2VK); VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Alternate, VecTy, 0); return VecCost - ScalarCost; @@ -1849,7 +2080,7 @@ Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast(N.Scalars[0]); + Instruction *Inst = dyn_cast(N.State.OpValue); if (!Inst) continue; @@ -1861,7 +2092,7 @@ // Update LiveValues. LiveValues.erase(PrevInst); for (auto &J : PrevInst->operands()) { - if (isa(&*J) && ScalarToTreeEntry.count(&*J)) + if (isa(&*J) && getTreeEntry(&*J)) LiveValues.insert(cast(&*J)); } @@ -1909,7 +2140,7 @@ for (TreeEntry &TE : VectorizableTree) { int C = getEntryCost(&TE); DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle that starts with " - << *TE.Scalars[0] << ".\n"); + << *TE.State.OpValue << ".\n"); Cost += C; } @@ -1930,7 +2161,7 @@ // extend the extracted value back to the original type. Here, we account // for the extract and the added cost of the sign extend if needed. auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth); - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].State.OpValue; if (MinBWs.count(ScalarRoot)) { auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first); auto Extend = @@ -1979,13 +2210,20 @@ // load a[3] + load b[3] // Reordering the second load b[1] load a[1] would allow us to vectorize this // code. -void BoUpSLP::reorderAltShuffleOperands(ArrayRef VL, +void BoUpSLP::reorderAltShuffleOperands(unsigned Opcode, ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right) { // Push left and right operands of binary operation into Left and Right - for (Value *i : VL) { - Left.push_back(cast(i)->getOperand(0)); - Right.push_back(cast(i)->getOperand(1)); + unsigned AltOpcode = getAltOpcode(Opcode); + for (Value *V : VL) { + auto *I = cast(V); + if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) { + Left.push_back(I->getOperand(0)); + Right.push_back(I->getOperand(1)); + } else { + Left.push_back(I); + Right.push_back(getDefaultConstantForOpcode(Opcode, I->getType())); + } } // Reorder if we have a commutative operation and consecutive access @@ -2030,34 +2268,37 @@ // The vectorizer is trying to either have all elements one side being // instruction with the same opcode to enable further vectorization, or having // a splat to lower the vectorizing cost. -static bool shouldReorderOperands(int i, Instruction &I, - SmallVectorImpl &Left, - SmallVectorImpl &Right, - bool AllSameOpcodeLeft, - bool AllSameOpcodeRight, bool SplatLeft, - bool SplatRight) { - Value *VLeft = I.getOperand(0); - Value *VRight = I.getOperand(1); +static bool shouldReorderOperands( + int Idx, unsigned Opcode, Instruction &I, ArrayRef Left, + ArrayRef Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, + bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { + if (I.getOpcode() == Opcode) { + VLeft = I.getOperand(0); + VRight = I.getOperand(1); + } else { + VLeft = &I; + VRight = getDefaultConstantForOpcode(Opcode, I.getType()); + } // If we have "SplatRight", try to see if commuting is needed to preserve it. if (SplatRight) { - if (VRight == Right[i - 1]) + if (VRight == Right[Idx - 1]) // Preserve SplatRight return false; - if (VLeft == Right[i - 1]) { + if (VLeft == Right[Idx - 1]) { // Commuting would preserve SplatRight, but we don't want to break // SplatLeft either, i.e. preserve the original order if possible. // (FIXME: why do we care?) - if (SplatLeft && VLeft == Left[i - 1]) + if (SplatLeft && VLeft == Left[Idx - 1]) return false; return true; } } // Symmetrically handle Right side. if (SplatLeft) { - if (VLeft == Left[i - 1]) + if (VLeft == Left[Idx - 1]) // Preserve SplatLeft return false; - if (VRight == Left[i - 1]) + if (VRight == Left[Idx - 1]) return true; } @@ -2067,7 +2308,7 @@ // If we have "AllSameOpcodeRight", try to see if the left operands preserves // it and not the right, in this case we want to commute. if (AllSameOpcodeRight) { - unsigned RightPrevOpcode = cast(Right[i - 1])->getOpcode(); + unsigned RightPrevOpcode = cast(Right[Idx - 1])->getOpcode(); if (IRight && RightPrevOpcode == IRight->getOpcode()) // Do not commute, a match on the right preserves AllSameOpcodeRight return false; @@ -2077,14 +2318,14 @@ // AllSameOpcodeLeft, i.e. preserve the original order if possible. // (FIXME: why do we care?) if (AllSameOpcodeLeft && ILeft && - cast(Left[i - 1])->getOpcode() == ILeft->getOpcode()) + cast(Left[Idx - 1])->getOpcode() == ILeft->getOpcode()) return false; return true; } } // Symmetrically handle Left side. if (AllSameOpcodeLeft) { - unsigned LeftPrevOpcode = cast(Left[i - 1])->getOpcode(); + unsigned LeftPrevOpcode = cast(Left[Idx - 1])->getOpcode(); if (ILeft && LeftPrevOpcode == ILeft->getOpcode()) return false; if (IRight && LeftPrevOpcode == IRight->getOpcode()) @@ -2093,15 +2334,24 @@ return false; } -void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, +void BoUpSLP::reorderInputsAccordingToOpcode(unsigned Opcode, + ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right) { if (VL.size()) { // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. - auto VLeft = cast(VL[0])->getOperand(0); - auto VRight = cast(VL[0])->getOperand(1); + auto *I = cast(VL[0]); + Value *VLeft; + Value *VRight; + if (I->getOpcode() == Opcode) { + VLeft = I->getOperand(0); + VRight = I->getOperand(1); + } else { + VLeft = I; + VRight = getDefaultConstantForOpcode(Opcode, I->getType()); + } if (!isa(VRight) && isa(VLeft)) // Favor having instruction to the right. FIXME: why? std::swap(VLeft, VRight); @@ -2118,16 +2368,21 @@ for (unsigned i = 1, e = VL.size(); i != e; ++i) { Instruction *I = cast(VL[i]); - assert(I->isCommutative() && "Can only process commutative instruction"); + assert(((I->getOpcode() == Opcode && I->isCommutative()) || + (I->getOpcode() != Opcode && Instruction::isCommutative(Opcode))) && + "Can only process commutative instruction"); // Commute to favor either a splat or maximizing having the same opcodes on // one side. - if (shouldReorderOperands(i, *I, Left, Right, AllSameOpcodeLeft, - AllSameOpcodeRight, SplatLeft, SplatRight)) { - Left.push_back(I->getOperand(1)); - Right.push_back(I->getOperand(0)); + Value *VLeft; + Value *VRight; + if (shouldReorderOperands(i, Opcode, *I, Left, Right, AllSameOpcodeLeft, + AllSameOpcodeRight, SplatLeft, SplatRight, VLeft, + VRight)) { + Left.push_back(VRight); + Right.push_back(VLeft); } else { - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); + Left.push_back(VLeft); + Right.push_back(VRight); } // Update Splat* and AllSameOpcode* after the insertion. SplatRight = SplatRight && (Right[i - 1] == Right[i]); @@ -2180,14 +2435,18 @@ } } -void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL) { +void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL, Value *OpValue) { // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. - auto *Front = cast(VL.front()); + auto *Front = cast(OpValue); auto *BB = Front->getParent(); - assert(all_of(make_range(VL.begin(), VL.end()), [&](Value *V) -> bool { - return cast(V)->getParent() == BB; + const unsigned Opcode = cast(OpValue)->getOpcode(); + const unsigned AltOpcode = getAltOpcode(Opcode); + assert(all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { + return !sameOpcodeOrAlt(Opcode, AltOpcode, + cast(V)->getOpcode()) || + cast(V)->getParent() == BB; })); // The last instruction in the bundle in program order. @@ -2198,10 +2457,12 @@ // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { - auto *Bundle = BlocksSchedules[BB]->getScheduleData(VL.back()); + auto *Bundle = + BlocksSchedules[BB]->getScheduleData(chooseKey(OpValue, VL.back())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) - LastInst = Bundle->Inst; + if (Bundle->OpValue == Bundle->Inst) + LastInst = Bundle->Inst; } // LastInst can still be null at this point if there's either not an entry @@ -2225,7 +2486,7 @@ if (!LastInst) { SmallPtrSet Bundle(VL.begin(), VL.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { - if (Bundle.erase(&I)) + if (Bundle.erase(&I) && sameOpcodeOrAlt(Opcode, AltOpcode, I.getOpcode())) LastInst = &I; if (Bundle.empty()) break; @@ -2248,9 +2509,7 @@ CSEBlocks.insert(Insrt->getParent()); // Add to our 'need-to-extract' list. - if (ScalarToTreeEntry.count(VL[i])) { - int Idx = ScalarToTreeEntry[VL[i]]; - TreeEntry *E = &VectorizableTree[Idx]; + if (TreeEntry *E = getTreeEntry(VL[i])) { // Find which lane we need to extract. int FoundLane = -1; for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) { @@ -2269,12 +2528,8 @@ return Vec; } -Value *BoUpSLP::alreadyVectorized(ArrayRef VL) const { - SmallDenseMap::const_iterator Entry - = ScalarToTreeEntry.find(VL[0]); - if (Entry != ScalarToTreeEntry.end()) { - int Idx = Entry->second; - const TreeEntry *En = &VectorizableTree[Idx]; +Value *BoUpSLP::alreadyVectorized(ArrayRef VL, Value *OpValue) const { + if (const TreeEntry *En = getTreeEntry(OpValue)) { if (En->isSame(VL) && En->VectorizedValue) return En->VectorizedValue; } @@ -2282,15 +2537,16 @@ } Value *BoUpSLP::vectorizeTree(ArrayRef VL) { - if (ScalarToTreeEntry.count(VL[0])) { - int Idx = ScalarToTreeEntry[VL[0]]; - TreeEntry *E = &VectorizableTree[Idx]; - if (E->isSame(VL)) - return vectorizeTree(E); + InstructionsState S = getSameOpcode(VL); + if (S.Opcode) { + if (TreeEntry *E = getTreeEntry(S.OpValue)) { + if (E->isSame(VL)) + return vectorizeTree(E); + } } - Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) + Type *ScalarTy = S.OpValue->getType(); + if (StoreInst *SI = dyn_cast(S.OpValue)) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); @@ -2301,26 +2557,25 @@ IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { - DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + DEBUG(dbgs() << "SLP: Diamond merged for " << *E->State.OpValue << ".\n"); return E->VectorizedValue; } - Instruction *VL0 = cast(E->Scalars[0]); + Instruction *VL0 = cast(E->State.OpValue); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); auto *V = Gather(E->Scalars, VecTy); E->VectorizedValue = V; return V; } - unsigned Opcode = getSameOpcode(E->Scalars); - - switch (Opcode) { + switch (E->State.IsAltShuffle ? + Instruction::ShuffleVector : E->State.Opcode) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI()); @@ -2357,18 +2612,18 @@ } case Instruction::ExtractElement: { - if (canReuseExtract(E->Scalars, Instruction::ExtractElement)) { + if (canReuseExtract(E->Scalars, VL0)) { Value *V = VL0->getOperand(0); E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); auto *V = Gather(E->Scalars, VecTy); E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { - if (canReuseExtract(E->Scalars, Instruction::ExtractValue)) { + if (canReuseExtract(E->Scalars, VL0)) { LoadInst *LI = cast(VL0->getOperand(0)); Builder.SetInsertPoint(LI); PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); @@ -2377,7 +2632,7 @@ E->VectorizedValue = V; return propagateMetadata(V, E->Scalars); } - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); auto *V = Gather(E->Scalars, VecTy); E->VectorizedValue = V; return V; @@ -2398,11 +2653,11 @@ for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); Value *InVec = vectorizeTree(INVL); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; CastInst *CI = dyn_cast(VL0); @@ -2419,23 +2674,23 @@ RHSV.push_back(cast(V)->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); Value *L = vectorizeTree(LHSV); Value *R = vectorizeTree(RHSV); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; - if (Opcode == Instruction::FCmp) + if (E->State.Opcode == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); E->VectorizedValue = V; - propagateIRFlags(E->VectorizedValue, E->Scalars); + propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; return V; } @@ -2447,13 +2702,13 @@ FalseVec.push_back(cast(V)->getOperand(2)); } - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); Value *Cond = vectorizeTree(CondVec); Value *True = vectorizeTree(TrueVec); Value *False = vectorizeTree(FalseVec); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; Value *V = Builder.CreateSelect(Cond, True, False); @@ -2481,25 +2736,33 @@ case Instruction::Xor: { ValueList LHSVL, RHSVL; if (isa(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(E->Scalars, LHSVL, RHSVL); + reorderInputsAccordingToOpcode(E->State.Opcode, E->Scalars, LHSVL, + RHSVL); else for (Value *V : E->Scalars) { - LHSVL.push_back(cast(V)->getOperand(0)); - RHSVL.push_back(cast(V)->getOperand(1)); + auto *I = cast(V); + if (I->getOpcode() == E->State.Opcode) { + LHSVL.push_back(I->getOperand(0)); + RHSVL.push_back(I->getOperand(1)); + } else { + LHSVL.push_back(V); + RHSVL.push_back( + getDefaultConstantForOpcode(E->State.Opcode, I->getType())); + } } - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; - BinaryOperator *BinOp = cast(VL0); - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), LHS, RHS); + Value *V = Builder.CreateBinOp( + static_cast(E->State.Opcode), LHS, RHS); E->VectorizedValue = V; - propagateIRFlags(E->VectorizedValue, E->Scalars); + propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; if (Instruction *I = dyn_cast(V)) @@ -2510,7 +2773,7 @@ case Instruction::Load: { // Loads are inserted at the head of the tree because we don't want to // sink them all the way down past store instructions. - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); @@ -2522,9 +2785,9 @@ // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the // future. - if (ScalarToTreeEntry.count(LI->getPointerOperand())) - ExternalUses.push_back( - ExternalUser(LI->getPointerOperand(), cast(VecPtr), 0)); + Value *PO = LI->getPointerOperand(); + if (getTreeEntry(PO)) + ExternalUses.push_back(ExternalUser(PO, cast(VecPtr), 0)); unsigned Alignment = LI->getAlignment(); LI = Builder.CreateLoad(VecPtr); @@ -2545,7 +2808,7 @@ for (Value *V : E->Scalars) ValueOp.push_back(cast(V)->getValueOperand()); - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); Value *VecValue = vectorizeTree(ValueOp); Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), @@ -2555,9 +2818,9 @@ // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the // future. - if (ScalarToTreeEntry.count(SI->getPointerOperand())) - ExternalUses.push_back( - ExternalUser(SI->getPointerOperand(), cast(VecPtr), 0)); + Value *PO = SI->getPointerOperand(); + if (getTreeEntry(PO)) + ExternalUses.push_back(ExternalUser(PO, cast(VecPtr), 0)); if (!Alignment) { Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); @@ -2568,7 +2831,7 @@ return propagateMetadata(S, E->Scalars); } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); ValueList Op0VL; for (Value *V : E->Scalars) @@ -2599,7 +2862,7 @@ } case Instruction::Call: { CallInst *CI = cast(VL0); - setInsertPointAfterBundle(E->Scalars); + setInsertPointAfterBundle(E->Scalars, VL0); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; @@ -2612,7 +2875,7 @@ // ctlz,cttz and powi are special intrinsics whose second argument is // a scalar. This argument should not be vectorized. if (hasVectorInstrinsicScalarOpd(IID, 1) && j == 1) { - CallInst *CEI = cast(E->Scalars[0]); + CallInst *CEI = cast(VL0); ScalarArg = CEI->getArgOperand(j); OpVecs.push_back(CEI->getArgOperand(j)); continue; @@ -2638,34 +2901,35 @@ // The scalar argument uses an in-tree scalar so we add the new vectorized // call to ExternalUses list to make sure that an extract will be // generated in the future. - if (ScalarArg && ScalarToTreeEntry.count(ScalarArg)) + if (ScalarArg && getTreeEntry(ScalarArg)) ExternalUses.push_back(ExternalUser(ScalarArg, cast(V), 0)); E->VectorizedValue = V; - propagateIRFlags(E->VectorizedValue, E->Scalars); + propagateIRFlags(E->VectorizedValue, E->Scalars, VL0); ++NumVectorInstructions; return V; } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(isa(VL0) && "Invalid Shuffle Vector Operand"); - reorderAltShuffleOperands(E->Scalars, LHSVL, RHSVL); - setInsertPointAfterBundle(E->Scalars); + assert(Instruction::isBinaryOp(E->State.Opcode) && + "Invalid Shuffle Vector Operand"); + reorderAltShuffleOperands(E->State.Opcode, E->Scalars, LHSVL, RHSVL); + setInsertPointAfterBundle(E->Scalars, VL0); Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); - if (Value *V = alreadyVectorized(E->Scalars)) + if (Value *V = alreadyVectorized(E->Scalars, VL0)) return V; // Create a vector of LHS op1 RHS - BinaryOperator *BinOp0 = cast(VL0); - Value *V0 = Builder.CreateBinOp(BinOp0->getOpcode(), LHS, RHS); + Value *V0 = Builder.CreateBinOp( + static_cast(E->State.Opcode), LHS, RHS); + unsigned AltOpcode = getAltOpcode(E->State.Opcode); // Create a vector of LHS op2 RHS - Instruction *VL1 = cast(E->Scalars[1]); - BinaryOperator *BinOp1 = cast(VL1); - Value *V1 = Builder.CreateBinOp(BinOp1->getOpcode(), LHS, RHS); + Value *V1 = Builder.CreateBinOp( + static_cast(AltOpcode), LHS, RHS); // Create shuffle to take alternate operations from the vector. // Also, gather up odd and even scalar ops to propagate IR flags to @@ -2674,7 +2938,7 @@ unsigned e = E->Scalars.size(); SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { - if (i & 1) { + if (isOdd(i)) { Mask[i] = Builder.getInt32(e + i); OddScalars.push_back(E->Scalars[i]); } else { @@ -2684,8 +2948,12 @@ } Value *ShuffleMask = ConstantVector::get(Mask); - propagateIRFlags(V0, EvenScalars); - propagateIRFlags(V1, OddScalars); + InstructionsState S = getSameOpcode(EvenScalars); + assert(!S.IsAltShuffle); + propagateIRFlags(V0, EvenScalars, S.OpValue); + S = getSameOpcode(OddScalars); + assert(!S.IsAltShuffle); + propagateIRFlags(V1, OddScalars, S.OpValue); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); E->VectorizedValue = V; @@ -2714,7 +2982,7 @@ // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].State.OpValue; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); @@ -2746,10 +3014,9 @@ // has multiple uses of the same value. if (!is_contained(Scalar->users(), User)) continue; - assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar"); + TreeEntry *E = getTreeEntry(Scalar); + assert(E && "Invalid scalar"); - int Idx = ScalarToTreeEntry[Scalar]; - TreeEntry *E = &VectorizableTree[Idx]; assert(!E->NeedToGather && "Extracting from a gather list"); Value *Vec = E->VectorizedValue; @@ -2798,14 +3065,21 @@ for (TreeEntry &EIdx : VectorizableTree) { TreeEntry *Entry = &EIdx; + // No need to handle users of gathered values. + if (Entry->NeedToGather) + continue; + + assert(Entry->VectorizedValue && "Can't find vectorizable value"); + // For each lane: + const unsigned Opcode = Entry->State.Opcode; + const unsigned AltOpcode = getAltOpcode(Opcode); for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - // No need to handle users of gathered values. - if (Entry->NeedToGather) - continue; - assert(Entry->VectorizedValue && "Can't find vectorizable value"); + if (!sameOpcodeOrAlt(Opcode, AltOpcode, + cast(Scalar)->getOpcode())) + continue; Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { @@ -2813,9 +3087,8 @@ for (User *U : Scalar->users()) { DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n"); - assert((ScalarToTreeEntry.count(U) || - // It is legal to replace users in the ignorelist by undef. - is_contained(UserIgnoreList, U)) && + // It is legal to replace users in the ignorelist by undef. + assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) && "Replacing out-of-tree value with undef"); } #endif @@ -2920,8 +3193,8 @@ // Groups the instructions to a bundle (which is then a single scheduling entity) // and schedules instructions until the bundle gets ready. bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, - BoUpSLP *SLP) { - if (isa(VL[0])) + BoUpSLP *SLP, Value *OpValue) { + if (isa(OpValue)) return true; // Initialize the instruction bundle. @@ -2929,17 +3202,17 @@ ScheduleData *PrevInBundle = nullptr; ScheduleData *Bundle = nullptr; bool ReSchedule = false; - DEBUG(dbgs() << "SLP: bundle: " << *VL[0] << "\n"); + DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n"); // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { - if (!extendSchedulingRegion(V)) + if (!extendSchedulingRegion(V, OpValue)) return false; } for (Value *V : VL) { - ScheduleData *BundleMember = getScheduleData(V); + ScheduleData *BundleMember = getScheduleData(V, chooseKey(OpValue, V)); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); if (BundleMember->IsScheduled) { @@ -2971,8 +3244,9 @@ // It is seldom that this needs to be done a second time after adding the // initial bundle to the region. for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - ScheduleData *SD = getScheduleData(I); - SD->clearDependencies(); + doForAllOpcodes(I, [](ScheduleData *SD) { + SD->clearDependencies(); + }); } ReSchedule = true; } @@ -3000,17 +3274,18 @@ } } if (!Bundle->isReady()) { - cancelScheduling(VL); + cancelScheduling(VL, OpValue); return false; } return true; } -void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL) { - if (isa(VL[0])) +void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef VL, + Value *OpValue) { + if (isa(OpValue)) return; - ScheduleData *Bundle = getScheduleData(VL[0]); + ScheduleData *Bundle = getScheduleData(OpValue)->FirstInBundle; DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); @@ -3032,17 +3307,44 @@ } } -bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V) { - if (getScheduleData(V)) +BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() { + // Allocate a new ScheduleData for the instruction. + if (ChunkPos >= ChunkSize) { + ScheduleDataChunks.push_back(llvm::make_unique(ChunkSize)); + ChunkPos = 0; + } + return &(ScheduleDataChunks.back()[ChunkPos++]); +} + +bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V, + Value *OpValue) { + if (getScheduleData(V, chooseKey(OpValue, V))) return true; Instruction *I = dyn_cast(V); assert(I && "bundle member must be an instruction"); assert(!isa(I) && "phi nodes don't need to be scheduled"); + auto &&CheckSheduleForI = [this, OpValue](Instruction *I) -> bool { + if (ScheduleData *ISD = getScheduleData(I)) { + assert(isInSchedulingRegion(ISD) && + "new ScheduleData already in scheduling region"); + (void)ISD; + ScheduleData *SD = allocateScheduleDataChunks(); + SD->Inst = I; + SD->init(SchedulingRegionID, OpValue); + ExtraScheduleDataMap[I][OpValue] = SD; + return true; + } + return false; + }; + if (CheckSheduleForI(I)) + return true; if (!ScheduleStart) { // It's the first instruction in the new region. initScheduleData(I, I->getNextNode(), nullptr, nullptr); ScheduleStart = I; ScheduleEnd = I->getNextNode(); + if (chooseKey(OpValue, I) != I) + (void)CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n"); return true; @@ -3064,6 +3366,8 @@ if (&*UpIter == I) { initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion); ScheduleStart = I; + if (chooseKey(OpValue, I) != I) + (void)CheckSheduleForI(I); DEBUG(dbgs() << "SLP: extend schedule region start to " << *I << "\n"); return true; } @@ -3074,6 +3378,8 @@ initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion, nullptr); ScheduleEnd = I->getNextNode(); + if (chooseKey(OpValue, I) != I) + (void)CheckSheduleForI(I); assert(ScheduleEnd && "tried to vectorize a TerminatorInst?"); DEBUG(dbgs() << "SLP: extend schedule region end to " << *I << "\n"); return true; @@ -3094,19 +3400,13 @@ for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { ScheduleData *SD = ScheduleDataMap[I]; if (!SD) { - // Allocate a new ScheduleData for the instruction. - if (ChunkPos >= ChunkSize) { - ScheduleDataChunks.push_back( - llvm::make_unique(ChunkSize)); - ChunkPos = 0; - } - SD = &(ScheduleDataChunks.back()[ChunkPos++]); + SD = allocateScheduleDataChunks(); ScheduleDataMap[I] = SD; SD->Inst = I; } assert(!isInSchedulingRegion(SD) && "new ScheduleData already in scheduling region"); - SD->init(SchedulingRegionID); + SD->init(SchedulingRegionID, I); if (I->mayReadOrWriteMemory()) { // Update the linked list of memory accessing instructions. @@ -3147,26 +3447,36 @@ BundleMember->Dependencies = 0; BundleMember->resetUnscheduledDeps(); - // Handle def-use chain dependencies. - for (User *U : BundleMember->Inst->users()) { - if (isa(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); + if (BundleMember->OpValue != BundleMember->Inst) { + ScheduleData *UseSD = getScheduleData(BundleMember->Inst); + if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { + BundleMember->Dependencies++; + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + } + } else { + // Handle def-use chain dependencies. + for (User *U : BundleMember->Inst->users()) { + if (isa(U)) { + ScheduleData *UseSD = getScheduleData(U); + if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { + BundleMember->Dependencies++; + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); } + } else { + // I'm not sure if this can ever happen. But we need to be safe. + // This lets the instruction/bundle never be scheduled and + // eventually disable vectorization. + BundleMember->Dependencies++; + BundleMember->incrementUnscheduledDeps(1); } - } else { - // I'm not sure if this can ever happen. But we need to be safe. - // This lets the instruction/bundle never be scheduled and - // eventually disable vectorization. - BundleMember->Dependencies++; - BundleMember->incrementUnscheduledDeps(1); } } @@ -3243,10 +3553,11 @@ assert(ScheduleStart && "tried to reset schedule on block which has not been scheduled"); for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - ScheduleData *SD = getScheduleData(I); - assert(isInSchedulingRegion(SD)); - SD->IsScheduled = false; - SD->resetUnscheduledDeps(); + doForAllOpcodes(I, [this](ScheduleData *SD) { + assert(isInSchedulingRegion(SD)); + SD->IsScheduled = false; + SD->resetUnscheduledDeps(); + }); } ReadyInsts.clear(); } @@ -3276,15 +3587,17 @@ int NumToSchedule = 0; for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd; I = I->getNextNode()) { - ScheduleData *SD = BS->getScheduleData(I); - assert( - SD->isPartOfBundle() == (ScalarToTreeEntry.count(SD->Inst) != 0) && - "scheduler and vectorizer have different opinion on what is a bundle"); - SD->FirstInBundle->SchedulingPriority = Idx++; - if (SD->isSchedulingEntity()) { - BS->calculateDependencies(SD, false, this); - NumToSchedule++; - } + BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { + assert(SD->isPartOfBundle() == + (getTreeEntry(SD->Inst, SD->OpValue) != nullptr) && + "scheduler and vectorizer have different opinion on what is a " + "bundle"); + SD->FirstInBundle->SchedulingPriority = Idx++; + if (SD->isSchedulingEntity()) { + BS->calculateDependencies(SD, false, this); + NumToSchedule++; + } + }); } BS->initialFillReadyList(ReadyInsts); @@ -3300,12 +3613,14 @@ ScheduleData *BundleMember = picked; while (BundleMember) { Instruction *pickedInst = BundleMember->Inst; - if (LastScheduledInst->getNextNode() != pickedInst) { + bool SameOpcode = pickedInst == BundleMember->OpValue; + if (SameOpcode && LastScheduledInst->getNextNode() != pickedInst) { BS->BB->getInstList().remove(pickedInst); BS->BB->getInstList().insert(LastScheduledInst->getIterator(), pickedInst); } - LastScheduledInst = pickedInst; + if (SameOpcode) + LastScheduledInst = pickedInst; BundleMember = BundleMember->NextInBundle; } Index: test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -43,22 +43,16 @@ ; CHECK-LABEL: @add1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -86,22 +80,16 @@ ; CHECK-LABEL: @sub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -205,22 +193,18 @@ ; CHECK-LABEL: @addsub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -248,22 +232,18 @@ ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -291,22 +271,16 @@ ; CHECK-LABEL: @mul( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9 -; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -334,22 +308,16 @@ ; CHECK-LABEL: @shl0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -453,22 +421,16 @@ ; CHECK-LABEL: @add1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -496,22 +458,16 @@ ; CHECK-LABEL: @sub0f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -615,22 +571,18 @@ ; CHECK-LABEL: @addsub0f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -658,22 +610,18 @@ ; CHECK-LABEL: @addsub1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -701,22 +649,16 @@ ; CHECK-LABEL: @mulf( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 -; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -825,22 +767,16 @@ ; CHECK-LABEL: @sub0fn( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: