Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -333,7 +333,7 @@
   case Instruction::Sub:
     return Instruction::Add;
   default:
-    return 0;
+    return Op;
   }
 }
 
@@ -346,6 +346,23 @@
   return Opcode == CheckedOpcode || AltOpcode == CheckedOpcode;
 }
 
+/// Checks if the \p Opcode can be considered as an operand of a (possibly)
+/// binary operation \p I.
+/// \returns The opcode of the binary operation \p I if the instruction with
+/// \p Opcode can be treated as an operand of \p I (with a default constant
+/// supplied for the missing operand), 0 otherwise.
+static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) {
+  assert(!sameOpcodeOrAlt(Opcode, getAltOpcode(Opcode), I->getOpcode()) &&
+         "Invalid Opcode");
+  if (Opcode != Instruction::PHI && isa<BinaryOperator>(I) &&
+      I->getOpcode() != Instruction::SRem &&
+      I->getOpcode() != Instruction::URem &&
+      I->getOpcode() != Instruction::FRem &&
+      (I->getType()->isIntegerTy() || cast<FPMathOperator>(I)->isFast()))
+    return I->getOpcode();
+  return 0;
+}
+
 /// Chooses the correct key for scheduling data. If \p Op has the same (or
 /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
 /// OpValue.
@@ -367,7 +384,12 @@
 struct RawInstructionsData {
   /// Main Opcode of the instructions going to be vectorized.
   unsigned Opcode = 0;
-
+  /// Position of the first instruction with the \a Opcode.
+  unsigned OpcodePos = 0;
+  /// Additional analysis is needed (at least one instruction in the list is
+  /// not of the same kind as the instruction at position OpcodePos).
+  bool NeedAnalysis = false;
   /// The list of instructions has some instructions with alternate opcodes.
   bool HasAltOpcodes = false;
 };
@@ -382,16 +404,39 @@
     return {};
   RawInstructionsData Res;
   unsigned Opcode = I0->getOpcode();
+  unsigned AltOpcode = getAltOpcode(Opcode);
+  unsigned NewOpcodePos = 0;
   // Walk through the list of the vectorized instructions
   // in order to check its structure described by RawInstructionsData.
   for (unsigned Cnt = 0, E = VL.size(); Cnt != E; ++Cnt) {
     auto *I = dyn_cast<Instruction>(VL[Cnt]);
     if (!I)
       return {};
-    if (Opcode != I->getOpcode())
-      Res.HasAltOpcodes = true;
+    if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
+      if (Opcode != I->getOpcode()) {
+        Res.HasAltOpcodes = true;
+        if (Res.NeedAnalysis && isOdd(NewOpcodePos))
+          std::swap(Opcode, AltOpcode);
+      }
+      continue;
+    }
+    if (unsigned NewOpcode = tryToRepresentAsInstArg(Opcode, I)) {
+      if (!Instruction::isBinaryOp(Opcode) ||
+          !Instruction::isCommutative(Opcode)) {
+        NewOpcodePos = Cnt;
+        Opcode = NewOpcode;
+        AltOpcode = getAltOpcode(Opcode);
+        Res.NeedAnalysis = true;
+      }
+    } else if (tryToRepresentAsInstArg(I->getOpcode(),
+                                       cast<Instruction>(VL[NewOpcodePos]))) {
+      Res.NeedAnalysis = true;
+    } else {
+      return {};
+    }
   }
   Res.Opcode = Opcode;
+  Res.OpcodePos = NewOpcodePos;
   return Res;
 }
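
For reviewers unfamiliar with the copyable-elements scheme that getMainOpcode()/tryToRepresentAsInstArg() implement: a lane whose instruction does not match the bundle's opcode can still join the bundle if it can be rewritten as "BundleOp(lane, identity)". A minimal standalone C++ sketch of that lane classification follows; the names and the reduced opcode set are illustrative assumptions, not the patch's API:

    // Standalone sketch (illustrative only, not part of the patch).
    #include <cassert>
    #include <vector>

    enum Opcode { Add, Sub, Mul, Load };
    enum LaneKind { MainLane, AlternateLane, PseudoLane, RejectLane };

    // Add/Sub form an alternating pair, as getAltOpcode() does in the patch.
    Opcode altOpcode(Opcode Op) {
      return Op == Add ? Sub : Op == Sub ? Add : Op;
    }

    // A lane that is not a binary operation of the bundle's kind can still be
    // modeled as "BundleOp(lane, identity)", making it a "pseudo" member.
    LaneKind classify(Opcode BundleOp, Opcode Lane) {
      if (Lane == BundleOp)
        return MainLane;
      if (Lane == altOpcode(BundleOp))
        return AlternateLane;
      if (BundleOp == Add || BundleOp == Sub || BundleOp == Mul)
        return PseudoLane; // representable with a default constant operand
      return RejectLane;
    }

    int main() {
      std::vector<Opcode> Bundle = {Add, Sub, Load, Add};
      assert(classify(Add, Bundle[1]) == AlternateLane);
      assert(classify(Add, Bundle[2]) == PseudoLane);
      return 0;
    }
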
@@ -408,9 +453,14 @@
   /// Some of the instructions in the list have alternate opcodes.
   bool IsAltShuffle = false;
 
+  /// Some of the instructions in the list have non-alternate opcodes.
+  bool IsNonAlt = false;
+
   InstructionsState() = default;
-  InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle)
-      : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle) {}
+  InstructionsState(Value *OpValue, unsigned Opcode, bool IsAltShuffle,
+                    bool IsNonAlt)
+      : OpValue(OpValue), Opcode(Opcode), IsAltShuffle(IsAltShuffle),
+        IsNonAlt(IsNonAlt) {}
 };
 
 } // end anonymous namespace
@@ -421,23 +471,31 @@
 static InstructionsState getSameOpcode(ArrayRef<Value *> VL) {
   auto Res = getMainOpcode(VL);
   unsigned Opcode = Res.Opcode;
-  if (!Res.HasAltOpcodes)
-    return InstructionsState(VL[0], Opcode, false);
-  auto *OpInst = cast<Instruction>(VL[0]);
   unsigned AltOpcode = getAltOpcode(Opcode);
+  bool IsNonAlt = llvm::any_of(VL, [Opcode, AltOpcode](Value *V) {
+    return isa<Instruction>(V) &&
+           !sameOpcodeOrAlt(Opcode, AltOpcode,
+                            cast<Instruction>(V)->getOpcode());
+  });
+  if (!Res.NeedAnalysis && !Res.HasAltOpcodes)
+    return InstructionsState(VL[Res.OpcodePos], Opcode, false, IsNonAlt);
+  auto *OpInst = cast<Instruction>(VL[Res.OpcodePos]);
   // Examine each element in the list instructions VL to determine
   // if some operations there could be considered as an alternative
-  // (for example as subtraction relates to addition operation).
+  // (for example as subtraction relates to addition operation) or
+  // whether an operation could be an operand of a (possibly) binary
+  // operation.
   for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
     auto *I = cast<Instruction>(VL[Cnt]);
     unsigned InstOpcode = I->getOpcode();
+    if (Res.NeedAnalysis && !sameOpcodeOrAlt(Opcode, AltOpcode, InstOpcode))
+      if (tryToRepresentAsInstArg(InstOpcode, OpInst))
+        InstOpcode = (Res.HasAltOpcodes && isOdd(Cnt)) ? AltOpcode : Opcode;
     if ((Res.HasAltOpcodes && InstOpcode != (isOdd(Cnt) ? AltOpcode : Opcode)) ||
-        (!Res.HasAltOpcodes && InstOpcode != Opcode)) {
-      return InstructionsState(OpInst, 0, false);
-    }
+        (!Res.HasAltOpcodes && InstOpcode != Opcode))
+      return InstructionsState(OpInst, 0, false, IsNonAlt);
   }
-  return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes);
+  return InstructionsState(OpInst, Opcode, Res.HasAltOpcodes, IsNonAlt);
 }
 
 /// \returns true if all of the values in \p VL have the same type or false
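
A note on the positional convention getSameOpcode() enforces: in an alternating bundle, even lanes must carry the main opcode and odd lanes the alternate one, and lanes re-labeled by tryToRepresentAsInstArg() are mapped onto whatever their position demands. A standalone sketch of that per-lane check (hypothetical names, same even/odd rule):

    // Standalone sketch (illustrative only, not part of the patch).
    #include <cassert>
    #include <vector>

    // In an add/sub shuffle bundle, lane I is expected to use the main opcode
    // on even positions and the alternate opcode on odd positions.
    bool laneMatches(unsigned Lane, unsigned MainOp, unsigned AltOp,
                     unsigned LaneOp, bool HasAltOpcodes) {
      unsigned Expected = (HasAltOpcodes && (Lane & 1)) ? AltOp : MainOp;
      return LaneOp == Expected;
    }

    int main() {
      enum { Add = 1, Sub = 2 };
      std::vector<unsigned> Bundle = {Add, Sub, Add, Sub};
      for (unsigned I = 0; I < Bundle.size(); ++I)
        assert(laneMatches(I, Add, Sub, Bundle[I], /*HasAltOpcodes=*/true));
      return 0;
    }
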
@@ -588,6 +646,7 @@
   void deleteTree() {
     VectorizableTree.clear();
     ScalarToTreeEntry.clear();
+    ExtraScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
     NumOpsWantToKeepOrder.clear();
@@ -747,10 +806,14 @@
     /// The TreeEntry index containing the user of this entry. We can actually
     /// have multiple users so the data structure is not truly a tree.
     SmallVector<int, 1> UserTreeIndices;
+
+    /// Info about instruction in this tree entry.
+    InstructionsState State;
   };
 
   /// Create a new VectorizableTree entry.
   void newTreeEntry(ArrayRef<Value *> VL, bool Vectorized, int &UserTreeIdx,
+                    const InstructionsState &S,
                     ArrayRef<unsigned> ReuseShuffleIndices = None,
                     ArrayRef<unsigned> ReorderIndices = None) {
     VectorizableTree.emplace_back(VectorizableTree);
@@ -762,11 +825,24 @@
                                      ReuseShuffleIndices.end());
     Last->ReorderIndices = ReorderIndices;
     if (Vectorized) {
+      Last->State = S;
+      unsigned AltOpcode = getAltOpcode(S.Opcode);
       for (int i = 0, e = VL.size(); i != e; ++i) {
-        assert(!getTreeEntry(VL[i]) && "Scalar already in tree!");
-        ScalarToTreeEntry[VL[i]] = idx;
+        unsigned RealOpcode =
+            (S.IsAltShuffle && isOdd(i)) ? AltOpcode : S.Opcode;
+        Value *Key = (cast<Instruction>(VL[i])->getOpcode() == RealOpcode)
+                         ? VL[i]
+                         : S.OpValue;
+        assert(!getTreeEntry(VL[i], Key) && "Scalar already in tree!");
+        if (VL[i] == Key)
+          ScalarToTreeEntry[Key] = idx;
+        else
+          ExtraScalarToTreeEntry[VL[i]][Key] = idx;
       }
     } else {
+      Last->State.Opcode = 0;
+      Last->State.OpValue = VL[0];
+      Last->State.IsAltShuffle = false;
       MustGather.insert(VL.begin(), VL.end());
     }
 
@@ -786,8 +862,24 @@
     return nullptr;
   }
 
+  TreeEntry *getTreeEntry(Value *V, Value *OpValue) {
+    if (V == OpValue)
+      return getTreeEntry(V);
+    auto I = ExtraScalarToTreeEntry.find(V);
+    if (I != ExtraScalarToTreeEntry.end()) {
+      auto &STT = I->second;
+      auto STTI = STT.find(OpValue);
+      if (STTI != STT.end())
+        return &VectorizableTree[STTI->second];
+    }
+    return nullptr;
+  }
+
   /// Maps a specific scalar to its tree entry.
-  SmallDenseMap<Value *, int> ScalarToTreeEntry;
+  SmallDenseMap<Value *, int> ScalarToTreeEntry;
+
+  /// Maps a specific scalar to its tree entry(s) with leading scalar.
+  SmallDenseMap<Value *, SmallDenseMap<Value *, int>> ExtraScalarToTreeEntry;
 
   /// A list of scalars that we found that we need to keep as scalars.
   ValueSet MustGather;
@@ -994,6 +1086,7 @@
 
     /// Opcode of the current instruction in the schedule data.
     Value *OpValue = nullptr;
+
   };
 
 #ifndef NDEBUG
@@ -1063,10 +1156,7 @@
 
       ScheduleData *BundleMember = SD;
       while (BundleMember) {
-        if (BundleMember->Inst != BundleMember->OpValue) {
-          BundleMember = BundleMember->NextInBundle;
-          continue;
-        }
+
         // Handle the def-use chain dependencies.
         for (Use &U : BundleMember->Inst->operands()) {
           auto *I = dyn_cast<Instruction>(U.get());
@@ -1106,13 +1196,23 @@
     void doForAllOpcodes(Value *V,
                          function_ref<void(ScheduleData *SD)> Action) {
-      if (ScheduleData *SD = getScheduleData(V))
-        Action(SD);
+      bool Found = false;
       auto I = ExtraScheduleDataMap.find(V);
-      if (I != ExtraScheduleDataMap.end())
-        for (auto &P : I->second)
-          if (P.second->SchedulingRegionID == SchedulingRegionID)
-            Action(P.second);
+      if (I != ExtraScheduleDataMap.end()) {
+        for (auto &P : I->second) {
+          ScheduleData *SD = P.second;
+          if (SD && !SD->isPartOfBundle())
+            continue;
+          if (SD && SD->SchedulingRegionID == SchedulingRegionID) {
+            Found = true;
+            Action(SD);
+          }
+        }
+      }
+      if (ScheduleData *SD = getScheduleData(V)) {
+        if (!Found || SD->isPartOfBundle())
+          Action(SD);
+      }
     }
 
     /// Put all instructions into the ReadyList which are ready for scheduling.
@@ -1132,7 +1232,8 @@
     /// Checks if a bundle of instructions can be scheduled, i.e. has no
     /// cyclic dependencies. This is only a dry-run, no instructions are
     /// actually moved at this stage.
-    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, Value *OpValue);
+    bool tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+                           const InstructionsState &S);
 
     /// Un-bundles a group of instructions.
     void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
@@ -1387,9 +1488,16 @@
       continue;
 
     // For each lane:
+    const unsigned Opcode = Entry->State.Opcode;
+    const unsigned AltOpcode = getAltOpcode(Opcode);
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
       int FoundLane = Lane;
+
+      if (!sameOpcodeOrAlt(Opcode, AltOpcode,
+                           cast<Instruction>(Scalar)->getOpcode()))
+        continue;
+
       if (!Entry->ReuseShuffleIndices.empty()) {
         FoundLane =
             std::distance(Entry->ReuseShuffleIndices.begin(),
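
The two-level ExtraScalarToTreeEntry map introduced above exists because a copyable lane's scalar may now belong to several tree entries, keyed by the bundle's leading scalar. A standalone sketch of that lookup discipline, with plain std::map and int stand-ins for Value* (purely illustrative):

    // Standalone sketch (illustrative only, not part of the patch).
    #include <cassert>
    #include <map>

    // Primary map: scalar -> tree index, for lanes whose opcode matches.
    // Extra map: scalar -> (leading scalar of the bundle -> tree index), for
    // "copyable" lanes vectorized as part of another bundle.
    std::map<int, int> ScalarToTree;
    std::map<int, std::map<int, int>> ExtraScalarToTree;

    int *findTreeEntry(int V, int OpValue) {
      if (V == OpValue) {
        auto It = ScalarToTree.find(V);
        return It == ScalarToTree.end() ? nullptr : &It->second;
      }
      auto I = ExtraScalarToTree.find(V);
      if (I != ExtraScalarToTree.end()) {
        auto STTI = I->second.find(OpValue);
        if (STTI != I->second.end())
          return &STTI->second;
      }
      return nullptr;
    }

    int main() {
      ScalarToTree[10] = 0;          // scalar 10 leads tree entry 0
      ExtraScalarToTree[42][10] = 0; // scalar 42 rides along in that bundle
      assert(*findTreeEntry(10, 10) == 0);
      assert(*findTreeEntry(42, 10) == 0);
      assert(findTreeEntry(42, 99) == nullptr);
      return 0;
    }
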
@@ -1425,6 +1533,11 @@
         }
       }
 
+        // Skip partially vectorized bundles with internal
+        // dependency and non-alternate opcode.
+        if (!getTreeEntry(U) && getTreeEntry(U, Scalar) == Entry)
+          continue;
+
         // Ignore users in the user ignore list.
         if (is_contained(UserIgnoreList, UserInst))
           continue;
@@ -1437,6 +1550,34 @@
   }
 }
 
+static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) {
+  switch (Opcode) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    return ConstantInt::getNullValue(Ty);
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+    return ConstantInt::get(Ty, /*V=*/1);
+  case Instruction::FAdd:
+  case Instruction::FSub:
+    return ConstantFP::get(Ty, /*V=*/0.0);
+  case Instruction::FMul:
+  case Instruction::FDiv:
+    return ConstantFP::get(Ty, /*V=*/1.0);
+  case Instruction::And:
+    return ConstantInt::getAllOnesValue(Ty);
+  default:
+    break;
+  }
+  llvm_unreachable("unknown binop for default constant value");
+}
+
 void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
                             int UserTreeIdx) {
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
@@ -1444,31 +1585,48 @@
   InstructionsState S = getSameOpcode(VL);
   if (Depth == RecursionMaxDepth) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
-    newTreeEntry(VL, false, UserTreeIdx);
+    newTreeEntry(VL, false, UserTreeIdx, S);
     return;
   }
 
   // Don't handle vectors.
   if (S.OpValue->getType()->isVectorTy()) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
-    newTreeEntry(VL, false, UserTreeIdx);
+    newTreeEntry(VL, false, UserTreeIdx, S);
     return;
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
     if (SI->getValueOperand()->getType()->isVectorTy()) {
       LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
-      newTreeEntry(VL, false, UserTreeIdx);
+      newTreeEntry(VL, false, UserTreeIdx, S);
       return;
     }
 
   // If all of the operands are identical or constant we have a simple solution.
   if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.Opcode) {
     LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
-    newTreeEntry(VL, false, UserTreeIdx);
+    newTreeEntry(VL, false, UserTreeIdx, S);
     return;
   }
 
+  unsigned AltOpcode = getAltOpcode(S.Opcode);
+  if (S.IsNonAlt && VL.size() > 2) {
+    unsigned SameOrAlt = 0;
+    for (Value *V : VL) {
+      auto *Instr = cast<Instruction>(V);
+      if (sameOpcodeOrAlt(S.Opcode, AltOpcode, Instr->getOpcode()))
+        SameOrAlt++;
+    }
+    if (SameOrAlt <= (VL.size() / 2)) {
+      LLVM_DEBUG(
+          dbgs()
          << "SLP: Number of pseudo instructions greater than others.\n");
+      newTreeEntry(VL, false, UserTreeIdx, S);
+      return;
+    }
+  }
+
   // We now know that this is a vector of instructions of the same type from
   // the same block.
 
@@ -1477,7 +1635,7 @@
     if (EphValues.count(VL[i])) {
       LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
                         << ") is ephemeral.\n");
-      newTreeEntry(VL, false, UserTreeIdx);
+      newTreeEntry(VL, false, UserTreeIdx, S);
       return;
     }
   }
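
The SameOrAlt majority test added above gives up when pseudo lanes would outnumber real ones, since each pseudo lane costs an identity operand in the vectorized form. A minimal standalone sketch of the same count-and-compare (hypothetical names):

    // Standalone sketch (illustrative only, not part of the patch).
    #include <cassert>
    #include <vector>

    // Vectorizing a bundle only pays off if more than half of the lanes carry
    // the bundle's (or its alternate) opcode; the rest are padded pseudo lanes.
    bool worthVectorizing(const std::vector<bool> &LaneMatchesOpcode) {
      if (LaneMatchesOpcode.size() <= 2)
        return true; // the patch only applies the test to bundles larger than 2
      unsigned SameOrAlt = 0;
      for (bool Matches : LaneMatchesOpcode)
        if (Matches)
          ++SameOrAlt;
      return SameOrAlt > LaneMatchesOpcode.size() / 2;
    }

    int main() {
      assert(worthVectorizing({true, true, true, false}));    // one pseudo lane
      assert(!worthVectorizing({true, false, false, false})); // pseudo-dominated
      return 0;
    }
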
@@ -1487,7 +1645,7 @@
     LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
     if (!E->isSame(VL)) {
       LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
-      newTreeEntry(VL, false, UserTreeIdx);
+      newTreeEntry(VL, false, UserTreeIdx, S);
       return;
     }
     // Record the reuse of the tree node.  FIXME, currently this is only used to
@@ -1506,7 +1664,7 @@
     if (getTreeEntry(I)) {
       LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i]
                        << ") is already in tree.\n");
-      newTreeEntry(VL, false, UserTreeIdx);
+      newTreeEntry(VL, false, UserTreeIdx, S);
       return;
     }
   }
@@ -1516,7 +1674,7 @@
   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
     if (MustGather.count(VL[i])) {
       LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
-      newTreeEntry(VL, false, UserTreeIdx);
+      newTreeEntry(VL, false, UserTreeIdx, S);
       return;
     }
   }
@@ -1530,7 +1688,7 @@
     // Don't go into unreachable blocks. They may contain instructions with
     // dependency cycles which confuse the final scheduling.
     LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
-    newTreeEntry(VL, false, UserTreeIdx);
+    newTreeEntry(VL, false, UserTreeIdx, S);
     return;
   }
 
@@ -1550,7 +1708,7 @@
     LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
     if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) {
       LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-      newTreeEntry(VL, false, UserTreeIdx);
+      newTreeEntry(VL, false, UserTreeIdx, S);
       return;
     }
     VL = UniqueValues;
@@ -1562,12 +1720,12 @@
 
   BlockScheduling &BS = *BSRef.get();
 
-  if (!BS.tryScheduleBundle(VL, this, VL0)) {
+  if (!BS.tryScheduleBundle(VL, this, S)) {
     LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
     assert((!BS.getScheduleData(VL0) ||
             !BS.getScheduleData(VL0)->isPartOfBundle()) &&
            "tryScheduleBundle should cancelScheduling on failure");
-    newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+    newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
     return;
   }
   LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -1588,12 +1746,12 @@
             dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
 
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+        newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
         return;
       }
     }
 
-      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+      newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
 
       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1614,7 +1772,7 @@
       if (Reuse) {
         LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
         ++NumOpsWantToKeepOriginalOrder;
-        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S,
                      ReuseShuffleIndicies);
         return;
       }
@@ -1631,12 +1789,12 @@
         auto StoredCurrentOrderAndNum =
             NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
         ++StoredCurrentOrderAndNum->getSecond();
-        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies,
+        newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies,
                      StoredCurrentOrderAndNum->getFirst());
         return;
       }
       LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
-      newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies);
+      newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, S, ReuseShuffleIndicies);
       BS.cancelScheduling(VL, VL0);
       return;
     }
@@ -1652,7 +1810,7 @@
      if (DL->getTypeSizeInBits(ScalarTy) !=
          DL->getTypeAllocSizeInBits(ScalarTy)) {
        BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+        newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
        LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
        return;
      }
@@ -1665,7 +1823,7 @@
         auto *L = cast<LoadInst>(V);
         if (!L->isSimple()) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+          newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
           LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
           return;
         }
@@ -1695,14 +1853,14 @@
         if (CurrentOrder.empty()) {
           // Original loads are consecutive and does not require reordering.
           ++NumOpsWantToKeepOriginalOrder;
-          newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+          newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S,
                        ReuseShuffleIndicies);
           LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         } else {
           // Need to reorder.
           auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first;
           ++I->getSecond();
-          newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx,
+          newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S,
                        ReuseShuffleIndicies, I->getFirst());
           LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
         }
@@ -1712,7 +1870,7 @@
 
       LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
       BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+      newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
       return;
     }
     case Instruction::ZExt:
@@ -1732,13 +1890,13 @@
         Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
         if (Ty != SrcTy || !isValidElementType(Ty)) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+          newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
           LLVM_DEBUG(dbgs()
                      << "SLP: Gathering casts with different src types.\n");
           return;
         }
       }
-      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+      newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
 
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1761,14 +1919,14 @@
         if (Cmp->getPredicate() != P0 ||
             Cmp->getOperand(0)->getType() != ComparedTy) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+          newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
           LLVM_DEBUG(dbgs()
                      << "SLP: Gathering cmp with different predicate.\n");
           return;
         }
       }
 
-      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+      newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
 
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1800,7 +1958,7 @@
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor:
-      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+      newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
 
       // Sort operands of the instructions so that each side is more likely to
@@ -1816,10 +1974,21 @@
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
-        for (Value *j : VL)
-          Operands.push_back(cast<Instruction>(j)->getOperand(i));
-
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        for (Value *VecOp : VL) {
+          auto *I = cast<Instruction>(VecOp);
+          if (I->getOpcode() == S.Opcode) {
+            Operands.push_back(I->getOperand(i));
+            continue;
+          }
+          assert(Instruction::isBinaryOp(S.Opcode) &&
+                 "Expected a binary operation.");
+          Value *Operand =
+              isOdd(i) ? getDefaultConstantForOpcode(S.Opcode, I->getType())
+                       : VecOp;
+          Operands.push_back(Operand);
+        }
+        if (allSameType(Operands))
+          buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
 
@@ -1829,7 +1998,7 @@
         if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
           LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+          newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
           return;
         }
       }
@@ -1843,7 +2012,7 @@
           LLVM_DEBUG(dbgs()
                      << "SLP: not-vectorizable GEP (different types).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+          newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
           return;
         }
       }
@@ -1855,12 +2024,12 @@
           LLVM_DEBUG(dbgs()
                      << "SLP: not-vectorizable GEP (non-constant indexes).\n");
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+          newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
           return;
         }
       }
 
-      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+      newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
       for (unsigned i = 0, e = 2; i < e; ++i) {
         ValueList Operands;
@@ -1877,12 +2046,12 @@
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
         if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+          newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
           LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
           return;
         }
 
-      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+      newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
 
       ValueList Operands;
@@ -1900,7 +2069,7 @@
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
       if (!isTriviallyVectorizable(ID)) {
         BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+        newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
         LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
         return;
       }
@@ -1914,7 +2083,7 @@
             getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
             !CI->hasIdenticalOperandBundleSchema(*CI2)) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+          newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
           LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!="
                             << *VL[i] << "\n");
           return;
@@ -1925,7 +2094,7 @@
           Value *A1J = CI2->getArgOperand(1);
           if (A1I != A1J) {
             BS.cancelScheduling(VL, VL0);
-            newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+            newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
             LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                               << " argument " << A1I << "!=" << A1J << "\n");
             return;
@@ -1937,22 +2106,32 @@
                     CI->op_begin() + CI->getBundleOperandsEndIndex(),
                     CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
           BS.cancelScheduling(VL, VL0);
-          newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+          newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
           LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
                             << *CI << "!=" << *VL[i] << '\n');
           return;
         }
       }
 
-      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+      newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies);
       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
-        for (Value *j : VL) {
-          CallInst *CI2 = dyn_cast<CallInst>(j);
-          Operands.push_back(CI2->getArgOperand(i));
+        for (Value *VecOp : VL) {
+          auto *I = cast<Instruction>(VecOp);
+          if (I->getOpcode() == S.Opcode) {
+            Operands.push_back(I->getOperand(i));
+            continue;
+          }
+          assert(Instruction::isBinaryOp(S.Opcode) &&
+                 "Expected a binary operation.");
+          Value *Operand =
+              isOdd(i) ? getDefaultConstantForOpcode(S.Opcode, I->getType())
+                       : VecOp;
+          Operands.push_back(Operand);
         }
-        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
+        if (allSameType(Operands))
+          buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1961,11 +2140,11 @@
       // then do not vectorize this instruction.
      if (!S.IsAltShuffle) {
        BS.cancelScheduling(VL, VL0);
-        newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+        newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
        LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
        return;
      }
-      newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies);
+      newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
 
       // Reorder operands if reordering would enable vectorization.
@@ -1980,8 +2159,19 @@
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
-        for (Value *j : VL)
-          Operands.push_back(cast<Instruction>(j)->getOperand(i));
+        for (Value *VecOp : VL) {
+          auto *I = cast<Instruction>(VecOp);
+          if (sameOpcodeOrAlt(S.Opcode, AltOpcode, I->getOpcode())) {
+            Operands.push_back(I->getOperand(i));
+            continue;
+          }
+          assert(Instruction::isBinaryOp(S.Opcode) &&
+                 "Expected a binary operation.");
+          Value *Operand =
+              isOdd(i) ? getDefaultConstantForOpcode(S.Opcode, I->getType())
+                       : VecOp;
+          Operands.push_back(Operand);
+        }
 
         buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
@@ -1989,7 +2179,7 @@
 
     default:
       BS.cancelScheduling(VL, VL0);
-      newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies);
+      newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies);
       LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
       return;
   }
@@ -2147,11 +2337,10 @@
     }
     return ReuseShuffleCost + getGatherCost(VL);
   }
-  InstructionsState S = getSameOpcode(VL);
-  assert(S.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
-  Instruction *VL0 = cast<Instruction>(S.OpValue);
-  unsigned ShuffleOrOp = S.IsAltShuffle ?
-          (unsigned) Instruction::ShuffleVector : S.Opcode;
+  assert(E->State.Opcode && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+  auto *VL0 = cast<Instruction>(E->State.OpValue);
+  unsigned ShuffleOrOp = E->State.IsAltShuffle ?
+          (unsigned) Instruction::ShuffleVector : E->State.Opcode;
   switch (ShuffleOrOp) {
     case Instruction::PHI:
       return 0;
@@ -2239,7 +2428,7 @@
       if (NeedToShuffleReuses) {
         ReuseShuffleCost -=
             (ReuseShuffleNumbers - VL.size()) *
-            TTI->getCastInstrCost(S.Opcode, ScalarTy, SrcTy, VL0);
+            TTI->getCastInstrCost(VL0->getOpcode(), VL0->getType(), SrcTy, VL0);
       }
 
      // Calculate the cost of this instruction.
@@ -2261,13 +2450,13 @@
       // Calculate the cost of this instruction.
       if (NeedToShuffleReuses) {
         ReuseShuffleCost -=
             (ReuseShuffleNumbers - VL.size()) *
-            TTI->getCmpSelInstrCost(S.Opcode, ScalarTy,
+            TTI->getCmpSelInstrCost(VL0->getOpcode(), VL0->getType(),
                                     Builder.getInt1Ty(), VL0);
       }
       VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
       int ScalarCost = VecTy->getNumElements() *
-          TTI->getCmpSelInstrCost(S.Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
-      int VecCost = TTI->getCmpSelInstrCost(S.Opcode, VecTy, MaskTy, VL0);
+          TTI->getCmpSelInstrCost(ShuffleOrOp, ScalarTy, Builder.getInt1Ty(), VL0);
+      int VecCost = TTI->getCmpSelInstrCost(ShuffleOrOp, VecTy, MaskTy, VL0);
       return ReuseShuffleCost + VecCost - ScalarCost;
     }
     case Instruction::Add:
@@ -2293,7 +2482,7 @@
       TargetTransformInfo::OperandValueKind Op1VK =
          TargetTransformInfo::OK_AnyValue;
       TargetTransformInfo::OperandValueKind Op2VK =
-          TargetTransformInfo::OK_UniformConstantValue;
+          TargetTransformInfo::OK_AnyValue;
       TargetTransformInfo::OperandValueProperties Op1VP =
          TargetTransformInfo::OP_None;
       TargetTransformInfo::OperandValueProperties Op2VP =
@@ -2304,40 +2493,40 @@
       // If instead not all operands are constants, then set the operand kind
       // to OK_AnyValue. If all operands are constants but not the same,
       // then set the operand kind to OK_NonUniformConstantValue.
-      ConstantInt *CInt = nullptr;
-      for (unsigned i = 0; i < VL.size(); ++i) {
-        const Instruction *I = cast<Instruction>(VL[i]);
-        if (!isa<ConstantInt>(I->getOperand(1))) {
-          Op2VK = TargetTransformInfo::OK_AnyValue;
-          break;
-        }
-        if (i == 0) {
-          CInt = cast<ConstantInt>(I->getOperand(1));
-          continue;
+      if (auto *CInt = dyn_cast<ConstantInt>(VL0->getOperand(1))) {
+        Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+        const unsigned Opcode = E->State.Opcode;
+        for (auto *V : VL) {
+          auto *I = cast<Instruction>(V);
+          if (I == VL0 || Opcode != I->getOpcode())
+            continue;
+          if (!isa<ConstantInt>(I->getOperand(1))) {
+            Op2VK = TargetTransformInfo::OK_AnyValue;
+            break;
+          }
+          if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
+              CInt != cast<ConstantInt>(I->getOperand(1)))
+            Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
         }
+        // FIXME: Currently cost of model modification for division by power of
+        // 2 is handled for X86 and AArch64. Add support for other targets.
         if (Op2VK == TargetTransformInfo::OK_UniformConstantValue &&
-            CInt != cast<ConstantInt>(I->getOperand(1)))
-          Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+            CInt->getValue().isPowerOf2())
+          Op2VP = TargetTransformInfo::OP_PowerOf2;
       }
-      // FIXME: Currently cost of model modification for division by power of
-      // 2 is handled for X86 and AArch64. Add support for other targets.
-      if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && CInt &&
-          CInt->getValue().isPowerOf2())
-        Op2VP = TargetTransformInfo::OP_PowerOf2;
       SmallVector<const Value *, 4> Operands(VL0->operand_values());
       if (NeedToShuffleReuses) {
         ReuseShuffleCost -=
             (ReuseShuffleNumbers - VL.size()) *
-            TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
+            TTI->getArithmeticInstrCost(VL0->getOpcode(), VL0->getType(),
+                                        Op1VK, Op2VK, Op1VP,
                                         Op2VP, Operands);
       }
-      int ScalarCost =
-          VecTy->getNumElements() *
-          TTI->getArithmeticInstrCost(S.Opcode, ScalarTy, Op1VK, Op2VK, Op1VP,
-                                      Op2VP, Operands);
-      int VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy, Op1VK, Op2VK,
-                                                Op1VP, Op2VP, Operands);
+      int ScalarCost = VecTy->getNumElements() *
+                       TTI->getArithmeticInstrCost(E->State.Opcode, ScalarTy,
                                                    Op1VK, Op2VK, Op1VP, Op2VP);
+      int VecCost = TTI->getArithmeticInstrCost(E->State.Opcode, VecTy, Op1VK,
+                                                Op2VK, Op1VP, Op2VP);
       return ReuseShuffleCost + VecCost - ScalarCost;
     }
     case Instruction::GetElementPtr: {
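
The rewritten Op2VK loop above reproduces the old three-way classification (uniform constant / non-uniform constant / any value) while skipping lanes whose opcode differs from the bundle's. A standalone sketch of that classification, with std::optional standing in for "operand is not a constant" (names illustrative):

    // Standalone sketch (illustrative only, not part of the patch).
    #include <cassert>
    #include <optional>
    #include <vector>

    enum OperandKind { UniformConstant, NonUniformConstant, AnyValue };

    // One entry per lane: the constant second operand, or nullopt when the
    // operand is not a constant. Pseudo lanes are filtered out by the caller,
    // as the patch does with "Opcode != I->getOpcode()".
    OperandKind classifyOp2(const std::vector<std::optional<int>> &Op2) {
      if (Op2.empty() || !Op2[0])
        return AnyValue;
      OperandKind Kind = UniformConstant;
      for (const auto &C : Op2) {
        if (!C)
          return AnyValue; // one non-constant operand poisons the bundle
        if (*C != *Op2[0])
          Kind = NonUniformConstant;
      }
      return Kind;
    }

    int main() {
      assert(classifyOp2({3, 3, 3, 3}) == UniformConstant);
      assert(classifyOp2({3, 5, 3, 3}) == NonUniformConstant);
      assert(classifyOp2({3, std::nullopt, 3, 3}) == AnyValue);
      return 0;
    }
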
@@ -2526,7 +2715,7 @@
   Instruction *PrevInst = nullptr;
 
   for (const auto &N : VectorizableTree) {
-    Instruction *Inst = dyn_cast<Instruction>(N.Scalars[0]);
+    auto *Inst = dyn_cast<Instruction>(N.State.OpValue);
     if (!Inst)
       continue;
 
@@ -2610,7 +2799,7 @@
     int C = getEntryCost(&TE);
     LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
-                      << " for bundle that starts with " << *TE.Scalars[0]
+                      << " for bundle that starts with " << *TE.State.OpValue
                      << ".\n");
     Cost += C;
   }
@@ -2632,7 +2821,7 @@
     // extend the extracted value back to the original type. Here, we account
     // for the extract and the added cost of the sign extend if needed.
     auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
-    auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+    auto *ScalarRoot = VectorizableTree[0].State.OpValue;
     if (MinBWs.count(ScalarRoot)) {
       auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
       auto Extend =
@@ -2709,13 +2898,15 @@
                                       SmallVectorImpl<Value *> &Right) {
   // Push left and right operands of binary operation into Left and Right
   unsigned AltOpcode = getAltOpcode(Opcode);
-  (void)AltOpcode;
   for (Value *V : VL) {
     auto *I = cast<Instruction>(V);
-    assert(sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode()) &&
-           "Incorrect instruction in vector");
-    Left.push_back(I->getOperand(0));
-    Right.push_back(I->getOperand(1));
+    if (sameOpcodeOrAlt(Opcode, AltOpcode, I->getOpcode())) {
+      Left.push_back(I->getOperand(0));
+      Right.push_back(I->getOperand(1));
+    } else {
+      Left.push_back(I);
+      Right.push_back(getDefaultConstantForOpcode(Opcode, I->getType()));
+    }
   }
 
   // Reorder if we have a commutative operation and consecutive access
@@ -2764,8 +2955,13 @@
     int i, unsigned Opcode, Instruction &I, ArrayRef<Value *> Left,
     ArrayRef<Value *> Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight,
     bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) {
-  VLeft = I.getOperand(0);
-  VRight = I.getOperand(1);
+  if (I.getOpcode() == Opcode) {
+    VLeft = I.getOperand(0);
+    VRight = I.getOperand(1);
+  } else {
+    VLeft = &I;
+    VRight = getDefaultConstantForOpcode(Opcode, I.getType());
+  }
   // If we have "SplatRight", try to see if commuting is needed to preserve it.
   if (SplatRight) {
     if (VRight == Right[i - 1])
@@ -2829,8 +3025,15 @@
   // Peel the first iteration out of the loop since there's nothing
   // interesting to do anyway and it simplifies the checks in the loop.
   auto *I = cast<Instruction>(VL[0]);
-  Value *VLeft = I->getOperand(0);
-  Value *VRight = I->getOperand(1);
+  Value *VLeft;
+  Value *VRight;
+  if (I->getOpcode() == Opcode) {
+    VLeft = I->getOperand(0);
+    VRight = I->getOperand(1);
+  } else {
+    VLeft = I;
+    VRight = getDefaultConstantForOpcode(Opcode, I->getType());
+  }
   if (!isa<Instruction>(VRight) && isa<Instruction>(VLeft))
     // Favor having instruction to the right. FIXME: why?
     std::swap(VLeft, VRight);
@@ -2935,13 +3138,22 @@
   // VL.back() and iterate over schedule data until we reach the end of the
   // bundle. The end of the bundle is marked by null ScheduleData.
   if (BlocksSchedules.count(BB)) {
-    auto *Bundle =
+    ScheduleData *Bundle =
         BlocksSchedules[BB]->getScheduleData(isOneOf(OpValue, VL.back()));
-    if (Bundle && Bundle->isPartOfBundle())
-      for (; Bundle; Bundle = Bundle->NextInBundle)
-        if (Bundle->OpValue == Bundle->Inst)
+    if (Bundle && Bundle->isPartOfBundle()) {
+      Bundle = Bundle->FirstInBundle;
+      int MaxDist = 0;
+      while (Bundle) {
+        int Dist = std::distance(BB->begin(), Bundle->Inst->getIterator());
+        if (Dist > MaxDist) {
           LastInst = Bundle->Inst;
+          MaxDist = Dist;
+        }
+        Bundle = Bundle->NextInBundle;
+      }
+    }
   }
 
   // LastInst can still be null at this point if there's either not an entry
   // for BB in BlocksSchedules or there's no ScheduleData available for
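
The setInsertPointAfterBundle() change above no longer stops at the first bundle member that owns its instruction; because pseudo members can interleave, it scans every member and keeps the one furthest from the block's start. A standalone sketch of that max-distance scan, with integer positions standing in for iterator distances (illustrative only):

    // Standalone sketch (illustrative only, not part of the patch).
    #include <cassert>
    #include <vector>

    // Given the basic-block positions of all bundle members, the vectorized
    // code must be inserted after the member that appears last.
    int lastMemberPosition(const std::vector<int> &MemberPositions) {
      int MaxDist = 0;
      for (int Dist : MemberPositions)
        if (Dist > MaxDist)
          MaxDist = Dist;
      return MaxDist;
    }

    int main() {
      // Members at block positions 3, 9 and 5: insert after position 9.
      assert(lastMemberPosition({3, 9, 5}) == 9);
      return 0;
    }
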
@@ -3087,12 +3299,12 @@
   IRBuilder<>::InsertPointGuard Guard(Builder);
 
   if (E->VectorizedValue) {
-    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+    LLVM_DEBUG(dbgs() << "SLP: Diamond merged for "
+                      << *E->State.OpValue << ".\n");
     return E->VectorizedValue;
   }
 
-  InstructionsState S = getSameOpcode(E->Scalars);
-  Instruction *VL0 = cast<Instruction>(E->Scalars[0]);
+  auto *VL0 = cast<Instruction>(E->State.OpValue);
   Type *ScalarTy = VL0->getType();
   if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
     ScalarTy = SI->getValueOperand()->getType();
@@ -3115,8 +3327,8 @@
     return V;
   }
 
-  unsigned ShuffleOrOp = S.IsAltShuffle ?
-          (unsigned) Instruction::ShuffleVector : S.Opcode;
+  unsigned ShuffleOrOp = E->State.IsAltShuffle ?
+          (unsigned) Instruction::ShuffleVector : E->State.Opcode;
   switch (ShuffleOrOp) {
     case Instruction::PHI: {
       PHINode *PH = dyn_cast<PHINode>(VL0);
@@ -3281,7 +3493,7 @@
       CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
       Value *V;
-      if (S.Opcode == Instruction::FCmp)
+      if (E->State.Opcode == Instruction::FCmp)
         V = Builder.CreateFCmp(P0, L, R);
       else
         V = Builder.CreateICmp(P0, L, R);
@@ -3343,13 +3555,19 @@
     case Instruction::Xor: {
       ValueList LHSVL, RHSVL;
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative())
-        reorderInputsAccordingToOpcode(S.Opcode, E->Scalars, LHSVL,
+        reorderInputsAccordingToOpcode(E->State.Opcode, E->Scalars, LHSVL,
                                        RHSVL);
       else
         for (Value *V : E->Scalars) {
           auto *I = cast<Instruction>(V);
-          LHSVL.push_back(I->getOperand(0));
-          RHSVL.push_back(I->getOperand(1));
+          if (I->getOpcode() == E->State.Opcode) {
+            LHSVL.push_back(I->getOperand(0));
+            RHSVL.push_back(I->getOperand(1));
+          } else {
+            LHSVL.push_back(V);
+            RHSVL.push_back(
+                getDefaultConstantForOpcode(E->State.Opcode, I->getType()));
+          }
         }
 
       setInsertPointAfterBundle(E->Scalars, VL0);
@@ -3363,7 +3581,7 @@
       }
 
       Value *V = Builder.CreateBinOp(
-          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
+          static_cast<Instruction::BinaryOps>(VL0->getOpcode()), LHS, RHS);
       propagateIRFlags(V, E->Scalars, VL0);
       if (auto *I = dyn_cast<Instruction>(V))
         V = propagateMetadata(I, E->Scalars);
@@ -3545,9 +3763,9 @@
     case Instruction::ShuffleVector: {
       ValueList LHSVL, RHSVL;
-      assert(Instruction::isBinaryOp(S.Opcode) &&
+      assert(Instruction::isBinaryOp(E->State.Opcode) &&
             "Invalid Shuffle Vector Operand");
-      reorderAltShuffleOperands(S.Opcode, E->Scalars, LHSVL, RHSVL);
+      reorderAltShuffleOperands(E->State.Opcode, E->Scalars, LHSVL, RHSVL);
       setInsertPointAfterBundle(E->Scalars, VL0);
 
       Value *LHS = vectorizeTree(LHSVL);
@@ -3560,9 +3778,9 @@
 
       // Create a vector of LHS op1 RHS
       Value *V0 = Builder.CreateBinOp(
-          static_cast<Instruction::BinaryOps>(S.Opcode), LHS, RHS);
+          static_cast<Instruction::BinaryOps>(E->State.Opcode), LHS, RHS);
 
-      unsigned AltOpcode = getAltOpcode(S.Opcode);
+      unsigned AltOpcode = getAltOpcode(E->State.Opcode);
       // Create a vector of LHS op2 RHS
       Value *V1 = Builder.CreateBinOp(
           static_cast<Instruction::BinaryOps>(AltOpcode), LHS, RHS);
@@ -3584,8 +3802,13 @@
       }
 
       Value *ShuffleMask = ConstantVector::get(Mask);
-      propagateIRFlags(V0, EvenScalars);
-      propagateIRFlags(V1, OddScalars);
+      InstructionsState S = getSameOpcode(EvenScalars);
+      assert(!S.IsAltShuffle && "Unexpected alternate opcode");
+      propagateIRFlags(V0, EvenScalars, S.OpValue);
+
+      S = getSameOpcode(OddScalars);
+      assert(!S.IsAltShuffle && "Unexpected alternate opcode");
+      propagateIRFlags(V1, OddScalars, S.OpValue);
 
       Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
       if (Instruction *I = dyn_cast<Instruction>(V))
@@ -3623,7 +3846,7 @@
   // If the vectorized tree can be rewritten in a smaller type, we truncate the
   // vectorized root. InstCombine will then rewrite the entire expression. We
   // sign extend the extracted values below.
-  auto *ScalarRoot = VectorizableTree[0].Scalars[0];
+  auto *ScalarRoot = VectorizableTree[0].State.OpValue;
   if (MinBWs.count(ScalarRoot)) {
     if (auto *I = dyn_cast<Instruction>(VectorRoot))
       Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
@@ -3656,6 +3879,7 @@
     // has multiple uses of the same value.
     if (User && !is_contained(Scalar->users(), User))
       continue;
+
     TreeEntry *E = getTreeEntry(Scalar);
     assert(E && "Invalid scalar");
     assert(!E->NeedToGather && "Extracting from a gather list");
@@ -3735,9 +3959,22 @@
     assert(Entry->VectorizedValue && "Can't find vectorizable value");
 
     // For each lane:
+    const unsigned Opcode = Entry->State.Opcode;
+    const unsigned AltOpcode = getAltOpcode(Opcode);
     for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
       Value *Scalar = Entry->Scalars[Lane];
 
+      if (!sameOpcodeOrAlt(Opcode, AltOpcode,
+                           cast<Instruction>(Scalar)->getOpcode()))
+        continue;
+
+      // Skip partially vectorized bundles with internal
+      // dependency and non-alternate opcode.
+      if (llvm::any_of(Scalar->users(), [this, Entry, Scalar](User *U) {
+            return !getTreeEntry(U) && getTreeEntry(U, Scalar) == Entry;
+          }))
+        continue;
+
       Type *Ty = Scalar->getType();
       if (!Ty->isVoidTy()) {
 #ifndef NDEBUG
@@ -3848,8 +4085,9 @@
 // Groups the instructions to a bundle (which is then a single scheduling entity)
 // and schedules instructions until the bundle gets ready.
 bool BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL,
-                                                 BoUpSLP *SLP, Value *OpValue) {
-  if (isa<PHINode>(OpValue))
+                                                 BoUpSLP *SLP,
+                                                 const InstructionsState &S) {
+  if (isa<PHINode>(S.OpValue))
     return true;
 
   // Initialize the instruction bundle.
@@ -3857,17 +4095,17 @@
   ScheduleData *PrevInBundle = nullptr;
   ScheduleData *Bundle = nullptr;
   bool ReSchedule = false;
-  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *OpValue << "\n");
+  LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
 
   // Make sure that the scheduling region contains all
   // instructions of the bundle.
   for (Value *V : VL) {
-    if (!extendSchedulingRegion(V, OpValue))
+    if (!extendSchedulingRegion(V, S.OpValue))
       return false;
   }
 
   for (Value *V : VL) {
-    ScheduleData *BundleMember = getScheduleData(V);
+    ScheduleData *BundleMember = getScheduleData(V, isOneOf(S.OpValue, V));
     assert(BundleMember &&
            "no ScheduleData for bundle member (maybe not in same basic block)");
     if (BundleMember->IsScheduled) {
@@ -3929,9 +4167,10 @@
     }
   }
   if (!Bundle->isReady()) {
-    cancelScheduling(VL, OpValue);
+    cancelScheduling(VL, S.OpValue);
     return false;
   }
+
   return true;
 }
 
@@ -3940,7 +4179,7 @@
   if (isa<PHINode>(OpValue))
     return;
 
-  ScheduleData *Bundle = getScheduleData(OpValue);
+  ScheduleData *Bundle = getScheduleData(OpValue)->FirstInBundle;
   LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
   assert(!Bundle->IsScheduled &&
          "Can't cancel bundle which is already scheduled");
@@ -3984,10 +4223,15 @@
       return false;
     assert(isInSchedulingRegion(ISD) &&
            "ScheduleData not in scheduling region");
-    ScheduleData *SD = allocateScheduleDataChunks();
-    SD->Inst = I;
-    SD->init(SchedulingRegionID, OpValue);
-    ExtraScheduleDataMap[I][OpValue] = SD;
+    if (I == OpValue) {
+      ExtraScheduleDataMap[I][OpValue] = ISD;
+    } else {
+      ScheduleData *SD = allocateScheduleDataChunks();
+      SD->Inst = I;
+      SD->init(SchedulingRegionID, OpValue);
+      ExtraScheduleDataMap[I][OpValue] = SD;
+    }
    return true;
  };
  if (CheckSheduleForI(I))
@@ -4059,6 +4303,7 @@
       SD = allocateScheduleDataChunks();
       ScheduleDataMap[I] = SD;
       SD->Inst = I;
+      SD->OpValue = I;
     }
     assert(!isInSchedulingRegion(SD) &&
            "new ScheduleData already in scheduling region");
@@ -4105,19 +4350,40 @@
                       << "\n");
     BundleMember->Dependencies = 0;
     BundleMember->resetUnscheduledDeps();
-
-    // Handle def-use chain dependencies.
     if (BundleMember->OpValue != BundleMember->Inst) {
-      ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
-      if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
-        BundleMember->Dependencies++;
-        ScheduleData *DestBundle = UseSD->FirstInBundle;
-        if (!DestBundle->IsScheduled)
-          BundleMember->incrementUnscheduledDeps(1);
-        if (!DestBundle->hasValidDependencies())
-          WorkList.push_back(DestBundle);
+      for (Use &U : BundleMember->Inst->operands()) {
+        auto *I = dyn_cast<Instruction>(U.get());
+        if (!I)
+          continue;
+        ScheduleData *UseSD = getScheduleData(I);
+        if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle) &&
+            UseSD->FirstInBundle == BundleMember->FirstInBundle) {
+          BundleMember->Dependencies++;
+          ScheduleData *DestBundle = UseSD->FirstInBundle;
+          if (!DestBundle->IsScheduled) {
+            BundleMember->incrementUnscheduledDeps(1);
+          }
+          if (!DestBundle->hasValidDependencies())
+            WorkList.push_back(DestBundle);
+        }
+      }
+      for (User *U : BundleMember->Inst->users()) {
+        if (isa<Instruction>(U)) {
+          ScheduleData *UseSD = getScheduleData(U);
+          if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle) &&
+              UseSD->FirstInBundle != BundleMember->FirstInBundle) {
+            BundleMember->Dependencies++;
+            ScheduleData *DestBundle = UseSD->FirstInBundle;
+            if (!DestBundle->IsScheduled) {
+              BundleMember->incrementUnscheduledDeps(1);
+            }
+            if (!DestBundle->hasValidDependencies())
+              WorkList.push_back(DestBundle);
+          }
+        }
       }
     } else {
+      // Handle def-use chain dependencies.
       for (User *U : BundleMember->Inst->users()) {
         if (isa<Instruction>(U)) {
           ScheduleData *UseSD = getScheduleData(U);
@@ -4249,7 +4515,7 @@
        I = I->getNextNode()) {
     BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
       assert(SD->isPartOfBundle() ==
-                 (getTreeEntry(SD->Inst) != nullptr) &&
+                 (getTreeEntry(SD->Inst, SD->OpValue) != nullptr) &&
             "scheduler and vectorizer bundle mismatch");
      SD->FirstInBundle->SchedulingPriority = Idx++;
      if (SD->isSchedulingEntity()) {
@@ -4258,6 +4524,7 @@
       }
     });
   }
+
   BS->initialFillReadyList(ReadyInsts);
 
   Instruction *LastScheduledInst = BS->ScheduleEnd;
@@ -4271,13 +4538,13 @@
     // there yet.
     ScheduleData *BundleMember = picked;
     while (BundleMember) {
-      Instruction *pickedInst = BundleMember->Inst;
-      if (LastScheduledInst->getNextNode() != pickedInst) {
-        BS->BB->getInstList().remove(pickedInst);
+      Instruction *PickedInst = BundleMember->Inst;
+      if (LastScheduledInst != PickedInst) {
+        BS->BB->getInstList().remove(PickedInst);
         BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
-                                     pickedInst);
+                                     PickedInst);
       }
-      LastScheduledInst = pickedInst;
+      LastScheduledInst = PickedInst;
       BundleMember = BundleMember->NextInBundle;
     }
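
The scheduleBlock() loop above now repositions every picked bundle member next to the current anchor instruction and then makes it the new anchor (the old getNextNode() short-circuit no longer holds once pseudo members can interleave). A standalone sketch of that reposition-and-advance pattern, with a std::list of ints standing in for the instruction list (illustrative only):

    // Standalone sketch (illustrative only, not part of the patch).
    #include <cassert>
    #include <list>
    #include <vector>

    // Move each bundle member so it sits immediately before the current
    // anchor, then let the moved member become the anchor itself.
    void placeBundle(std::list<int> &Block, std::list<int>::iterator Anchor,
                     const std::vector<int> &Bundle) {
      for (int Inst : Bundle) {
        for (auto It = Block.begin(); It != Block.end(); ++It) {
          if (*It == Inst) {
            if (It != Anchor)
              Block.splice(Anchor, Block, It); // move Inst right before Anchor
            Anchor = It; // the moved element is the new anchor
            break;
          }
        }
      }
    }

    int main() {
      std::list<int> Block = {1, 9, 2, 7, 3};
      placeBundle(Block, std::prev(Block.end()), {7, 9});
      assert((Block == std::list<int>{1, 2, 9, 7, 3}));
      return 0;
    }
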
Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll
===================================================================
--- test/Transforms/SLPVectorizer/AArch64/transpose.ll
+++ test/Transforms/SLPVectorizer/AArch64/transpose.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slp-vectorizer -instcombine -S | FileCheck %s
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
@@ -5,10 +6,10 @@
 
 define <2 x i64> @build_vec_v2i64(<2 x i64> %v0, <2 x i64> %v1) {
 ; CHECK-LABEL: @build_vec_v2i64(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i64> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i64> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i64> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i64> %v1, i32 1
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i64> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i64> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i64> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i64> [[V1]], i32 1
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i64 [[V0_0]], [[V1_0]]
@@ -36,12 +37,12 @@
 
 define void @store_chain_v2i64(i64* %a, i64* %b, i64* %c) {
 ; CHECK-LABEL: @store_chain_v2i64(
-; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* %a, i64 1
-; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* %b, i64 1
-; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* %c, i64 1
-; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* %a, align 8
+; CHECK-NEXT:    [[A_1:%.*]] = getelementptr i64, i64* [[A:%.*]], i64 1
+; CHECK-NEXT:    [[B_1:%.*]] = getelementptr i64, i64* [[B:%.*]], i64 1
+; CHECK-NEXT:    [[C_1:%.*]] = getelementptr i64, i64* [[C:%.*]], i64 1
+; CHECK-NEXT:    [[V0_0:%.*]] = load i64, i64* [[A]], align 8
 ; CHECK-NEXT:    [[V0_1:%.*]] = load i64, i64* [[A_1]], align 8
-; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* %b, align 8
+; CHECK-NEXT:    [[V1_0:%.*]] = load i64, i64* [[B]], align 8
 ; CHECK-NEXT:    [[V1_1:%.*]] = load i64, i64* [[B_1]], align 8
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i64 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i64 [[V0_1]], [[V1_1]]
@@ -49,7 +50,7 @@
 ; CHECK-NEXT:    [[TMP1_1:%.*]] = sub i64 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP2_0:%.*]] = add i64 [[TMP0_0]], [[TMP0_1]]
 ; CHECK-NEXT:    [[TMP2_1:%.*]] = add i64 [[TMP1_0]], [[TMP1_1]]
-; CHECK-NEXT:    store i64 [[TMP2_0]], i64* %c, align 8
+; CHECK-NEXT:    store i64 [[TMP2_0]], i64* [[C]], align 8
 ; CHECK-NEXT:    store i64 [[TMP2_1]], i64* [[C_1]], align 8
 ; CHECK-NEXT:    ret void
 ;
@@ -76,14 +77,14 @@
 
 define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <4 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <4 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V0_2:%.*]] = extractelement <4 x i32> %v0, i32 2
-; CHECK-NEXT:    [[V0_3:%.*]] = extractelement <4 x i32> %v0, i32 3
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <4 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <4 x i32> %v1, i32 1
-; CHECK-NEXT:    [[V1_2:%.*]] = extractelement <4 x i32> %v1, i32 2
-; CHECK-NEXT:    [[V1_3:%.*]] = extractelement <4 x i32> %v1, i32 3
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <4 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <4 x i32> [[V0]], i32 1
+; CHECK-NEXT:    [[V0_2:%.*]] = extractelement <4 x i32> [[V0]], i32 2
+; CHECK-NEXT:    [[V0_3:%.*]] = extractelement <4 x i32> [[V0]], i32 3
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <4 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <4 x i32> [[V1]], i32 1
+; CHECK-NEXT:    [[V1_2:%.*]] = extractelement <4 x i32> [[V1]], i32 2
+; CHECK-NEXT:    [[V1_3:%.*]] = extractelement <4 x i32> [[V1]], i32 3
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP0_2:%.*]] = add i32 [[V0_2]], [[V1_2]]
@@ -131,10 +132,10 @@
 
 define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_0(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> %v1, i32 1
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V1_0]]
@@ -166,10 +167,10 @@
 
 define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_reuse_1(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> %v1, i32 1
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP0_2:%.*]] = xor i32 [[V0_0]], [[V1_0]]
@@ -208,25 +209,25 @@
 
 define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) {
 ; CHECK-LABEL: @build_vec_v4i32_3_binops(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> %v1, i32 1
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]]
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i32> %v0, %v1
+; CHECK-NEXT:    [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP1_0]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP0_1]], i32 0
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP1_1]], i32 1
-; CHECK-NEXT:    [[TMP9:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <4 x i32>
+; CHECK-NEXT:    [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]]
+; CHECK-NEXT:    [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32>
+; CHECK-NEXT:    [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0
+; CHECK-NEXT:    [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1
+; CHECK-NEXT:    [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32>
 ; CHECK-NEXT:    ret <4 x i32> [[TMP3_3]]
 ;
   %v0.0 = extractelement <2 x i32> %v0, i32 0
@@ -254,14 +255,14 @@
 
 define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @reduction_v4i32(
-; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <4 x i32> %v0, i32 0
-; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <4 x i32> %v0, i32 1
-; CHECK-NEXT:    [[V0_2:%.*]] = extractelement <4 x i32> %v0, i32 2
-; CHECK-NEXT:    [[V0_3:%.*]] = extractelement <4 x i32> %v0, i32 3
-; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <4 x i32> %v1, i32 0
-; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <4 x i32> %v1, i32 1
-; CHECK-NEXT:    [[V1_2:%.*]] = extractelement <4 x i32> %v1, i32 2
-; CHECK-NEXT:    [[V1_3:%.*]] = extractelement <4 x i32> %v1, i32 3
+; CHECK-NEXT:    [[V0_0:%.*]] = extractelement <4 x i32> [[V0:%.*]], i32 0
+; CHECK-NEXT:    [[V0_1:%.*]] = extractelement <4 x i32> [[V0]], i32 1
+; CHECK-NEXT:    [[V0_2:%.*]] = extractelement <4 x i32> [[V0]], i32 2
+; CHECK-NEXT:    [[V0_3:%.*]] = extractelement <4 x i32> [[V0]], i32 3
+; CHECK-NEXT:    [[V1_0:%.*]] = extractelement <4 x i32> [[V1:%.*]], i32 0
+; CHECK-NEXT:    [[V1_1:%.*]] = extractelement <4 x i32> [[V1]], i32 1
+; CHECK-NEXT:    [[V1_2:%.*]] = extractelement <4 x i32> [[V1]], i32 2
+; CHECK-NEXT:    [[V1_3:%.*]] = extractelement <4 x i32> [[V1]], i32 3
 ; CHECK-NEXT:    [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]]
 ; CHECK-NEXT:    [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]]
 ; CHECK-NEXT:    [[TMP0_2:%.*]] = add i32 [[V0_2]], [[V1_2]]
Index: test/Transforms/SLPVectorizer/X86/pr35497.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/pr35497.ll
+++ test/Transforms/SLPVectorizer/X86/pr35497.ll
@@ -12,20 +12,20 @@
 
 define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 {
 ; CHECK-LABEL: @_ZN1C10SwitchModeEv(
 ; CHECK-NEXT:  for.body.lr.ph.i:
-; CHECK-NEXT:    [[OR_1:%.*]] = or i64 undef, 1
-; CHECK-NEXT:    store i64 [[OR_1]], i64* undef, align 8
+; CHECK-NEXT:    [[BAR5:%.*]] = load i64, i64* undef, align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i64> undef, i64 [[BAR5]], i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = or <2 x i64> [[TMP0]],
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
+; CHECK-NEXT:    store i64 [[TMP2]], i64* undef, align 8
 ; CHECK-NEXT:    [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0
 ; CHECK-NEXT:    [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>*
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8
-; CHECK-NEXT:    [[BAR5:%.*]] = load i64, i64* undef, align 8
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>*
+; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = and <2 x i64> [[TMP1]], [[TMP4]]
 ; CHECK-NEXT:    [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0
 ; CHECK-NEXT:    [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8
 ; CHECK-NEXT:    ret void
 ;
 for.body.lr.ph.i:
@@ -50,27 +50,28 @@
 ; CHECK-LABEL: @pr35497(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i64, i64* undef, align 1
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 undef, undef
-; CHECK-NEXT:    store i64 [[ADD]], i64* undef, align 1
-; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
 ; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> undef, i64 [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl <2 x i64> [[TMP1]],
 ; CHECK-NEXT:    [[TMP3:%.*]] = and <2 x i64> , [[TMP2]]
-; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], zeroinitializer
-; CHECK-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
 ; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> undef, i64 [[TMP5]], i32 0
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[ADD]], i32 1
-; CHECK-NEXT:    [[TMP8:%.*]] = shl <2 x i64> [[TMP7]],
-; CHECK-NEXT:    [[TMP9:%.*]] = and <2 x i64> , [[TMP8]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 undef, i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = add <2 x i64> , [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP8]], i32 1
+; CHECK-NEXT:    store i64 [[TMP9]], i64* undef, align 1
+; CHECK-NEXT:    [[ARRAYIDX2_1:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 5
+; CHECK-NEXT:    [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 4
+; CHECK-NEXT:    [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 1
+; CHECK-NEXT:    [[TMP11:%.*]] = and <2 x i64> <i64 20, i64 20>, [[TMP10]]
 ; CHECK-NEXT:    [[ARRAYIDX2_6:%.*]] = getelementptr inbounds [0 x i64], [0 x i64]* undef, i64 0, i64 0
-; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
-; CHECK-NEXT:    [[TMP12:%.*]] = add nuw nsw <2 x i64> [[TMP9]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
-; CHECK-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* [[TMP13]], align 1
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i64* [[ARRAYIDX2_6]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* [[TMP12]], align 1
+; CHECK-NEXT:    [[TMP13:%.*]] = lshr <2 x i64> [[TMP4]], <i64 6, i64 6>
+; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw <2 x i64> [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i64* [[ARRAYIDX2_2]] to <2 x i64>*
+; CHECK-NEXT:    store <2 x i64> [[TMP14]], <2 x i64>* [[TMP15]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
Index: test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
===================================================================
--- test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
+++ test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll
@@ -43,22 +43,16 @@
 ; CHECK-LABEL: @add1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3
-; CHECK-NEXT:    store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> <i32 0, i32 1, i32 2, i32 3>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -86,22 +80,16 @@
 ; CHECK-LABEL: @sub0(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[SUB]], i32* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3
-; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> <i32 -1, i32 0, i32 -2, i32 -3>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -205,22 +193,18 @@
 ; CHECK-LABEL: @addsub0(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[SUB]], i32* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
-; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 -1, i32 0, i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], <i32 -1, i32 0, i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -248,22 +232,18 @@
 ; CHECK-LABEL: @addsub1(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[TMP0]], -1
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[SUB]], i32* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3
-; CHECK-NEXT:    store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], <i32 -1, i32 -1, i32 0, i32 -3>
+; CHECK-NEXT:    [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], <i32 -1, i32 -1, i32 0, i32 -3>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -291,22 +271,16 @@
 ; CHECK-LABEL: @mul(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[MUL]], i32* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9
-; CHECK-NEXT:    store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw <4 x i32> <i32 257, i32 -3, i32 1, i32 -9>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -334,22 +308,16 @@
 ; CHECK-LABEL: @shl0(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP1]], 1
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2
-; CHECK-NEXT:    store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[SHL5:%.*]] = shl i32 [[TMP2]], 2
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3
-; CHECK-NEXT:    store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SHL8:%.*]] = shl i32 [[TMP3]], 3
-; CHECK-NEXT:    store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -453,22 +421,16 @@
 ; CHECK-LABEL: @add1f(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[TMP0]], float* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[ADD3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00
-; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -496,22 +458,16 @@
 ; CHECK-LABEL: @sub0f(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[ADD]], float* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -615,22 +571,18 @@
 ; CHECK-LABEL: @addsub0f(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[SUB5]], float* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT:    store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -658,22 +610,18 @@
 ; CHECK-LABEL: @addsub1f(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[SUB1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR3]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00
-; CHECK-NEXT:    store float [[SUB8]], float* [[INCDEC_PTR6]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], <float -1.000000e+00, float -1.000000e+00, float 0.000000e+00, float -3.000000e+00>
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -701,22 +649,16 @@
 ; CHECK-LABEL: @mulf(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT:    [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[SUB]], float* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
-; CHECK-NEXT:    [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[SUB3]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[TMP2]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00
-; CHECK-NEXT:    store float [[SUB9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> <float 2.570000e+02, float -3.000000e+00, float 1.000000e+00, float -9.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -825,22 +767,16 @@
 ; CHECK-LABEL: @sub0fn(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[SRC]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1
-; CHECK-NEXT:    store float [[ADD]], float* [[DST]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2
-; CHECK-NEXT:    store float [[TMP1]], float* [[INCDEC_PTR1]], align 4
 ; CHECK-NEXT:    [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4
-; CHECK-NEXT:    [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00
 ; CHECK-NEXT:    [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3
-; CHECK-NEXT:    store float [[ADD6]], float* [[INCDEC_PTR4]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4
-; CHECK-NEXT:    [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00
-; CHECK-NEXT:    store float [[ADD9]], float* [[INCDEC_PTR7]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> <float -1.000000e+00, float 0.000000e+00, float -2.000000e+00, float -3.000000e+00>, [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>*
+; CHECK-NEXT:    store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry: