Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -299,6 +299,23 @@ : TargetTransformInfo::SK_PermuteSingleSrc; } +/// Checks if the \p Opcode can be considered as an operand of a (possibly) +/// binary operation \p I. +/// \returns The code of the binary operation of instruction \p I if the +/// instruction with \p Opcode can be considered as an operand of \p I with the +/// default value. +static unsigned tryToRepresentAsInstArg(unsigned Opcode, Instruction *I) { + if (Opcode != Instruction::PHI && + I->getOpcode() != Instruction::PHI && + I->getOpcode() != Instruction::SRem && + I->getOpcode() != Instruction::URem && + I->getOpcode() != Instruction::FRem && + (I->getType()->isIntegerTy() || + (isa(I) && cast(I)->isFast()))) + return I->getOpcode(); + return 0; +} + namespace { /// Main data required for vectorization of instructions. @@ -320,16 +337,21 @@ } /// Some of the instructions in the list have alternate opcodes. - bool isAltShuffle() const { return getOpcode() != getAltOpcode(); } + bool isAltShuffle() const { return (getOpcode() != 0 && getAltOpcode() != 0 && + getOpcode() != getAltOpcode()); } bool isOpcodeOrAlt(Instruction *I) const { unsigned CheckedOpcode = I->getOpcode(); return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode; } - InstructionsState() = delete; - InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp) - : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {} + /// Some of the instructions in the list have non alternate opcodes. + bool IsNonAlt = false; + + InstructionsState() = default; + InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp, + bool IsNonAlt) + : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp), IsNonAlt(IsNonAlt) {} }; } // end anonymous namespace @@ -351,45 +373,102 @@ unsigned BaseIndex = 0) { // Make sure these are all Instructions. if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + return InstructionsState(VL[BaseIndex], nullptr, nullptr, false); + unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); bool IsCastOp = isa(VL[BaseIndex]); bool IsBinOp = isa(VL[BaseIndex]); - unsigned Opcode = cast(VL[BaseIndex])->getOpcode(); + bool IsNonAlt = false; unsigned AltOpcode = Opcode; + unsigned OpcodeNum = 0; + unsigned AltOpcodeNum = 0; + unsigned NonAltNum = 0; + unsigned NonAltIndex = 0; unsigned AltIndex = BaseIndex; - // Check for one alternate opcode from another BinaryOperator. - // TODO - generalize to support all operators (types, calls etc.). + // Check for an alternate opcode pattern. for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { - unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); - if (IsBinOp && isa(VL[Cnt])) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) - continue; - if (Opcode == AltOpcode) { - AltOpcode = InstOpcode; - AltIndex = Cnt; - continue; - } - } else if (IsCastOp && isa(VL[Cnt])) { + auto *I = cast(VL[Cnt]); + unsigned InstOpcode = I->getOpcode(); + if (IsCastOp && isa(VL[Cnt])) { Type *Ty0 = cast(VL[BaseIndex])->getOperand(0)->getType(); Type *Ty1 = cast(VL[Cnt])->getOperand(0)->getType(); if (Ty0 == Ty1) { - if (InstOpcode == Opcode || InstOpcode == AltOpcode) + if (InstOpcode == Opcode) { + OpcodeNum++; continue; + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; + continue; + } if (Opcode == AltOpcode) { AltOpcode = InstOpcode; AltIndex = Cnt; + AltOpcodeNum++; continue; } } - } else if (InstOpcode == Opcode || InstOpcode == AltOpcode) + return InstructionsState(VL[BaseIndex], nullptr, nullptr, false); + } + if (InstOpcode == Opcode) { + OpcodeNum++; + continue; + } + if (AltOpcode != Opcode && InstOpcode == AltOpcode) { + AltOpcodeNum++; continue; - return InstructionsState(VL[BaseIndex], nullptr, nullptr); + } + if (InstOpcode != Opcode && InstOpcode != AltOpcode) { + if (IsBinOp && AltOpcode == Opcode && isa(I)) { + AltOpcode = InstOpcode; + AltOpcodeNum++; + AltIndex = Cnt; + continue; + } + if (tryToRepresentAsInstArg(Opcode, I)) { + if (!IsNonAlt) { + NonAltIndex = Cnt; + IsNonAlt = true; + } + NonAltNum++; + continue; + } + if (Opcode != Instruction::PHI && IsBinOp && + tryToRepresentAsInstArg(InstOpcode, + cast(VL[BaseIndex]))) { + if (!IsNonAlt) { + NonAltIndex = Cnt; + IsNonAlt = true; + } + NonAltNum++; + continue; + } + return InstructionsState(VL[BaseIndex], nullptr, nullptr, false); + } } + if (IsNonAlt && (OpcodeNum + AltOpcodeNum) < NonAltNum) { + BaseIndex = NonAltIndex; + AltIndex = BaseIndex; + Opcode = cast(VL[BaseIndex])->getOpcode(); + AltOpcode = Opcode; + IsBinOp = isa(VL[BaseIndex]); + for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { + auto *I = cast(VL[Cnt]); + unsigned InstOpcode = I->getOpcode(); + if (Opcode == AltOpcode && IsBinOp && isa(I)) { + AltOpcode = InstOpcode; + AltIndex = Cnt; + } + } + } + + if (IsNonAlt && !IsBinOp) + return InstructionsState(VL[BaseIndex], nullptr, nullptr, false); + return InstructionsState(VL[BaseIndex], cast(VL[BaseIndex]), - cast(VL[AltIndex])); + cast(VL[AltIndex]), IsNonAlt); } /// \returns true if all of the values in \p VL have the same type or false @@ -540,6 +619,7 @@ void deleteTree() { VectorizableTree.clear(); ScalarToTreeEntry.clear(); + ExtraScalarToTreeEntry.clear(); MustGather.clear(); ExternalUses.clear(); NumOpsWantToKeepOrder.clear(); @@ -701,10 +781,14 @@ /// The TreeEntry index containing the user of this entry. We can actually /// have multiple users so the data structure is not truly a tree. SmallVector UserTreeIndices; + + /// Info about instruction in this tree entry. + InstructionsState State; }; /// Create a new VectorizableTree entry. void newTreeEntry(ArrayRef VL, bool Vectorized, int &UserTreeIdx, + const InstructionsState &S, ArrayRef ReuseShuffleIndices = None, ArrayRef ReorderIndices = None) { VectorizableTree.emplace_back(VectorizableTree); @@ -716,11 +800,27 @@ ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; if (Vectorized) { + Last->State = S; for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = idx; + Value *Key = S.isOpcodeOrAlt(cast(VL[i])) ? + VL[i] : S.OpValue; + if (Key == VL[i]) { + assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); + ScalarToTreeEntry[VL[i]] = idx; + } else + if (S.IsNonAlt) { + ExtraScalarToTreeEntry[VL[i]][Key] = idx; + } } } else { + for (Value *V: VL) { + if (Instruction *I = dyn_cast(V)) { + Last->State.MainOp = I; + Last->State.AltOp = I; + break; + } + } + Last->State.OpValue = VL[0]; MustGather.insert(VL.begin(), VL.end()); } @@ -740,9 +840,25 @@ return nullptr; } + TreeEntry *getTreeEntry(Value *V, Value *OpValue) { + if (V == OpValue) + return getTreeEntry(V); + auto I = ExtraScalarToTreeEntry.find(V); + if (I != ExtraScalarToTreeEntry.end()) { + auto &STT = I->second; + auto STTI = STT.find(OpValue); + if (STTI != STT.end()) + return &VectorizableTree[STTI->second]; + } + return nullptr; + } + /// Maps a specific scalar to its tree entry. SmallDenseMap ScalarToTreeEntry; + /// Maps a specific scalar to its tree entry(s) with leading scalar. + SmallDenseMap> ExtraScalarToTreeEntry; + /// A list of scalars that we found that we need to keep as scalars. ValueSet MustGather; @@ -916,8 +1032,8 @@ ScheduleData *NextLoadStore = nullptr; /// The dependent memory instructions. - /// This list is derived on demand in calculateDependencies(). - SmallVector MemoryDependencies; + /// This set is derived on demand in calculateDependencies(). + SmallSet MemoryDependencies; /// This ScheduleData is in the current scheduling region if this matches /// the current SchedulingRegionID of BlockScheduling. @@ -972,6 +1088,7 @@ ScheduleEnd = nullptr; FirstLoadStoreInRegion = nullptr; LastLoadStoreInRegion = nullptr; + ExtraScheduleDataMap.clear(); // Reduce the maximum schedule region size by the size of the // previous scheduling run. @@ -1017,10 +1134,6 @@ ScheduleData *BundleMember = SD; while (BundleMember) { - if (BundleMember->Inst != BundleMember->OpValue) { - BundleMember = BundleMember->NextInBundle; - continue; - } // Handle the def-use chain dependencies. for (Use &U : BundleMember->Inst->operands()) { auto *I = dyn_cast(U.get()); @@ -1060,13 +1173,23 @@ void doForAllOpcodes(Value *V, function_ref Action) { - if (ScheduleData *SD = getScheduleData(V)) - Action(SD); + bool Found = false; auto I = ExtraScheduleDataMap.find(V); - if (I != ExtraScheduleDataMap.end()) - for (auto &P : I->second) - if (P.second->SchedulingRegionID == SchedulingRegionID) - Action(P.second); + if (I != ExtraScheduleDataMap.end()) { + for (auto &P : I->second) { + ScheduleData *SD = P.second; + if (SD && !SD->isPartOfBundle()) + continue; + if (SD && SD->SchedulingRegionID == SchedulingRegionID) { + Found = true; + Action(SD); + } + } + } + if (ScheduleData *SD = getScheduleData(V)) { + if (!Found || SD->isPartOfBundle()) + Action(SD); + } } /// Put all instructions into the ReadyList which are ready for scheduling. @@ -1113,6 +1236,10 @@ /// Sets all instruction in the scheduling region to un-scheduled. void resetSchedule(); + /// Reorder ScheduleData data after scheduling, so that the first instruction + /// in the list would be the last one scheduled. + void reorderBundles(); + BasicBlock *BB; /// Simple memory allocation for ScheduleData. @@ -1345,6 +1472,8 @@ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; int FoundLane = Lane; + if (!Entry->State.isOpcodeOrAlt(cast(Scalar))) + continue; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(), @@ -1380,6 +1509,11 @@ } } + // Skip partially vectorized bundles with internal + // dependency and non alternative opcode. + if (!getTreeEntry(U) && getTreeEntry(U, Scalar) == Entry) + continue; + // Ignore users in the user ignore list. if (is_contained(UserIgnoreList, UserInst)) continue; @@ -1392,6 +1526,34 @@ } } +static Value *getDefaultConstantForOpcode(unsigned Opcode, Type *Ty) { + switch(Opcode) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + return ConstantInt::getNullValue(Ty); + case Instruction::Mul: + case Instruction::UDiv: + case Instruction::SDiv: + return ConstantInt::get(Ty, /*V=*/1); + case Instruction::FAdd: + case Instruction::FSub: + return ConstantFP::get(Ty, /*V=*/0.0); + case Instruction::FMul: + case Instruction::FDiv: + return ConstantFP::get(Ty, /*V=*/1.0); + case Instruction::And: + return ConstantInt::getAllOnesValue(Ty); + default: + break; + } + llvm_unreachable("unknown binop for default constant value"); +} + void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, int UserTreeIdx) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); @@ -1399,31 +1561,47 @@ InstructionsState S = getSameOpcode(VL); if (Depth == RecursionMaxDepth) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // Don't handle vectors. if (S.OpValue->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } if (StoreInst *SI = dyn_cast(S.OpValue)) if (SI->getValueOperand()->getType()->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // If all of the operands are identical or constant we have a simple solution. if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } + if (S.IsNonAlt && VL.size() > 2) { + unsigned SameOrAlt = 0; + for (Value *V : VL) { + auto *Instr = cast(V); + if (S.isOpcodeOrAlt(Instr)) + SameOrAlt++; + } + if (SameOrAlt <= (VL.size() / 2)) { + LLVM_DEBUG( + dbgs() + << "SLP: Number of pseudo instructions greater than others.\n"); + newTreeEntry(VL, false, UserTreeIdx, S); + return; + } + } + // We now know that this is a vector of instructions of the same type from // the same block. @@ -1432,7 +1610,7 @@ if (EphValues.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is ephemeral.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1442,7 +1620,7 @@ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); if (!E->isSame(VL)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } // Record the reuse of the tree node. FIXME, currently this is only used to @@ -1461,7 +1639,7 @@ if (getTreeEntry(I)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] << ") is already in tree.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1471,7 +1649,7 @@ for (unsigned i = 0, e = VL.size(); i != e; ++i) { if (MustGather.count(VL[i])) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } } @@ -1485,7 +1663,7 @@ // Don't go into unreachable blocks. They may contain instructions with // dependency cycles which confuse the final scheduling. LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } @@ -1505,7 +1683,7 @@ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, false, UserTreeIdx); + newTreeEntry(VL, false, UserTreeIdx, S); return; } VL = UniqueValues; @@ -1522,7 +1700,7 @@ assert((!BS.getScheduleData(VL0) || !BS.getScheduleData(VL0)->isPartOfBundle()) && "tryScheduleBundle should cancelScheduling on failure"); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n"); @@ -1543,12 +1721,12 @@ dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n"); for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { @@ -1569,7 +1747,7 @@ if (Reuse) { LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n"); ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies); return; } @@ -1586,12 +1764,14 @@ auto StoredCurrentOrderAndNum = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++StoredCurrentOrderAndNum->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, ReuseShuffleIndicies, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, + ReuseShuffleIndicies, StoredCurrentOrderAndNum->getFirst()); return; } LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); - newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, /*Vectorized=*/false, UserTreeIdx, S, + ReuseShuffleIndicies); BS.cancelScheduling(VL, VL0); return; } @@ -1607,7 +1787,7 @@ if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n"); return; } @@ -1620,7 +1800,7 @@ auto *L = cast(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); return; } @@ -1650,14 +1830,14 @@ if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { // Need to reorder. auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); - newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, + newTreeEntry(VL, /*Vectorized=*/true, UserTreeIdx, S, ReuseShuffleIndicies, I->getFirst()); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } @@ -1667,7 +1847,7 @@ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } case Instruction::ZExt: @@ -1687,13 +1867,13 @@ Type *Ty = cast(VL[i])->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1716,14 +1896,14 @@ if (Cmp->getPredicate() != P0 || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n"); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n"); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { @@ -1755,7 +1935,7 @@ case Instruction::And: case Instruction::Or: case Instruction::Xor: - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of bin op.\n"); // Sort operands of the instructions so that each side is more likely to @@ -1771,10 +1951,18 @@ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (I->getOpcode() == S.getOpcode()) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Operands.push_back(VecOp); + } + if (allSameType(Operands)) + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; @@ -1784,7 +1972,7 @@ if (cast(VL[j])->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } @@ -1798,7 +1986,7 @@ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } @@ -1810,12 +1998,12 @@ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; @@ -1832,12 +2020,12 @@ for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; @@ -1855,7 +2043,7 @@ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI); if (!isTriviallyVectorizable(ID)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); return; } @@ -1869,7 +2057,7 @@ getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] << "\n"); return; @@ -1880,7 +2068,7 @@ Value *A1J = CI2->getArgOperand(1); if (A1I != A1J) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI << " argument " << A1I << "!=" << A1J << "\n"); return; @@ -1892,22 +2080,30 @@ CI->op_begin() + CI->getBundleOperandsEndIndex(), CI2->op_begin() + CI2->getBundleOperandsStartIndex())) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!=" << *VL[i] << '\n'); return; } } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) { - CallInst *CI2 = dyn_cast(j); - Operands.push_back(CI2->getArgOperand(i)); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (S.isOpcodeOrAlt(I)) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = getDefaultConstantForOpcode(S.getOpcode(), I->getType()); + Operands.push_back(Operand); } - buildTree_rec(Operands, Depth + 1, UserTreeIdx); + if (allSameType(Operands)) + buildTree_rec(Operands, Depth + 1, UserTreeIdx); } return; } @@ -1916,11 +2112,11 @@ // then do not vectorize this instruction. if (!S.isAltShuffle()) { BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n"); return; } - newTreeEntry(VL, true, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, true, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n"); // Reorder operands if reordering would enable vectorization. @@ -1935,8 +2131,17 @@ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *VecOp : VL) { + auto *I = cast(VecOp); + if (S.isOpcodeOrAlt(I)) { + Operands.push_back(I->getOperand(i)); + continue; + } + assert(Instruction::isBinaryOp(S.getOpcode()) && + "Expected a binary operation."); + Value *Operand = getDefaultConstantForOpcode(S.getOpcode(), I->getType()); + Operands.push_back(Operand); + } buildTree_rec(Operands, Depth + 1, UserTreeIdx); } @@ -1944,7 +2149,7 @@ default: BS.cancelScheduling(VL, VL0); - newTreeEntry(VL, false, UserTreeIdx, ReuseShuffleIndicies); + newTreeEntry(VL, false, UserTreeIdx, S, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n"); return; } @@ -2102,11 +2307,10 @@ } return ReuseShuffleCost + getGatherCost(VL); } - InstructionsState S = getSameOpcode(VL); - assert(S.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); - Instruction *VL0 = cast(S.OpValue); - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + assert(E->State.getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + auto *VL0 = cast(E->State.OpValue); + unsigned ShuffleOrOp = E->State.isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : E->State.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: return 0; @@ -2192,7 +2396,7 @@ case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); int ScalarEltCost = - TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); + TTI->getCastInstrCost(E->State.getOpcode(), ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } @@ -2205,7 +2409,8 @@ // Check if the values are candidates to demote. if (!MinBWs.count(VL0) || VecTy != SrcVecTy) { VecCost = ReuseShuffleCost + - TTI->getCastInstrCost(S.getOpcode(), VecTy, SrcVecTy, VL0); + TTI->getCastInstrCost(E->State.getOpcode(), VecTy, + SrcVecTy, VL0); } return VecCost - ScalarCost; } @@ -2213,14 +2418,16 @@ case Instruction::ICmp: case Instruction::Select: { // Calculate the cost of this instruction. - int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, + int ScalarEltCost = TTI->getCmpSelInstrCost(E->State.getOpcode(), + ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); + int VecCost = TTI->getCmpSelInstrCost(E->State.getOpcode(), VecTy, + MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::Add: @@ -2246,7 +2453,7 @@ TargetTransformInfo::OperandValueKind Op1VK = TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueKind Op2VK = - TargetTransformInfo::OK_UniformConstantValue; + TargetTransformInfo::OK_AnyValue; TargetTransformInfo::OperandValueProperties Op1VP = TargetTransformInfo::OP_None; TargetTransformInfo::OperandValueProperties Op2VP = @@ -2257,35 +2464,40 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. - ConstantInt *CInt0 = nullptr; - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - const Instruction *I = cast(VL[i]); - ConstantInt *CInt = dyn_cast(I->getOperand(1)); - if (!CInt) { - Op2VK = TargetTransformInfo::OK_AnyValue; - Op2VP = TargetTransformInfo::OP_None; - break; - } - if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) - Op2VP = TargetTransformInfo::OP_None; - if (i == 0) { - CInt0 = CInt; - continue; + if (auto *CInt = dyn_cast(VL0->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + const unsigned Opcode = E->State.getOpcode(); + for (auto *V : VL) { + auto *I = cast(V); + if (I == VL0 || Opcode != I->getOpcode()) + continue; + if (!isa(I->getOperand(1))) { + Op2VK = TargetTransformInfo::OK_AnyValue; + Op2VP = TargetTransformInfo::OP_None; + break; + } + ConstantInt *CInt_cur = cast(I->getOperand(1)); + if (Op2VK == TargetTransformInfo::OK_UniformConstantValue && + CInt != cast(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; + if (Op2VP == TargetTransformInfo::OP_PowerOf2 && + !CInt->getValue().isPowerOf2()) + Op2VP = TargetTransformInfo::OP_None; + if (CInt != CInt_cur) + Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } - if (CInt0 != CInt) - Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } SmallVector Operands(VL0->operand_values()); int ScalarEltCost = TTI->getArithmeticInstrCost( - S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); + E->State.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; } int ScalarCost = VecTy->getNumElements() * ScalarEltCost; - int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + int VecCost = TTI->getArithmeticInstrCost(E->State.getOpcode(), VecTy, + Op1VK, Op2VK, Op1VP, Op2VP, + Operands); return ReuseShuffleCost + VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -2366,11 +2578,11 @@ return ReuseShuffleCost + VecCallCost - ScalarCallCost; } case Instruction::ShuffleVector: { - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->State.isAltShuffle() && + ((Instruction::isBinaryOp(E->State.getOpcode()) && + Instruction::isBinaryOp(E->State.getAltOpcode())) || + (Instruction::isCast(E->State.getOpcode()) && + Instruction::isCast(E->State.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); int ScalarCost = 0; if (NeedToShuffleReuses) { @@ -2387,23 +2599,22 @@ } for (Value *i : VL) { Instruction *I = cast(i); - assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } // VecCost is equal to sum of the cost of creating 2 vectors // and the cost of creating shuffle. int VecCost = 0; - if (Instruction::isBinaryOp(S.getOpcode())) { - VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy); - VecCost += TTI->getArithmeticInstrCost(S.getAltOpcode(), VecTy); + if (Instruction::isBinaryOp(E->State.getOpcode())) { + VecCost = TTI->getArithmeticInstrCost(E->State.getOpcode(), VecTy); + VecCost += TTI->getArithmeticInstrCost(E->State.getAltOpcode(), VecTy); } else { - Type *Src0SclTy = S.MainOp->getOperand(0)->getType(); - Type *Src1SclTy = S.AltOp->getOperand(0)->getType(); + Type *Src0SclTy = E->State.MainOp->getOperand(0)->getType(); + Type *Src1SclTy = E->State.AltOp->getOperand(0)->getType(); VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size()); VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size()); - VecCost = TTI->getCastInstrCost(S.getOpcode(), VecTy, Src0Ty); - VecCost += TTI->getCastInstrCost(S.getAltOpcode(), VecTy, Src1Ty); + VecCost = TTI->getCastInstrCost(E->State.getOpcode(), VecTy, Src0Ty); + VecCost += TTI->getCastInstrCost(E->State.getAltOpcode(), VecTy, Src1Ty); } VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -2469,7 +2680,7 @@ Instruction *PrevInst = nullptr; for (const auto &N : VectorizableTree) { - Instruction *Inst = dyn_cast(N.Scalars[0]); + Instruction *Inst = dyn_cast(N.State.OpValue); if (!Inst) continue; @@ -2654,9 +2865,13 @@ // Push left and right operands of binary operation into Left and Right for (Value *V : VL) { auto *I = cast(V); - assert(S.isOpcodeOrAlt(I) && "Incorrect instruction in vector"); - Left.push_back(I->getOperand(0)); - Right.push_back(I->getOperand(1)); + if (S.isOpcodeOrAlt(I)) { + Left.push_back(I->getOperand(0)); + Right.push_back(I->getOperand(1)); + } else { + Left.push_back(I); + Right.push_back(getDefaultConstantForOpcode(S.getOpcode(), I->getType())); + } } // Reorder if we have a commutative operation and consecutive access @@ -2705,8 +2920,13 @@ int i, unsigned Opcode, Instruction &I, ArrayRef Left, ArrayRef Right, bool AllSameOpcodeLeft, bool AllSameOpcodeRight, bool SplatLeft, bool SplatRight, Value *&VLeft, Value *&VRight) { - VLeft = I.getOperand(0); - VRight = I.getOperand(1); + if (I.getOpcode() == Opcode) { + VLeft = I.getOperand(0); + VRight = I.getOperand(1); + } else { + VLeft = &I; + VRight = getDefaultConstantForOpcode(Opcode, I.getType()); + } // If we have "SplatRight", try to see if commuting is needed to preserve it. if (SplatRight) { if (VRight == Right[i - 1]) @@ -2770,8 +2990,15 @@ // Peel the first iteration out of the loop since there's nothing // interesting to do anyway and it simplifies the checks in the loop. auto *I = cast(VL[0]); - Value *VLeft = I->getOperand(0); - Value *VRight = I->getOperand(1); + Value *VLeft; + Value *VRight; + if (I->getOpcode() == Opcode) { + VLeft = I->getOperand(0); + VRight = I->getOperand(1); + } else { + VLeft = I; + VRight = getDefaultConstantForOpcode(Opcode, I->getType()); + } if (!isa(VRight) && isa(VLeft)) // Favor having instruction to the right. FIXME: why? std::swap(VLeft, VRight); @@ -2869,17 +3096,15 @@ // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; - // Find the last instruction. The common case should be that BB has been - // scheduled, and the last instruction is VL.back(). So we start with - // VL.back() and iterate over schedule data until we reach the end of the - // bundle. The end of the bundle is marked by null ScheduleData. + // Find the last instruction. If the bundle is not scheduled then + // the first in the bundle is the last one in BB, because we discover + // bundles in a backward walk. And if the bundle is already scheduled + // the we reorder one to be the last one scheduled in the first position. if (BlocksSchedules.count(BB)) { auto *Bundle = BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); if (Bundle && Bundle->isPartOfBundle()) - for (; Bundle; Bundle = Bundle->NextInBundle) - if (Bundle->OpValue == Bundle->Inst) - LastInst = Bundle->Inst; + LastInst = Bundle->FirstInBundle->Inst; } // LastInst can still be null at this point if there's either not an entry @@ -3026,12 +3251,12 @@ IRBuilder<>::InsertPointGuard Guard(Builder); if (E->VectorizedValue) { - LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n"); + LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " + << *E->State.OpValue << ".\n"); return E->VectorizedValue; } - InstructionsState S = getSameOpcode(E->Scalars); - Instruction *VL0 = cast(S.OpValue); + auto *VL0 = cast(E->State.OpValue); Type *ScalarTy = VL0->getType(); if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); @@ -3040,7 +3265,7 @@ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->NeedToGather) { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3054,8 +3279,8 @@ return V; } - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + unsigned ShuffleOrOp = E->State.isAltShuffle() ? + (unsigned) Instruction::ShuffleVector : E->State.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -3123,7 +3348,7 @@ E->VectorizedValue = V; return V; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3158,7 +3383,7 @@ E->VectorizedValue = NewV; return NewV; } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3187,7 +3412,7 @@ for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *InVec = vectorizeTree(INVL); @@ -3214,7 +3439,7 @@ RHSV.push_back(cast(V)->getOperand(1)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *L = vectorizeTree(LHSV); Value *R = vectorizeTree(RHSV); @@ -3226,7 +3451,7 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); Value *V; - if (S.getOpcode() == Instruction::FCmp) + if (E->State.getOpcode() == Instruction::FCmp) V = Builder.CreateFCmp(P0, L, R); else V = Builder.CreateICmp(P0, L, R); @@ -3248,7 +3473,7 @@ FalseVec.push_back(cast(V)->getOperand(2)); } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *Cond = vectorizeTree(CondVec); Value *True = vectorizeTree(TrueVec); @@ -3288,16 +3513,22 @@ case Instruction::Xor: { ValueList LHSVL, RHSVL; if (isa(VL0) && VL0->isCommutative()) - reorderInputsAccordingToOpcode(S.getOpcode(), E->Scalars, LHSVL, - RHSVL); + reorderInputsAccordingToOpcode(E->State.getOpcode(), E->Scalars, + LHSVL, RHSVL); else for (Value *V : E->Scalars) { auto *I = cast(V); - LHSVL.push_back(I->getOperand(0)); - RHSVL.push_back(I->getOperand(1)); + if (I->getOpcode() == E->State.getOpcode()) { + LHSVL.push_back(I->getOperand(0)); + RHSVL.push_back(I->getOperand(1)); + } else { + LHSVL.push_back(V); + RHSVL.push_back( + getDefaultConstantForOpcode(E->State.getOpcode(), I->getType())); + } } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *LHS = vectorizeTree(LHSVL); Value *RHS = vectorizeTree(RHSVL); @@ -3308,7 +3539,7 @@ } Value *V = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(VL0->getOpcode()), LHS, RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) V = propagateMetadata(I, E->Scalars); @@ -3327,10 +3558,12 @@ // sink them all the way down past store instructions. bool IsReorder = !E->ReorderIndices.empty(); if (IsReorder) { - S = getSameOpcode(E->Scalars, E->ReorderIndices.front()); + InstructionsState S = getSameOpcode(E->Scalars, + E->ReorderIndices.front()); VL0 = cast(S.OpValue); - } - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, S); + } else + setInsertPointAfterBundle(E->Scalars, E->State); LoadInst *LI = cast(VL0); Type *ScalarLoadTy = LI->getType(); @@ -3377,7 +3610,7 @@ for (Value *V : E->Scalars) ScalarStoreValues.push_back(cast(V)->getValueOperand()); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Value *VecValue = vectorizeTree(ScalarStoreValues); Value *ScalarPtr = SI->getPointerOperand(); @@ -3404,7 +3637,7 @@ return V; } case Instruction::GetElementPtr: { - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); ValueList Op0VL; for (Value *V : E->Scalars) @@ -3439,7 +3672,7 @@ } case Instruction::Call: { CallInst *CI = cast(VL0); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); Function *FI; Intrinsic::ID IID = Intrinsic::not_intrinsic; Value *ScalarArg = nullptr; @@ -3492,24 +3725,24 @@ } case Instruction::ShuffleVector: { ValueList LHSVL, RHSVL; - assert(S.isAltShuffle() && - ((Instruction::isBinaryOp(S.getOpcode()) && - Instruction::isBinaryOp(S.getAltOpcode())) || - (Instruction::isCast(S.getOpcode()) && - Instruction::isCast(S.getAltOpcode()))) && + assert(E->State.isAltShuffle() && + ((Instruction::isBinaryOp(E->State.getOpcode()) && + Instruction::isBinaryOp(E->State.getAltOpcode())) || + (Instruction::isCast(E->State.getOpcode()) && + Instruction::isCast(E->State.getAltOpcode()))) && "Invalid Shuffle Vector Operand"); Value *LHS, *RHS; - if (Instruction::isBinaryOp(S.getOpcode())) { - reorderAltShuffleOperands(S, E->Scalars, LHSVL, RHSVL); - setInsertPointAfterBundle(E->Scalars, S); + if (Instruction::isBinaryOp(E->State.getOpcode())) { + reorderAltShuffleOperands(E->State, E->Scalars, LHSVL, RHSVL); + setInsertPointAfterBundle(E->Scalars, E->State); LHS = vectorizeTree(LHSVL); RHS = vectorizeTree(RHSVL); } else { ValueList INVL; for (Value *V : E->Scalars) INVL.push_back(cast(V)->getOperand(0)); - setInsertPointAfterBundle(E->Scalars, S); + setInsertPointAfterBundle(E->Scalars, E->State); LHS = vectorizeTree(INVL); } @@ -3519,16 +3752,16 @@ } Value *V0, *V1; - if (Instruction::isBinaryOp(S.getOpcode())) { + if (Instruction::isBinaryOp(E->State.getOpcode())) { V0 = Builder.CreateBinOp( - static_cast(S.getOpcode()), LHS, RHS); + static_cast(E->State.getOpcode()), LHS, RHS); V1 = Builder.CreateBinOp( - static_cast(S.getAltOpcode()), LHS, RHS); + static_cast(E->State.getAltOpcode()), LHS, RHS); } else { V0 = Builder.CreateCast( - static_cast(S.getOpcode()), LHS, VecTy); + static_cast(E->State.getOpcode()), LHS, VecTy); V1 = Builder.CreateCast( - static_cast(S.getAltOpcode()), LHS, VecTy); + static_cast(E->State.getAltOpcode()), LHS, VecTy); } // Create shuffle to take alternate operations from the vector. @@ -3539,8 +3772,7 @@ SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { auto *OpInst = cast(E->Scalars[i]); - assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); - if (OpInst->getOpcode() == S.getAltOpcode()) { + if (OpInst->getOpcode() == E->State.getAltOpcode()) { Mask[i] = Builder.getInt32(e + i); AltScalars.push_back(E->Scalars[i]); } else { @@ -3550,8 +3782,10 @@ } Value *ShuffleMask = ConstantVector::get(Mask); - propagateIRFlags(V0, OpScalars); - propagateIRFlags(V1, AltScalars); + InstructionsState S = getSameOpcode(OpScalars); + propagateIRFlags(V0, OpScalars, S.OpValue); + S = getSameOpcode(AltScalars); + propagateIRFlags(V1, AltScalars, S.OpValue); Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); if (Instruction *I = dyn_cast(V)) @@ -3589,7 +3823,7 @@ // If the vectorized tree can be rewritten in a smaller type, we truncate the // vectorized root. InstCombine will then rewrite the entire expression. We // sign extend the extracted values below. - auto *ScalarRoot = VectorizableTree[0].Scalars[0]; + auto *ScalarRoot = VectorizableTree[0].State.OpValue; if (MinBWs.count(ScalarRoot)) { if (auto *I = dyn_cast(VectorRoot)) Builder.SetInsertPoint(&*++BasicBlock::iterator(I)); @@ -3704,6 +3938,16 @@ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (!Entry->State.isOpcodeOrAlt(cast(Scalar))) + continue; + + // Skip partially vectorized bundles with internal + // dependency and non alternative opcode. + if (llvm::any_of(Scalar->users(), [this, Entry, Scalar](User *U) { + return !getTreeEntry(U) && getTreeEntry(U, Scalar) == Entry; + })) + continue; + Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { #ifndef NDEBUG @@ -3834,7 +4078,7 @@ } for (Value *V : VL) { - ScheduleData *BundleMember = getScheduleData(V); + ScheduleData *BundleMember = getScheduleData(V, isOneOf(S, V)); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); if (BundleMember->IsScheduled) { @@ -3897,6 +4141,13 @@ } if (!Bundle->isReady()) { cancelScheduling(VL, S.OpValue); + for (auto *I = ScheduleStart; I != ScheduleEnd; + I = I->getNextNode()) { + doForAllOpcodes(I, [](ScheduleData *SD) { + SD->clearDependencies(); + }); + } + resetSchedule(); return false; } return true; @@ -3907,7 +4158,7 @@ if (isa(OpValue)) return; - ScheduleData *Bundle = getScheduleData(OpValue); + ScheduleData *Bundle = getScheduleData(OpValue)->FirstInBundle; LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); @@ -3951,10 +4202,12 @@ return false; assert(isInSchedulingRegion(ISD) && "ScheduleData not in scheduling region"); - ScheduleData *SD = allocateScheduleDataChunks(); - SD->Inst = I; - SD->init(SchedulingRegionID, S.OpValue); - ExtraScheduleDataMap[I][S.OpValue] = SD; + if (I != S.OpValue) { + ScheduleData *SD = allocateScheduleDataChunks(); + SD->Inst = I; + SD->init(SchedulingRegionID, S.OpValue); + ExtraScheduleDataMap[I][S.OpValue] = SD; + } return true; }; if (CheckSheduleForI(I)) @@ -4026,6 +4279,7 @@ SD = allocateScheduleDataChunks(); ScheduleDataMap[I] = SD; SD->Inst = I; + SD->OpValue = I; } assert(!isInSchedulingRegion(SD) && "new ScheduleData already in scheduling region"); @@ -4075,27 +4329,35 @@ // Handle def-use chain dependencies. if (BundleMember->OpValue != BundleMember->Inst) { - ScheduleData *UseSD = getScheduleData(BundleMember->Inst); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { - BundleMember->Dependencies++; - ScheduleData *DestBundle = UseSD->FirstInBundle; - if (!DestBundle->IsScheduled) + for (User *U : BundleMember->Inst->users()) { + if (isa(U)) { + doForAllOpcodes(U, [&BundleMember, &WorkList](ScheduleData *UseSD) { + BundleMember->Dependencies++; + ScheduleData *DestBundle = UseSD->FirstInBundle; + if (!DestBundle->IsScheduled) + BundleMember->incrementUnscheduledDeps(1); + if (!DestBundle->hasValidDependencies()) + WorkList.push_back(DestBundle); + }); + } else { + // I'm not sure if this can ever happen. But we need to be safe. + // This lets the instruction/bundle never be scheduled and + // eventually disable vectorization. + BundleMember->Dependencies++; BundleMember->incrementUnscheduledDeps(1); - if (!DestBundle->hasValidDependencies()) - WorkList.push_back(DestBundle); + } } } else { for (User *U : BundleMember->Inst->users()) { if (isa(U)) { - ScheduleData *UseSD = getScheduleData(U); - if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) { + doForAllOpcodes(U, [&BundleMember, &WorkList](ScheduleData *UseSD) { BundleMember->Dependencies++; ScheduleData *DestBundle = UseSD->FirstInBundle; if (!DestBundle->IsScheduled) BundleMember->incrementUnscheduledDeps(1); if (!DestBundle->hasValidDependencies()) WorkList.push_back(DestBundle); - } + }); } else { // I'm not sure if this can ever happen. But we need to be safe. // This lets the instruction/bundle never be scheduled and @@ -4135,14 +4397,18 @@ // balance between reduced runtime and accurate dependencies. numAliased++; - DepDest->MemoryDependencies.push_back(BundleMember); - BundleMember->Dependencies++; - ScheduleData *DestBundle = DepDest->FirstInBundle; - if (!DestBundle->IsScheduled) { - BundleMember->incrementUnscheduledDeps(1); - } - if (!DestBundle->hasValidDependencies()) { - WorkList.push_back(DestBundle); + // We don't want any duplicates in the set to have a correct + // dependancies. + if (DepDest->MemoryDependencies.count(BundleMember) == 0) { + DepDest->MemoryDependencies.insert(BundleMember); + BundleMember->Dependencies++; + ScheduleData *DestBundle = DepDest->FirstInBundle; + if (!DestBundle->IsScheduled) { + BundleMember->incrementUnscheduledDeps(1); + } + if (!DestBundle->hasValidDependencies()) { + WorkList.push_back(DestBundle); + } } } DepDest = DepDest->NextLoadStore; @@ -4190,6 +4456,47 @@ ReadyInsts.clear(); } +void BoUpSLP::BlockScheduling::reorderBundles() { + DenseMap ReorderMap; + // Walk backward in the scheduling region to descover last instruction + // for a bundle. + for (auto *I = ScheduleEnd; I != ScheduleStart; I = I->getPrevNode()) { + doForAllOpcodes(I, [&ReorderMap, &I](ScheduleData *SD) { + if (SD->isPartOfBundle() && + ReorderMap.find(SD->FirstInBundle) == ReorderMap.end()) { + ReorderMap[SD->FirstInBundle] = SD; + } + }); + } + DenseMap::iterator it; + // Swap the last scheduled instruction with the first one in the bundle. + for (it = ReorderMap.begin(); it != ReorderMap.end(); it++) { + ScheduleData *FirstSD = it->first; + ScheduleData *LastSD = it->second; + SmallVector Bundle; + unsigned LastPos = 0; + // The first instruction in the bundle is already the last one scheduled. + if (FirstSD == LastSD) + continue; + ScheduleData *SD = FirstSD; + while (SD) { + if (SD == LastSD) + LastPos = Bundle.size(); + Bundle.push_back(SD); + SD = SD->NextInBundle; + } + std::swap(Bundle[0], Bundle[LastPos]); + for (ScheduleData *SD : Bundle) + SD->FirstInBundle = Bundle[0]; + Bundle[0]->NextInBundle = Bundle[1]; + Bundle[LastPos - 1]->NextInBundle = Bundle[LastPos]; + if (LastPos == Bundle.size() - 1) + Bundle[LastPos]->NextInBundle = nullptr; + else + Bundle[LastPos]->NextInBundle = Bundle[LastPos + 1]; + } +} + void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -4216,7 +4523,7 @@ I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst) != nullptr) && + (getTreeEntry(SD->Inst, SD->OpValue) != nullptr) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { @@ -4238,13 +4545,13 @@ // there yet. ScheduleData *BundleMember = picked; while (BundleMember) { - Instruction *pickedInst = BundleMember->Inst; - if (LastScheduledInst->getNextNode() != pickedInst) { - BS->BB->getInstList().remove(pickedInst); + Instruction *PickedInst = BundleMember->Inst; + if (LastScheduledInst != PickedInst) { + BS->BB->getInstList().remove(PickedInst); BS->BB->getInstList().insert(LastScheduledInst->getIterator(), - pickedInst); + PickedInst); } - LastScheduledInst = pickedInst; + LastScheduledInst = PickedInst; BundleMember = BundleMember->NextInBundle; } @@ -4253,6 +4560,8 @@ } assert(NumToSchedule == 0 && "could not schedule all instructions"); + BS->reorderBundles(); + // Avoid duplicate scheduling of the block. BS->ScheduleStart = nullptr; } @@ -4869,7 +5178,7 @@ // Check that all of the parts are scalar instructions of the same type, // we permit an alternate opcode via InstructionsState. InstructionsState S = getSameOpcode(VL); - if (!S.getOpcode()) + if (!S.getOpcode() || S.IsNonAlt) return false; Instruction *I0 = cast(S.OpValue); Index: test/Transforms/SLPVectorizer/X86/pr35497.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/pr35497.ll +++ test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -12,20 +12,20 @@ define void @_ZN1C10SwitchModeEv() local_unnamed_addr #0 comdat align 2 { ; CHECK-LABEL: @_ZN1C10SwitchModeEv( ; CHECK-NEXT: for.body.lr.ph.i: -; CHECK-NEXT: [[OR_1:%.*]] = or i64 undef, 1 -; CHECK-NEXT: store i64 [[OR_1]], i64* undef, align 8 +; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> undef, i64 [[BAR5]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; CHECK-NEXT: store i64 [[TMP2]], i64* undef, align 8 ; CHECK-NEXT: [[FOO_1:%.*]] = getelementptr inbounds [[CLASS_1:%.*]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[FOO_2:%.*]] = getelementptr inbounds [[CLASS_1]], %class.1* undef, i64 0, i32 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* [[TMP0]], align 8 -; CHECK-NEXT: [[BAR5:%.*]] = load i64, i64* undef, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> undef, i64 [[OR_1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[BAR5]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = and <2 x i64> [[TMP3]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[FOO_1]] to <2 x i64>* +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP1]], [[TMP4]] ; CHECK-NEXT: [[BAR3:%.*]] = getelementptr inbounds [[CLASS_2:%.*]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 0 ; CHECK-NEXT: [[BAR4:%.*]] = getelementptr inbounds [[CLASS_2]], %class.2* undef, i64 0, i32 0, i32 0, i32 0, i64 1 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* -; CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast i64* [[BAR3]] to <2 x i64>* +; CHECK-NEXT: store <2 x i64> [[TMP5]], <2 x i64>* [[TMP6]], align 8 ; CHECK-NEXT: ret void ; for.body.lr.ph.i: Index: test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -43,22 +43,16 @@ ; CHECK-LABEL: @add1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[ADD3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[ADD6]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[ADD9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -86,22 +80,16 @@ ; CHECK-LABEL: @sub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = add nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -205,22 +193,18 @@ ; CHECK-LABEL: @addsub0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = add nsw i32 [[TMP2]], -2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SUB5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -248,22 +232,18 @@ ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = sub nsw i32 [[TMP1]], -1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SUB1]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP3]], -3 -; CHECK-NEXT: store i32 [[SUB8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -291,22 +271,16 @@ ; CHECK-LABEL: @mul( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 257 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[MUL]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[MUL3:%.*]] = mul nsw i32 [[TMP1]], -3 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[MUL3]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP2]], i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[MUL9:%.*]] = mul nsw i32 [[TMP3]], -9 -; CHECK-NEXT: store i32 [[MUL9]], i32* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -334,22 +308,16 @@ ; CHECK-LABEL: @shl0( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[TMP0]], i32* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[TMP1]], 1 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 2 -; CHECK-NEXT: store i32 [[SHL]], i32* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, i32* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SHL5:%.*]] = shl i32 [[TMP2]], 2 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, i32* [[DST]], i64 3 -; CHECK-NEXT: store i32 [[SHL5]], i32* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SHL8:%.*]] = shl i32 [[TMP3]], 3 -; CHECK-NEXT: store i32 [[SHL8]], i32* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[SRC]] to <4 x i32>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shl <4 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[DST]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -453,22 +421,16 @@ ; CHECK-LABEL: @add1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[TMP0]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[ADD3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], 2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], 3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -496,22 +458,16 @@ ; CHECK-LABEL: @sub0f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -615,22 +571,18 @@ ; CHECK-LABEL: @addsub0f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[SUB5:%.*]] = fadd fast float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[SUB5]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -658,22 +610,18 @@ ; CHECK-LABEL: @addsub1f( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB1:%.*]] = fsub fast float [[TMP1]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = fsub fast float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[SUB8]], float* [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x float> [[TMP1]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP4]], <4 x float>* [[TMP5]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -701,22 +649,16 @@ ; CHECK-LABEL: @mulf( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = fmul fast float [[TMP0]], 2.570000e+02 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[SUB]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[SUB3:%.*]] = fmul fast float [[TMP1]], -3.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[SUB3]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[TMP2]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[SUB9:%.*]] = fmul fast float [[TMP3]], -9.000000e+00 -; CHECK-NEXT: store float [[SUB9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fmul fast <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -825,22 +767,16 @@ ; CHECK-LABEL: @sub0fn( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[SRC]], align 4 -; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP0]], -1.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 1 -; CHECK-NEXT: store float [[ADD]], float* [[DST]], align 4 ; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds float, float* [[DST]], i64 2 -; CHECK-NEXT: store float [[TMP1]], float* [[INCDEC_PTR1]], align 4 ; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 3 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[ADD6:%.*]] = fadd float [[TMP2]], -2.000000e+00 ; CHECK-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds float, float* [[DST]], i64 3 -; CHECK-NEXT: store float [[ADD6]], float* [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load float, float* [[INCDEC_PTR5]], align 4 -; CHECK-NEXT: [[ADD9:%.*]] = fadd float [[TMP3]], -3.000000e+00 -; CHECK-NEXT: store float [[ADD9]], float* [[INCDEC_PTR7]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[SRC]] to <4 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[DST]] to <4 x float>* +; CHECK-NEXT: store <4 x float> [[TMP2]], <4 x float>* [[TMP3]], align 4 ; CHECK-NEXT: ret void ; entry: