diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -204,6 +204,8 @@ return false; BasicBlock *BB = I0->getParent(); for (int i = 1, e = VL.size(); i < e; i++) { + if (isa(VL[i])) + continue; Instruction *I = dyn_cast(VL[i]); if (!I) return false; @@ -394,9 +396,18 @@ /// could be vectorized even if its structure is diverse. static InstructionsState getSameOpcode(ArrayRef VL, unsigned BaseIndex = 0) { - // Make sure these are all Instructions. - if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) + // Make sure these are all Instructions or UndefValues. + if (llvm::any_of(VL, + [](Value *V) { + return !isa(V) && !isa(V); + }) || + llvm::all_of(VL, [](Value *V) { return isa(V); })) return InstructionsState(VL[BaseIndex], nullptr, nullptr); + for (unsigned I = BaseIndex, E = VL.size(); I < E; I++) + if (isa(VL[I])) { + BaseIndex = I; + break; + } bool IsCastOp = isa(VL[BaseIndex]); bool IsBinOp = isa(VL[BaseIndex]); @@ -407,6 +418,8 @@ // Check for one alternate opcode from another BinaryOperator. // TODO - generalize to support all operators (types, calls etc.). for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { + if (isa(VL[Cnt])) + continue; unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); if (IsBinOp && isa(VL[Cnt])) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) @@ -746,6 +759,7 @@ /// accessing a consecutive address. These strategies are summarized in the /// 'ReorderingMode' enumerator. enum class ReorderingMode { + Unknown, ///< Mode is not defined yet Load, ///< Matching loads to consecutive memory addresses Opcode, ///< Matching instructions based on opcode (same or alternate) Constant, ///< Matching constants @@ -761,6 +775,7 @@ const DataLayout &DL; ScalarEvolution &SE; const BoUpSLP &R; + Instruction &VL0; /// \returns the operand data at \p OpIdx and \p Lane. 
OperandData &getData(unsigned OpIdx, unsigned Lane) { @@ -1070,6 +1085,8 @@ break; case ReorderingMode::Failed: return None; + case ReorderingMode::Unknown: + llvm_unreachable("Unknown mode is not expected here."); } } @@ -1110,10 +1127,17 @@ // a map. Instead we can simply count the number of operands that // correspond to one of them (in this case the 'true' APO), and calculate // the other by subtracting it from the total number of operands. - for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) - if (getData(OpIdx, Lane).APO) + unsigned UndefsCnt = 0; + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + const OperandData &OpData = getData(OpIdx, Lane); + if (isa(OpData.V)) { + ++UndefsCnt; + continue; + } + if (OpData.APO) ++CntTrue; - unsigned CntFalse = NumOperands - CntTrue; + } + unsigned CntFalse = NumOperands - CntTrue - UndefsCnt; return std::max(CntTrue, CntFalse); } @@ -1122,13 +1146,18 @@ assert(!VL.empty() && "Bad VL"); assert((empty() || VL.size() == getNumLanes()) && "Expected same number of lanes"); - assert(isa(VL[0]) && "Expected instruction"); - unsigned NumOperands = cast(VL[0])->getNumOperands(); + unsigned NumOperands = VL0.getNumOperands(); OpsVec.resize(NumOperands); unsigned NumLanes = VL.size(); for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { OpsVec[OpIdx].resize(NumLanes); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + if (isa(VL[Lane])) { + OpsVec[OpIdx][Lane] = { + UndefValue::get(VL0.getOperand(OpIdx)->getType()), false, + false}; + continue; + } assert(isa(VL[Lane]) && "Expected instruction"); // Our tree has just 3 nodes: the root and two operands. // It is therefore trivial to get the APO. We only need to check the @@ -1193,9 +1222,9 @@ public: /// Initialize with all the operands of the instruction vector \p RootVL. 
- VLOperands(ArrayRef RootVL, const DataLayout &DL, + VLOperands(Instruction &VL0, ArrayRef RootVL, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) - : DL(DL), SE(SE), R(R) { + : DL(DL), SE(SE), R(R), VL0(VL0) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -1220,7 +1249,8 @@ // Each operand has its own mode. We are using this mode to help us select // the instructions for each lane, so that they match best with the ones // we have selected so far. - SmallVector ReorderingModes(NumOperands); + SmallVector ReorderingModes(NumOperands, + ReorderingMode::Unknown); // This is a greedy single-pass algorithm. We are going over each lane // once and deciding on the best order right away with no back-tracking. @@ -1314,6 +1344,8 @@ #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) { switch (RMode) { + case ReorderingMode::Unknown: + return "Unknown"; case ReorderingMode::Load: return "Load"; case ReorderingMode::Opcode: @@ -1400,7 +1432,8 @@ /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - int getGatherCost(Type *Ty, const DenseSet &ShuffledIndices) const; + int getGatherCost(Type *Ty, const DenseSet &ShuffledIndices, + const DenseSet &IgnoredIndices) const; /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -1420,12 +1453,10 @@ /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. 
- static void reorderInputsAccordingToOpcode(ArrayRef VL, - SmallVectorImpl &Left, - SmallVectorImpl &Right, - const DataLayout &DL, - ScalarEvolution &SE, - const BoUpSLP &R); + static void reorderInputsAccordingToOpcode( + Instruction &VL0, ArrayRef VL, SmallVectorImpl &Left, + SmallVectorImpl &Right, const DataLayout &DL, + ScalarEvolution &SE, const BoUpSLP &R); struct TreeEntry { using VecTreeTy = SmallVector, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -1493,15 +1524,19 @@ } /// Set the operands of this bundle in their original order. - void setOperandsInOrder() { + void setOperandsInOrder(Instruction *I0) { assert(Operands.empty() && "Already initialized?"); - auto *I0 = cast(Scalars[0]); Operands.resize(I0->getNumOperands()); unsigned NumLanes = Scalars.size(); for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); OpIdx != NumOperands; ++OpIdx) { Operands[OpIdx].resize(NumLanes); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + if (isa(Scalars[Lane])) { + Operands[OpIdx][Lane] = + UndefValue::get(I0->getOperand(OpIdx)->getType()); + continue; + } auto *I = cast(Scalars[Lane]); assert(I->getNumOperands() == NumOperands && "Expected same number of operands"); @@ -1649,11 +1684,15 @@ ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; Last->setOperations(S); - if (Vectorized) { - for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = Last; + for (Value *V : make_filter_range(VL, Instruction::classof)) { + if (Vectorized) { + assert(!getTreeEntry(V) && "Scalar already in tree!"); + ScalarToTreeEntry[V] = Last; + } else { + MustGather.insert(V); } + } + if (Vectorized) { // Update the scheduler bundle to point to this TreeEntry. 
unsigned Lane = 0; for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; @@ -1662,10 +1701,9 @@ BundleMember->Lane = Lane; ++Lane; } - assert((!Bundle.getValue() || Lane == VL.size()) && + assert((!Bundle.getValue() || + Lane == llvm::count_if(VL, Instruction::classof)) && "Bundle and VL out of sync"); - } else { - MustGather.insert(VL.begin(), VL.end()); } if (UserTreeIdx.UserTE) @@ -2387,6 +2425,8 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (isa(Scalar)) + continue; int FoundLane = Lane; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = @@ -2410,16 +2450,19 @@ // Skip in-tree scalars that become vectors if (TreeEntry *UseEntry = getTreeEntry(U)) { - Value *UseScalar = UseEntry->Scalars[0]; - // Some in-tree scalars will remain as scalar in vectorized - // instructions. If that is the case, the one in Lane 0 will - // be used. - if (UseScalar != U || - !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { - LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U - << ".\n"); - assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); - continue; + auto It = llvm::find_if(UseEntry->Scalars, Instruction::classof); + if (It != UseEntry->Scalars.end()) { + Value *UseScalar = *It; + // Some in-tree scalars will remain as scalar in vectorized + // instructions. If that is the case, the one in Lane 0 will + // be used. + if (UseScalar != U || + !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { + LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U + << ".\n"); + assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state"); + continue; + } } } @@ -2472,7 +2515,7 @@ // Don't vectorize ephemeral values. 
for (Value *V : VL) { - if (EphValues.count(V)) { + if (!isa(V) && EphValues.count(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is ephemeral.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); @@ -2498,10 +2541,7 @@ // Check that none of the instructions in the bundle are already in the tree. for (Value *V : VL) { - auto *I = dyn_cast(V); - if (!I) - continue; - if (getTreeEntry(I)) { + if (!isa(V) && getTreeEntry(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); @@ -2513,7 +2553,8 @@ // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. for (Value *V : VL) { - if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { + if (!isa(V) && + (MustGather.count(V) || is_contained(UserIgnoreList, V))) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; @@ -2538,6 +2579,11 @@ SmallVector UniqueValues; DenseMap UniquePositions; for (Value *V : VL) { + if (isa(V)) { + ReuseShuffleIndicies.emplace_back(UniqueValues.size()); + UniqueValues.emplace_back(V); + continue; + } auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); if (Res.second) @@ -2548,14 +2594,21 @@ ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (NumUniqueScalarValues <= 1 || - !llvm::isPowerOf2_32(NumUniqueScalarValues)) { - LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; - } + UniqueValues.append(VL.size() - UniqueValues.size(), + UndefValue::get(VL0->getType())); VL = UniqueValues; } + const unsigned NumberOfInstructions = + llvm::count_if(VL, Instruction::classof); + if (NumberOfInstructions <= 1) { + LLVM_DEBUG( + dbgs() + << "SLP: 
Gathering due to vectorization of single instruction.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + return; + } + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); auto &BSRef = BlocksSchedules[BB]; if (!BSRef) @@ -2582,11 +2635,11 @@ auto *PH = cast(VL0); // Check for terminator values (e.g. invoke). - for (unsigned j = 0; j < VL.size(); ++j) - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - Instruction *Term = dyn_cast( - cast(VL[j])->getIncomingValueForBlock( - PH->getIncomingBlock(i))); + for (Value *V : InstructionsOnly) + for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { + auto *Term = + dyn_cast(cast(V)->getIncomingValueForBlock( + PH->getIncomingBlock(I))); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); @@ -2603,13 +2656,16 @@ // Keeps the reordered operands to avoid code duplication. SmallVector OperandsVec; - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getIncomingValueForBlock( - PH->getIncomingBlock(i))); - TE->setOperand(i, Operands); + for (Value *V : VL) { + Operands.emplace_back( + isa(V) ? UndefValue::get(V->getType()) + : cast(V)->getIncomingValueForBlock( + PH->getIncomingBlock(I))); + } + TE->setOperand(I, Operands); OperandsVec.push_back(Operands); } for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) @@ -2681,9 +2737,22 @@ // Make sure all loads in the bundle are simple - we can't vectorize // atomic or volatile loads. 
- SmallVector PointerOps(VL.size()); - auto POIter = PointerOps.begin(); + SmallVector PointerOps(NumberOfInstructions); + auto *POIter = PointerOps.begin(); + bool TypeIsSimple = VL0->getType()->isIntegerTy() || + VL0->getType()->isFloatingPointTy() || + VL0->getType()->isPointerTy(); for (Value *V : VL) { + if (isa(V)) { + if (!TypeIsSimple) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); + return; + } + continue; + } auto *L = cast(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); @@ -2714,22 +2783,24 @@ dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { + if (Diff && Diff->getAPInt() == (NumberOfInstructions - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { + CurrentOrder.append(VL.size() - NumberOfInstructions, + VL.size() + 1); // Need to reorder. 
auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, I->getFirst()); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } return; @@ -2756,6 +2827,8 @@ case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); for (Value *V : VL) { + if (isa(V)) + continue; Type *Ty = cast(V)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); @@ -2770,12 +2843,15 @@ ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *V : VL) - Operands.push_back(cast(V)->getOperand(i)); + for (Value *V : VL) { + Operands.push_back(isa(V) + ? UndefValue::get(SrcTy) + : cast(V)->getOperand(i)); + } buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2788,7 +2864,9 @@ CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); Type *ComparedTy = VL0->getOperand(0)->getType(); for (Value *V : VL) { - CmpInst *Cmp = cast(V); + if (isa(V)) + continue; + auto *Cmp = cast(V); if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); @@ -2809,10 +2887,15 @@ // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. 
for (Value *V : VL) { + if (isa(V)) { + Left.push_back(UndefValue::get(VL0->getOperand(0)->getType())); + Right.push_back(UndefValue::get(VL0->getOperand(1)->getType())); + continue; + } auto *Cmp = cast(V); Value *LHS = Cmp->getOperand(0); Value *RHS = Cmp->getOperand(1); @@ -2856,7 +2939,7 @@ // have the same opcode. if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -2864,20 +2947,48 @@ return; } - TE->setOperandsInOrder(); + SmallVector OperandsVec; for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, {TE, i}); + Value *LastDefined = nullptr; + for (Value *V : VL) { + Value *OpV; + if (isa(V)) { + if (BinaryOperator::isIntDivRem(ShuffleOrOp)) { + if (LastDefined) + OpV = LastDefined; + else + OpV = ConstantInt::get(VL0->getOperand(i)->getType(), 1); + } else { + OpV = UndefValue::get(VL0->getOperand(i)->getType()); + } + } else { + OpV = cast(V)->getOperand(i); + if (isa(OpV) && + BinaryOperator::isIntDivRem(ShuffleOrOp)) { + if (LastDefined) + OpV = LastDefined; + else + OpV = ConstantInt::get(VL0->getOperand(i)->getType(), 1); + } else { + LastDefined = OpV; + } + } + Operands.push_back(OpV); + } + TE->setOperand(i, Operands); + OperandsVec.push_back(Operands); } + for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) + buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx}); return; } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. 
for (Value *V : VL) { + if (isa(V)) + continue; if (cast(V)->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); @@ -2891,6 +3002,8 @@ // different types. Type *Ty0 = VL0->getOperand(0)->getType(); for (Value *V : VL) { + if (isa(V)) + continue; Type *CurTy = cast(V)->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() @@ -2905,7 +3018,9 @@ // We don't combine GEPs with non-constant indexes. Type *Ty1 = VL0->getOperand(1)->getType(); for (Value *V : VL) { - auto Op = cast(V)->getOperand(1); + if (isa(V)) + continue; + auto *Op = cast(V)->getOperand(1); if (!isa(Op) || (Op->getType() != Ty1 && Op->getType()->getScalarSizeInBits() > @@ -2923,12 +3038,16 @@ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *V : VL) - Operands.push_back(cast(V)->getOperand(i)); + for (Value *V : VL) { + Operands.push_back( + isa(V) + ? UndefValue::get(VL0->getOperand(i)->getType()) + : cast(V)->getOperand(i)); + } buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2939,11 +3058,16 @@ llvm::Type *ScalarTy = cast(VL0)->getValueOperand()->getType(); // Make sure all stores in the bundle are simple - we can't vectorize // atomic or volatile stores. 
- SmallVector PointerOps(VL.size()); + SmallVector PointerOps(NumberOfInstructions); ValueList Operands(VL.size()); auto POIter = PointerOps.begin(); auto OIter = Operands.begin(); for (Value *V : VL) { + if (isa(V)) { + *OIter = UndefValue::get(VL0->getOperand(0)->getType()); + ++OIter; + continue; + } auto *SI = cast(V); if (!SI->isSimple()) { BS.cancelScheduling(VL, VL0); @@ -2957,49 +3081,64 @@ ++POIter; ++OIter; } + if (NumberOfInstructions != VL.size() && + !(VL0->getOperand(0)->getType()->isIntegerTy() || + VL0->getOperand(0)->getType()->isFloatingPointTy() || + VL0->getOperand(0)->getType()->isPointerTy())) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG( + dbgs() << "SLP: Non-consecutive store of non-scalar values.\n"); + return; + } OrdersType CurrentOrder; + if (!llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); + return; + } // Check the order of pointer operands. - if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { - Value *Ptr0; - Value *PtrN; + Value *Ptr0; + Value *PtrN; + if (CurrentOrder.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[CurrentOrder.front()]; + PtrN = PointerOps[CurrentOrder.back()]; + } + const SCEV *Scev0 = SE->getSCEV(Ptr0); + const SCEV *ScevN = SE->getSCEV(PtrN); + const auto *Diff = dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); + uint64_t Size = DL->getTypeAllocSize(ScalarTy); + // Check that the sorted pointer operands are consecutive. + if (Diff && Diff->getAPInt() == (NumberOfInstructions - 1) * Size) { if (CurrentOrder.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); + // Original stores are consecutive and does not require reordering. 
+ ++NumOpsWantToKeepOriginalOrder; + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(VL0); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); } else { - Ptr0 = PointerOps[CurrentOrder.front()]; - PtrN = PointerOps[CurrentOrder.back()]; - } - const SCEV *Scev0 = SE->getSCEV(Ptr0); - const SCEV *ScevN = SE->getSCEV(PtrN); - const auto *Diff = - dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); - uint64_t Size = DL->getTypeAllocSize(ScalarTy); - // Check that the sorted pointer operands are consecutive. - if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { - if (CurrentOrder.empty()) { - // Original stores are consecutive and does not require reordering. - ++NumOpsWantToKeepOriginalOrder; - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, - UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - buildTree_rec(Operands, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); - } else { - // Need to reorder. - auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; - ++(I->getSecond()); - TreeEntry *TE = - newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, I->getFirst()); - TE->setOperandsInOrder(); - buildTree_rec(Operands, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); - } - return; + CurrentOrder.append(VL.size() - NumberOfInstructions, VL.size() + 1); + // Need to reorder. 
+ auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; + ++(I->getSecond()); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, I->getFirst()); + TE->setOperandsInOrder(VL0); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); } + return; } - BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); @@ -3026,6 +3165,13 @@ if (hasVectorInstrinsicScalarOpd(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); for (Value *V : VL) { + if (isa(V)) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); + return; + } CallInst *CI2 = dyn_cast(V); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || @@ -3033,8 +3179,8 @@ BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V - << "\n"); + LLVM_DEBUG(dbgs() + << "SLP: mismatched calls:" << *CI << "!=" << *V << "\n"); return; } // Some intrinsics have scalar arguments and should be same in order for @@ -3069,7 +3215,7 @@ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -3098,7 +3244,7 @@ // Reorder operands if reordering would enable vectorization. 
if (isa(VL0)) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -3106,12 +3252,15 @@ return; } - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast(V)->getOperand(i)); + Operands.push_back( + isa(V) + ? UndefValue::get(VL0->getOperand(i)->getType()) + : cast(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -3180,9 +3329,6 @@ NElts = Vec->getType()->getVectorNumElements(); } - if (NElts != VL.size()) - return false; - // Check that all of the indices extract from the correct offset. bool ShouldKeepOrder = true; unsigned E = VL.size(); @@ -3192,8 +3338,10 @@ // consecutive access in the extract instructions, by checking that no // element of CurrentOrder still has value E + 1. CurrentOrder.assign(E, E + 1); - unsigned I = 0; - for (; I < E; ++I) { + unsigned I = 0, End = std::min(NElts, E); + for (; I < End; ++I) { + if (isa(VL[I])) + continue; auto *Inst = cast(VL[I]); if (Inst->getOperand(0) != Vec) break; @@ -3212,10 +3360,15 @@ CurrentOrder[I] = I; } } - if (I < E) { + // Gather if not all extracts are from the same vector/aggregate. 
+ if (I < End || (NElts < E && !all_of(drop_begin(VL, NElts), [](Value *V) { + return isa(V); + }))) { CurrentOrder.clear(); return false; } + if (NElts != E) + return false; return ShouldKeepOrder; } @@ -3230,18 +3383,29 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef VL = E->Scalars; + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); + const unsigned NumOfInstructions = + llvm::count_if(InstructionsOnly, [](Value *V) { return true; }); + Value *V0; Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (CmpInst *CI = dyn_cast(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - - // If we have computed a smaller type for the expression, update VecTy so - // that the costs will be accurate. - if (MinBWs.count(VL[0])) - VecTy = VectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + VectorType *VecTy; + if (!llvm::empty(InstructionsOnly)) { + V0 = *InstructionsOnly.begin(); + if (StoreInst *SI = dyn_cast(V0)) + ScalarTy = SI->getValueOperand()->getType(); + else if (CmpInst *CI = dyn_cast(V0)) + ScalarTy = CI->getOperand(0)->getType(); + VecTy = VectorType::get(ScalarTy, VL.size()); + + // If we have computed a smaller type for the expression, update VecTy so + // that the costs will be accurate. 
+ if (MinBWs.count(V0)) { + VecTy = VectorType::get( + IntegerType::get(F->getContext(), MinBWs[V0].first), VL.size()); + } + } else { + VecTy = VectorType::get(ScalarTy, VL.size()); + } unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -3259,10 +3423,15 @@ } if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL)) { - Optional ShuffleKind = isShuffle(VL); - if (ShuffleKind.hasValue()) { - int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); - for (auto *V : VL) { + Optional ShuffleKind = + NumOfInstructions > 1 + ? isShuffle(llvm::to_vector<4>(InstructionsOnly)) + : None; + if (NumOfInstructions == 1 || ShuffleKind) { + int Cost = NumOfInstructions > 1 + ? TTI->getShuffleCost(*ShuffleKind, VecTy) + : 0; + for (Value *V : InstructionsOnly) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the @@ -3271,8 +3440,10 @@ !ScalarToTreeEntry.count(V)) { auto *IO = cast( cast(V)->getIndexOperand()); - Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, - IO->getZExtValue()); + Cost -= TTI->getVectorInstrCost( + Instruction::ExtractElement, + cast(V)->getVectorOperandType(), + IO->getZExtValue()); } } return ReuseShuffleCost + Cost; @@ -3294,12 +3465,15 @@ if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { + if (isa(VL[I])) + continue; if (ShuffleOrOp == Instruction::ExtractElement) { auto *IO = cast( cast(VL[I])->getIndexOperand()); Idx = IO->getZExtValue(); ReuseShuffleCost -= TTI->getVectorInstrCost( - Instruction::ExtractElement, VecTy, Idx); + Instruction::ExtractElement, + cast(VL[I])->getVectorOperandType(), Idx); } else { ReuseShuffleCost -= TTI->getVectorInstrCost( Instruction::ExtractElement, VecTy, Idx); @@ -3308,47 +3482,73 @@ } Idx = 
ReuseShuffleNumbers; for (Value *V : VL) { + if (isa(V)) + continue; if (ShuffleOrOp == Instruction::ExtractElement) { auto *IO = cast( cast(V)->getIndexOperand()); Idx = IO->getZExtValue(); + ReuseShuffleCost += TTI->getVectorInstrCost( + Instruction::ExtractElement, + cast(V)->getVectorOperandType(), Idx); } else { --Idx; + ReuseShuffleCost += TTI->getVectorInstrCost( + Instruction::ExtractElement, VecTy, Idx); } - ReuseShuffleCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } +#ifndef NDEBUG + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); + assert(Reuse && E->ReorderIndices.empty() || + (!Reuse && CurrentOrder.size() == E->ReorderIndices.size() && + std::equal(CurrentOrder.begin(), CurrentOrder.end(), + E->ReorderIndices.begin())) && + "The sequence of extract elements must be reused or shuffled " + "with the same mask."); +#endif int DeadCost = ReuseShuffleCost; if (!E->ReorderIndices.empty()) { // TODO: Merge this shuffle with the ReuseShuffleCost. DeadCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *E = cast(VL[i]); + for (unsigned I = 0, E = VL.size(); I < E; ++I) { + if (isa(VL[I])) + continue; + auto *EI = cast(VL[I]); // If all users are going to be vectorized, instruction can be // considered as dead. // The same, if have only one user, it will be vectorized for sure. - if (areAllUsersVectorized(E)) { + if (areAllUsersVectorized(EI)) { // Take credit for instruction that will become dead. - if (E->hasOneUse()) { - Instruction *Ext = E->user_back(); + if (EI->hasOneUse()) { + Instruction *Ext = EI->user_back(); if ((isa(Ext) || isa(Ext)) && all_of(Ext->users(), [](User *U) { return isa(U); })) { // Use getExtractWithExtendCost() to calculate the cost of // extractelement/ext pair. 
DeadCost -= TTI->getExtractWithExtendCost( - Ext->getOpcode(), Ext->getType(), VecTy, i); + Ext->getOpcode(), Ext->getType(), VecTy, I); // Add back the cost of s|zext which is subtracted separately. DeadCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), E->getType(), Ext); + Ext->getOpcode(), Ext->getType(), EI->getType(), Ext); continue; } } - DeadCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *IO = cast( + cast(EI)->getIndexOperand()); + unsigned Idx = IO->getZExtValue(); + DeadCost -= TTI->getVectorInstrCost( + Instruction::ExtractElement, + cast(EI)->getVectorOperandType(), Idx); + } else { + DeadCost -= + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); + } } } return DeadCost; @@ -3369,11 +3569,12 @@ int ScalarEltCost = TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } // Calculate the cost of this instruction. 
- int ScalarCost = VL.size() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); int VecCost = 0; @@ -3391,10 +3592,11 @@ int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } @@ -3433,24 +3635,27 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. - ConstantInt *CInt0 = nullptr; + Constant *C0 = nullptr; for (unsigned i = 0, e = VL.size(); i < e; ++i) { + if (isa(VL[i])) + continue; const Instruction *I = cast(VL[i]); unsigned OpIdx = isa(I) ? 1 : 0; ConstantInt *CInt = dyn_cast(I->getOperand(OpIdx)); - if (!CInt) { + Constant *UV = dyn_cast(I->getOperand(OpIdx)); + if (!CInt && !UV) { Op2VK = TargetTransformInfo::OK_AnyValue; Op2VP = TargetTransformInfo::OP_None; break; } if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) + (UV || !cast(CInt)->getValue().isPowerOf2())) Op2VP = TargetTransformInfo::OP_None; if (i == 0) { - CInt0 = CInt; + C0 = CInt ? CInt : UV; continue; } - if (CInt0 != CInt) + if (C0 != (CInt ? 
CInt : UV)) Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } @@ -3458,9 +3663,10 @@ int ScalarEltCost = TTI->getArithmeticInstrCost( E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; int VecCost = TTI->getArithmeticInstrCost( E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); return ReuseShuffleCost + VecCost - ScalarCost; @@ -3474,9 +3680,10 @@ int ScalarEltCost = TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; int VecCost = TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); return ReuseShuffleCost + VecCost - ScalarCost; @@ -3487,12 +3694,33 @@ int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; + } + int ScalarLdCost = NumOfInstructions * ScalarEltCost; + int VecLdCost; + auto FirstInstr = llvm::find_if(VL, Instruction::classof); + auto LastInstr = llvm::find_if(llvm::reverse(VL), Instruction::classof); + unsigned InstrDist = std::distance(FirstInstr, LastInstr.base()); + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(InstrDist); + if (!IsPowOf2NumOfInstructions || InstrDist == 1) { + VecLdCost = TTI->getIntrinsicInstrCost( + Intrinsic::masked_load, VecTy, + 
{VecTy->getPointerTo(), Builder.getInt32Ty(), + VectorType::get(Builder.getInt1Ty(), + VecTy->getVectorNumElements()), + VecTy}, FastMathFlags()); + } else if (InstrDist == VL.size()) { + VecLdCost = + TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0); + } else { + VectorType *LoadVecTy = VectorType::get(ScalarTy, InstrDist); + VecLdCost = TTI->getMemoryOpCost(Instruction::Load, LoadVecTy, + alignment, 0, VL0) ; } - int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; - int VecLdCost = - TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0); - if (!E->ReorderIndices.empty()) { + if (!NeedToShuffleReuses && + (!E->ReorderIndices.empty() || + (IsPowOf2NumOfInstructions && InstrDist != VL.size()))) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecLdCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); @@ -3508,10 +3736,21 @@ int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0); if (NeedToShuffleReuses) - ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost; - int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - int VecStCost = TTI->getMemoryOpCost(Instruction::Store, - VecTy, Alignment, 0, VL0); + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; + int ScalarStCost = NumOfInstructions * ScalarEltCost; + int VecStCost; + if (NumOfInstructions != VL.size()) { + VecStCost = TTI->getIntrinsicInstrCost( + Intrinsic::masked_store, Builder.getVoidTy(), + {VecTy, VecTy->getPointerTo(), Builder.getInt32Ty(), + VectorType::get(Builder.getInt1Ty(), + VecTy->getVectorNumElements())}, + FastMathFlags()); + } else { + VecStCost = + TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment, 0, VL0); + } if (IsReorder) { // TODO: Merge this shuffle with the ReuseShuffleCost. 
VecStCost += TTI->getShuffleCost( @@ -3535,9 +3774,10 @@ int ScalarEltCost = TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } - int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCallCost = NumOfInstructions * ScalarEltCost; SmallVector Args(CI->arg_operands()); int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF, @@ -3559,18 +3799,22 @@ int ScalarCost = 0; if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { - Instruction *I = cast(VL[Idx]); + if (isa(VL[Idx])) + continue; + auto *I = cast(VL[Idx]); ReuseShuffleCost -= TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { Instruction *I = cast(V); ReuseShuffleCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } } for (Value *V : VL) { - Instruction *I = cast(V); + if (isa(V)) + continue; + auto *I = cast(V); assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); @@ -3836,12 +4080,15 @@ return Cost; } -int BoUpSLP::getGatherCost(Type *Ty, - const DenseSet &ShuffledIndices) const { +int BoUpSLP::getGatherCost(Type *Ty, const DenseSet &ShuffledIndices, + const DenseSet &IgnoredIndices) const { int Cost = 0; - for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) - if (!ShuffledIndices.count(i)) - Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + for (unsigned I = 0, E = cast(Ty)->getNumElements(); I < E; ++I) { + if (IgnoredIndices.count(I)) + continue; + if (!ShuffledIndices.count(I)) + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, I); + } if (!ShuffledIndices.empty()) Cost += 
TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; @@ -3857,27 +4104,30 @@ // Check if the same elements are inserted several times and count them as // shuffle candidates. DenseSet ShuffledElements; + DenseSet IgnoredElements; DenseSet UniqueElements; // Iterate in reverse order to consider insert elements with the high cost. for (unsigned I = VL.size(); I > 0; --I) { unsigned Idx = I - 1; + if (isa(VL[Idx])) { + IgnoredElements.insert(Idx); + continue; + } if (!UniqueElements.insert(VL[Idx]).second) ShuffledElements.insert(Idx); } - return getGatherCost(VecTy, ShuffledElements); + return getGatherCost(VecTy, ShuffledElements, IgnoredElements); } // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. -void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, - SmallVectorImpl &Left, - SmallVectorImpl &Right, - const DataLayout &DL, - ScalarEvolution &SE, - const BoUpSLP &R) { +void BoUpSLP::reorderInputsAccordingToOpcode( + Instruction &VL0, ArrayRef VL, SmallVectorImpl &Left, + SmallVectorImpl &Right, const DataLayout &DL, ScalarEvolution &SE, + const BoUpSLP &R) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE, R); + VLOperands Ops(VL0, VL, DL, SE, R); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); @@ -3885,15 +4135,17 @@ } void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { + auto InstructionsOnly = make_filter_range(E->Scalars, Instruction::classof); + if (llvm::empty(InstructionsOnly)) + return; // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. 
auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()), - [=](Value *V) -> bool { - auto *I = cast(V); - return !E->isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of(InstructionsOnly, [=](Value *V) -> bool { + auto *I = cast(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + })); // The last instruction in the bundle in program order. Instruction *LastInst = nullptr; @@ -3903,8 +4155,8 @@ // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); + auto *Bundle = BlocksSchedules[BB]->getScheduleData( + E->isOneOf(*llvm::reverse(InstructionsOnly).begin())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -3930,7 +4182,8 @@ // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). if (!LastInst) { - SmallPtrSet Bundle(E->Scalars.begin(), E->Scalars.end()); + SmallPtrSet Bundle(InstructionsOnly.begin(), + InstructionsOnly.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I)) LastInst = &I; @@ -3950,6 +4203,8 @@ Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. 
for (unsigned i = 0; i < Ty->getNumElements(); ++i) { + if (isa(VL[i])) + continue; Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); if (auto *Insrt = dyn_cast(Vec)) { GatherSeq.insert(Insrt); @@ -3983,27 +4238,27 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { InstructionsState S = getSameOpcode(VL); if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { - if (E->isSame(VL)) { - Value *V = vectorizeTree(E); - if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { - // We need to get the vectorized value but without shuffle. - if (auto *SV = dyn_cast(V)) { - V = SV->getOperand(0); - } else { - // Reshuffle to get only unique values. - SmallVector UniqueIdxs; - SmallSet UsedIdxs; - for(unsigned Idx : E->ReuseShuffleIndices) - if (UsedIdxs.insert(Idx).second) - UniqueIdxs.emplace_back(Idx); - V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), - UniqueIdxs); - } - } - return V; + // Check that every instruction appears once in this bundle. 
+ SmallVector UniqueValues; + DenseMap UniquePositions; + UniqueValues.reserve(VL.size()); + for (Value *V : VL) { + if (isa(V)) { + UniqueValues.emplace_back(V); + continue; } + auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + if (Res.second) + UniqueValues.emplace_back(V); } + if (UniqueValues.size() != VL.size()) { + LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); + UniqueValues.append(VL.size() - UniqueValues.size(), + UndefValue::get(VL[0]->getType())); + } + if (TreeEntry *E = getTreeEntry(S.OpValue)) + if (E->isSame(UniqueValues)) + return vectorizeTree(E); } Type *ScalarTy = S.OpValue->getType(); @@ -4047,9 +4302,10 @@ SmallVectorImpl &Mask) { Mask.clear(); const unsigned E = Indices.size(); - Mask.resize(E); + Mask.resize(E, E - 1); for (unsigned I = 0; I < E; ++I) - Mask[Indices[I]] = I; + if (Indices[I] < E) + Mask[Indices[I]] = I; } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { @@ -4065,6 +4321,8 @@ if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); + if (isa(VL0)) + return UndefValue::get(VecTy); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -4084,6 +4342,7 @@ } assert(E->State == TreeEntry::Vectorize && "Unhandled state"); + auto InstructionsOnly = make_filter_range(E->Scalars, Instruction::classof); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); switch (ShuffleOrOp) { @@ -4129,27 +4388,26 @@ OrdersType Mask; inversePermutation(E->ReorderIndices, Mask); Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask, + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask, "reorder_shuffle"); } if (NeedToShuffleReuses) { // TODO: Merge this shuffle with the ReorderShuffleMask. 
if (E->ReorderIndices.empty()) Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), E->ReuseShuffleIndices, "shuffle"); } E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { - LoadInst *LI = cast(E->getSingleOperand(0)); + auto *LI = cast(VL0->getOperand(0)); Builder.SetInsertPoint(LI); - PointerType *PtrTy = - PointerType::get(VecTy, LI->getPointerAddressSpace()); + auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); - LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); - Value *NewV = propagateMetadata(V, E->Scalars); + LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlign()); + Value *NewV = propagateMetadata(V, llvm::to_vector<4>(InstructionsOnly)); if (!E->ReorderIndices.empty()) { OrdersType Mask; inversePermutation(E->ReorderIndices, Mask); @@ -4302,7 +4560,7 @@ RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); + V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly)); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -4325,8 +4583,21 @@ Type *ScalarLoadTy = LI->getType(); unsigned AS = LI->getPointerAddressSpace(); - Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), - VecTy->getPointerTo(AS)); + auto *FirstInstr = llvm::find_if(E->Scalars, Instruction::classof); + auto *LastInstr = + llvm::find_if(llvm::reverse(E->Scalars), Instruction::classof).base(); + unsigned NumOfInstructions = std::distance(FirstInstr, LastInstr); + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(NumOfInstructions); + Value *VecPtr; + if (!IsPowOf2NumOfInstructions || + NumOfInstructions == E->Scalars.size() || NumOfInstructions == 1) { + VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), + VecTy->getPointerTo(AS)); + } else { + VecPtr = 
Builder.CreateBitCast( + LI->getPointerOperand(), + VectorType::get(ScalarTy, NumOfInstructions)->getPointerTo(AS)); + } // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the @@ -4335,12 +4606,41 @@ if (getTreeEntry(PO)) ExternalUses.push_back(ExternalUser(PO, cast(VecPtr), 0)); - MaybeAlign Alignment = MaybeAlign(LI->getAlignment()); - LI = Builder.CreateLoad(VecTy, VecPtr); - if (!Alignment) - Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy)); - LI->setAlignment(Alignment); - Value *V = propagateMetadata(LI, E->Scalars); + Align Alignment = + DL->getValueOrABITypeAlignment(LI->getAlign(), ScalarLoadTy); + Instruction *VecLI; + if (IsPowOf2NumOfInstructions) { + LI = Builder.CreateLoad(VecPtr); + LI->setAlignment(Alignment); + VecLI = LI; + } else { + SmallVector Mask; + SmallVector Passthrough; + Mask.reserve(E->Scalars.size()); + Passthrough.reserve(E->Scalars.size()); + for (auto *V : E->Scalars) { + Mask.emplace_back(Builder.getInt1(!isa(V))); + Passthrough.emplace_back(isa(V) + ? cast(V) + : UndefValue::get(LI->getType())); + } + VecLI = Builder.CreateMaskedLoad(VecPtr, Alignment, + ConstantVector::get(Mask), + ConstantVector::get(Passthrough)); + } + Value *V = propagateMetadata(VecLI, llvm::to_vector<4>(InstructionsOnly)); + if (PowerOf2Ceil(NumOfInstructions) != E->Scalars.size()) { + SmallVector ExtendedIndices(E->Scalars.size(), + NumOfInstructions); + const unsigned Dist = std::distance(E->Scalars.begin(), FirstInstr); + auto *LI = FirstInstr; + for (unsigned I = 0; I < NumOfInstructions; ++I, ++LI) { + if (!isa(*LI)) + ExtendedIndices[I] = Dist + I; + } + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), + ExtendedIndices, "load.extend"); + } if (IsReorder) { OrdersType Mask; inversePermutation(E->ReorderIndices, Mask); @@ -4360,7 +4660,8 @@ bool IsReorder = !E->ReorderIndices.empty(); auto *SI = cast( IsReorder ? 
E->Scalars[E->ReorderIndices.front()] : VL0); - unsigned Alignment = SI->getAlignment(); + Align Alignment = DL->getValueOrABITypeAlignment( + SI->getAlign(), SI->getValueOperand()->getType()); unsigned AS = SI->getPointerAddressSpace(); setInsertPointAfterBundle(E); @@ -4374,9 +4675,48 @@ "reorder_shuffle"); } Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast( - ScalarPtr, VecValue->getType()->getPointerTo(AS)); - StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); + + auto *FirstInstr = llvm::find_if(E->Scalars, Instruction::classof); + auto *LastInstr = + llvm::find_if(llvm::reverse(E->Scalars), Instruction::classof).base(); + unsigned NumOfInstructions = std::distance(FirstInstr, LastInstr); + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(NumOfInstructions); + if (PowerOf2Ceil(NumOfInstructions) != E->Scalars.size()) { + SmallVector ExtendedIndices(NumOfInstructions, + NumOfInstructions); + const unsigned Dist = std::distance(E->Scalars.begin(), FirstInstr); + auto *VI = FirstInstr; + for (unsigned I = 0; I < NumOfInstructions; ++I, ++VI) { + if (!isa(*VI)) + ExtendedIndices[I] = Dist + I; + } + VecValue = Builder.CreateShuffleVector( + VecValue, UndefValue::get(VecValue->getType()), ExtendedIndices, + "values.extend"); + } + Value *VecPtr; + if (!IsPowOf2NumOfInstructions || + NumOfInstructions == E->Scalars.size() || NumOfInstructions == 1) { + VecPtr = Builder.CreateBitCast(ScalarPtr, + VecValue->getType()->getPointerTo(AS)); + } else { + VecPtr = Builder.CreateBitCast( + ScalarPtr, + VectorType::get(ScalarTy, NumOfInstructions)->getPointerTo(AS)); + } + Instruction *VecSI; + if (IsPowOf2NumOfInstructions) { + StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); + ST->setAlignment(Alignment); + VecSI = ST; + } else { + SmallVector Mask; + Mask.reserve(E->Scalars.size()); + for (auto *V : E->Scalars) + Mask.emplace_back(Builder.getInt1(!isa(V))); + VecSI = Builder.CreateMaskedStore(VecValue, VecPtr, Alignment, + 
ConstantVector::get(Mask)); + } // The pointer operand uses an in-tree scalar, so add the new BitCast to // ExternalUses to make sure that an extract will be generated in the @@ -4384,11 +4724,7 @@ if (getTreeEntry(ScalarPtr)) ExternalUses.push_back(ExternalUser(ScalarPtr, cast(VecPtr), 0)); - if (!Alignment) - Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - - ST->setAlignment(Align(Alignment)); - Value *V = propagateMetadata(ST, E->Scalars); + Value *V = propagateMetadata(VecSI, llvm::to_vector<4>(InstructionsOnly)); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); @@ -4427,7 +4763,7 @@ Value *V = Builder.CreateGEP( cast(VL0)->getSourceElementType(), Op0, OpVecs); if (Instruction *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); + V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly)); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -4530,6 +4866,11 @@ unsigned e = E->Scalars.size(); SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { + if (isa(E->Scalars[i])) { + Mask[i] = Builder.getInt32(i); + OpScalars.push_back(E->Scalars[i]); + continue; + } auto *OpInst = cast(E->Scalars[i]); assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); if (OpInst->getOpcode() == E->getAltOpcode()) { @@ -4547,7 +4888,7 @@ Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); if (Instruction *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); + V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly)); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); @@ -4697,6 +5038,8 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (isa(Scalar)) + continue; #ifndef NDEBUG Type *Ty = Scalar->getType(); @@ -4820,14 +5163,15 @@ bool ReSchedule = false; 
LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); // Make sure that the scheduling region contains all // instructions of the bundle. - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { if (!extendSchedulingRegion(V, S)) return None; } - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); @@ -4906,7 +5250,9 @@ LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); - assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && + assert(Bundle->isSchedulingEntity() && + (Bundle->isPartOfBundle() || + llvm::count_if(VL, Instruction::classof) == 1) && "tried to unbundle something which is not a bundle"); // Un-bundle: make single instructions out of the bundle. 
@@ -5211,7 +5557,9 @@ I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst) != nullptr) && + (getTreeEntry(SD->Inst) != nullptr && + llvm::count_if(getTreeEntry(SD->Inst)->Scalars, + Instruction::classof) > 1) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { @@ -5671,9 +6019,19 @@ const unsigned MinVF = R.getMinVecRegSize() / Sz; unsigned VF = Chain.size(); - if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) + unsigned NewSize = PowerOf2Ceil(VF); + if (!isPowerOf2_32(Sz) || VF < 2 || NewSize < MinVF) return false; + SmallVector FixedChain; + if (NewSize != VF) { + FixedChain.reserve(NewSize); + FixedChain.append(Chain.begin(), Chain.end()); + FixedChain.append(NewSize - Chain.size(), + UndefValue::get(Chain[0]->getType())); + Chain = FixedChain; + VF = NewSize; + } LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx << "\n"); @@ -5681,9 +6039,27 @@ Optional> Order = R.bestOrder(); // TODO: Handle orders of size less than number of elements in the vector. if (Order && Order->size() == Chain.size()) { + SmallVector NewOrder(Order->begin(), Order->end()); + SmallVector UsedIndices(Order->size(), false); + unsigned BoundVal = NewOrder.size() + 1; + for (unsigned I : *Order) + if (I != BoundVal) + UsedIndices[I] = true; + unsigned Idx = 0, E = UsedIndices.size(); + for (unsigned &I : NewOrder) { + if (I == BoundVal) { + // Find first non-used index. + for (; Idx != E; ++Idx) + if (!UsedIndices[Idx]) + break; + // Set correct index. + I = Idx; + ++Idx; + } + } // TODO: reorder tree nodes without tree rebuilding. 
SmallVector ReorderedOps(Chain.rbegin(), Chain.rend()); - llvm::transform(*Order, ReorderedOps.begin(), + llvm::transform(NewOrder, ReorderedOps.begin(), [Chain](const unsigned Idx) { return Chain[Idx]; }); R.buildTree(ReorderedOps); } @@ -5778,8 +6154,9 @@ // register size is a power-of-2? unsigned StartIdx = 0; for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) { - for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { - ArrayRef Slice = makeArrayRef(Operands).slice(Cnt, Size); + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + 2 <= E;) { + ArrayRef Slice = + makeArrayRef(Operands).slice(Cnt, std::min(Size, E - Cnt)); if (!VectorizedStores.count(Slice.front()) && !VectorizedStores.count(Slice.back()) && vectorizeStoreChain(Slice, R, Cnt)) { @@ -5861,9 +6238,9 @@ return false; Instruction *I0 = cast(S.OpValue); - unsigned Sz = R.getVectorElementSize(I0); - unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); - unsigned MaxVF = std::max(PowerOf2Floor(VL.size()), MinVF); + unsigned Pow2VL = PowerOf2Ceil(VL.size()); + unsigned MinVF = 2; + unsigned MaxVF = Pow2VL; if (MaxVF < 2) { R.getORE()->emit([&]() { return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) @@ -5910,7 +6287,7 @@ else OpsWidth = VF; - if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) + if (OpsWidth < 2 || (VF > MinVF && OpsWidth <= VF / 2)) break; ArrayRef Ops = VL.slice(I, OpsWidth); @@ -5923,6 +6300,17 @@ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n"); + SmallVector FixedChain; + if (OpsWidth != VF && (UserCost || !AllowReorder)) { + unsigned NewSize = VF; + FixedChain.reserve(NewSize); + FixedChain.append(Ops.begin(), Ops.end()); + FixedChain.append(NewSize - Ops.size(), + UndefValue::get(Ops[0]->getType())); + Ops = FixedChain; + } + assert(Ops.size() == VF && + "Operations must have same size as vectorization factor."); R.buildTree(Ops); Optional> Order = R.bestOrder(); @@ -5942,6 +6330,8 @@ 
R.computeMinimumValueSizes(); int Cost = R.getTreeCost() - UserCost; + LLVM_DEBUG(dbgs() << "SLP: User vectorization cost: " << -UserCost + << ".\n"); CandidateFound = true; MinCost = std::min(MinCost, Cost); @@ -5959,6 +6349,10 @@ NextInst = I + 1; Changed = true; } + if (UserCost && VF == MaxVF && I == 0) { + UserCost -= TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } } } @@ -6725,9 +7119,26 @@ Optional> Order = V.bestOrder(); // TODO: Handle orders of size less than number of elements in the vector. if (Order && Order->size() == VL.size()) { + SmallVector NewOrder(Order->begin(), Order->end()); + SmallVector UsedIndices(Order->size(), false); + unsigned BoundVal = NewOrder.size() + 1; + for (unsigned I : *Order) + if (I != BoundVal) + UsedIndices[I] = true; + unsigned Idx = 0, E = UsedIndices.size(); + for (unsigned &I : NewOrder) { + if (I == BoundVal) { + // Find first non-used index. + for (; Idx != E; ++Idx) + if (!UsedIndices[Idx]) + break; + // Set correct index. + I = Idx; ++Idx; + } + } // TODO: reorder tree nodes without tree rebuilding. SmallVector ReorderedOps(VL.size()); - llvm::transform(*Order, ReorderedOps.begin(), + llvm::transform(NewOrder, ReorderedOps.begin(), [VL](const unsigned Idx) { return VL[Idx]; }); V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); } @@ -6832,6 +7243,15 @@ return VectorizedTree != nullptr; } + SmallVector getExtraArgs() const { + SmallVector Args(ExtraArgs.size()); + auto It = ExtraArgs.begin(); + for (unsigned I = 0, E = ExtraArgs.size(); I < E; ++I, ++It) { + Args[I] = It->second; + } + return Args; + } + unsigned numReductionValues() const { return ReducedVals.size(); } @@ -7108,6 +7528,15 @@ // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. P = nullptr; + // Try to vectorize ExtraArgs. + // Continue analysis for the instruction from the same basic block + // only to save compile time. 
+ if (++Level < RecursionMaxDepth) + for (auto *Op : HorRdx.getExtraArgs()) + if (VisitedInstrs.insert(Op).second) + if (auto *I = dyn_cast(Op)) + if (!isa(I) && I->getParent() == BB) + Stack.emplace_back(I, Level); continue; } } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll @@ -3,15 +3,17 @@ define void @f1(<2 x i16> %x, i16* %a) { ; CHECK-LABEL: @f1( -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[X:%.*]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[X]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[A:%.*]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR3]] ; CHECK-NEXT: ret void ; %t2 = extractelement <2 x i16> %x, i32 0 @@ -35,15 +37,17 @@ ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 
x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[A]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR3]] ; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] @@ -82,16 +86,17 @@ ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[REORDER_SHUFFLE]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], 
[4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: store i16 [[T3]], i16* [[A]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR3]] ; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll @@ -72,14 +72,18 @@ ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A1]] ; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]] ; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2 -; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64 -; CHECK-NEXT: [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A2]] -; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]] ; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3 -; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64 -; CHECK-NEXT: [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]] -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A3]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[E2]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[E3]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +; 
CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[C2:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C3:%.*]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP6]] +; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]] ; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]] ; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]]) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -203,12 +203,10 @@ ; ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: -; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer -; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 -; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 +; MAX-COST-NEXT: [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 +; MAX-COST-NEXT: [[P1:%.*]] = icmp eq i8 [[P0]], 0 +; MAX-COST-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2) 
to <4 x i8>*), i32 2, <4 x i1> , <4 x i8> undef) +; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <4 x i8> [[TMP0]], ; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 ; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 ; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 @@ -217,22 +215,23 @@ ; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0 ; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 ; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0 +; MAX-COST-NEXT: [[TMP2:%.*]] = insertelement <4 x i1> undef, i1 [[P1]], i32 0 ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0 -; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1 -; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2 -; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3 -; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP2]], i1 [[TMP4]], i32 1 +; MAX-COST-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[TMP6]], i32 2 +; MAX-COST-NEXT: [[TMP8:%.*]] = insertelement <4 x i1> [[TMP7]], i1 [[TMP3]], i32 3 +; MAX-COST-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> , <4 x i32> ; 
MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] -; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] -; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 +; MAX-COST-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P27]] +; MAX-COST-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], [[P29]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP12]], -5 ; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] ; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -187,21 +187,25 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_3_binops( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] 
= mul <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP12:%.*]] = xor <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i32> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 +; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] +; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0 +; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 +; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -6,7 +6,8 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[LOAD_EXTEND]], <4 x i64> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 @@ -35,13 +36,14 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i64> [[LOAD_EXTEND]], <4 x i64> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[REORDER_SHUFFLE]], <4 x i64> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 -; CHECK-NEXT: 
[[TMP4:%.*]] = bitcast i64* [[ST]] to <4 x i64>* -; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* +; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void ; %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1 @@ -65,7 +67,8 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[LOAD_EXTEND]], <4 x i64> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -7,8 +7,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = phi <8 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[RDX_SHUF:%.*]] = 
shufflevector <8 x i32> [[TMP3]], <8 x i32> undef, <8 x i32> @@ -53,17 +53,17 @@ ; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> undef, i32 [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> undef, i32 [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP15]] = insertelement <8 x i32> [[TMP13]], i32 [[TMP14]], i32 1 ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( ; FORCE_REDUCTION-NEXT: entry: ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> +; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <4 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> ; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 ; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], ; FORCE_REDUCTION-NEXT: [[VAL_20:%.*]] = add i32 [[TMP2]], 1496 @@ -106,13 +106,13 @@ ; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 ; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA29]], [[VAL_39]] ; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 -; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x 
i32> undef, i32 [[VAL_41]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 14910, i32 1 -; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> +; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[VAL_40]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 14910, i32 1 +; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> undef, i32 [[VAL_41]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP2]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP8]], [[TMP10]] +; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP8]], [[TMP10]] +; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -4,11 +4,11 @@ define void @mainTest(i32 %param, i32 * %vals, i32 %len) { ; CHECK-LABEL: @mainTest( ; CHECK-NEXT: bci_15.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[PARAM:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> , i32 [[PARAM:%.*]], i32 1 ; CHECK-NEXT: br label [[BCI_15:%.*]] ; CHECK: bci_15: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = phi <16 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = 
shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 ; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4 @@ -24,8 +24,8 @@ ; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0 ; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[V44]], i32 0 -; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> undef, i32 [[V44]], i32 0 +; CHECK-NEXT: [[TMP7]] = insertelement <16 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 ; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]] ; CHECK: loopexit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -8,26 +8,24 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE-LABEL: @sitofp_uitofp( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> ; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to 
float -; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float -; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float +; SSE-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[REORDER_SHUFFLE]] to <4 x float> ; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float ; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float ; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float ; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 @@ -81,26 +79,24 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE-LABEL: @fptosi_fptoui( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector 
<8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> ; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; SSE-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; SSE-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; SSE-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; SSE-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[REORDER_SHUFFLE]] to <4 x i32> ; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 @@ -135,26 +131,24 @@ ; SLM-NEXT: ret <8 x i32> [[R7]] ; 
; AVX-LABEL: @fptosi_fptoui( -; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; AVX-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> ; AVX-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 ; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; AVX-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; AVX-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; AVX-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; AVX-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; AVX-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; AVX-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; AVX-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[REORDER_SHUFFLE]] to <4 x i32> ; AVX-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; AVX-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; AVX-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; AVX-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX-NEXT: 
[[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 ; AVX-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; AVX-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 @@ -335,24 +329,24 @@ ; Inspired by PR38154 define <8 x float> @sitofp_uitofp_4i32_8i16_16i8(<4 x i32> %a, <8 x i16> %b, <16 x i8> %c) { ; SSE-LABEL: @sitofp_uitofp_4i32_8i16_16i8( -; SSE-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <2 x i32> ; SSE-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 ; SSE-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i16> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i16> [[B]], i32 1 ; SSE-NEXT: [[C0:%.*]] = extractelement <16 x i8> [[C:%.*]], i32 0 ; SSE-NEXT: [[C1:%.*]] = extractelement <16 x i8> [[C]], i32 1 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float +; SSE-NEXT: [[TMP1:%.*]] = sitofp <2 x i32> [[REORDER_SHUFFLE]] to <2 x float> ; SSE-NEXT: [[AB2:%.*]] = uitofp i32 [[A2]] to float ; SSE-NEXT: [[AB3:%.*]] = uitofp i32 [[A3]] to float ; SSE-NEXT: [[AB4:%.*]] = sitofp i16 [[B0]] to float ; SSE-NEXT: [[AB5:%.*]] = uitofp i16 [[B1]] to float ; SSE-NEXT: [[AB6:%.*]] = sitofp i8 [[C0]] to float ; SSE-NEXT: [[AB7:%.*]] = uitofp i8 [[C1]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; SSE-NEXT: 
[[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 ; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 ; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -56,20 +56,35 @@ ; SSE-NEXT: ret <8 x float> [[R7]] ; ; SLM-LABEL: @fmul_fdiv_v8f32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]] -; SLM-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <8 x i32> -; SLM-NEXT: [[TMP9:%.*]] = fdiv <4 x float> [[TMP5]], [[TMP6]] -; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> undef, <8 x i32> -; SLM-NEXT: [[R3:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP3]], <8 x i32> -; SLM-NEXT: [[R4:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> -; SLM-NEXT: [[R6:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP10]], <8 x i32> -; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[R6]], <8 x float> [[TMP8]], <8 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = 
extractelement <8 x float> [[A]], i32 2 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 +; SLM-NEXT: [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <8 x float> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <8 x float> [[B]], i32 2 +; SLM-NEXT: [[B5:%.*]] = extractelement <8 x float> [[B]], i32 5 +; SLM-NEXT: [[B6:%.*]] = extractelement <8 x float> [[B]], i32 6 +; SLM-NEXT: [[B7:%.*]] = extractelement <8 x float> [[B]], i32 7 +; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], [[B0]] +; SLM-NEXT: [[AB1:%.*]] = fdiv float [[A1]], [[B1]] +; SLM-NEXT: [[AB2:%.*]] = fdiv float [[A2]], [[B2]] +; SLM-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A]], [[B]] +; SLM-NEXT: [[AB5:%.*]] = fdiv float [[A5]], [[B5]] +; SLM-NEXT: [[AB6:%.*]] = fdiv float [[A6]], [[B6]] +; SLM-NEXT: [[AB7:%.*]] = fmul float [[A7]], [[B7]] +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 +; SLM-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP2]], i32 3 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP3]], i32 4 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 ; SLM-NEXT: ret <8 x float> [[R7]] ; ; AVX-LABEL: @fmul_fdiv_v8f32( @@ -125,14 +140,15 @@ ; SSE-NEXT: ret <4 x float> [[TMP1]] ; ; SLM-LABEL: @fmul_fdiv_v4f32_const( -; SLM-NEXT: [[A0:%.*]] = 
extractelement <4 x float> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; SLM-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> ; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 ; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00 +; SLM-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[REORDER_SHUFFLE]], ; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 -; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1 +; SLM-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1 ; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2 ; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3 ; SLM-NEXT: ret <4 x float> [[R3]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -144,19 +144,19 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-LABEL: @ashr_shl_v8i32_const( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], -; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 
x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[REORDER_SHUFFLE]], +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], +; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; AVX1-LABEL: @ashr_shl_v8i32_const( -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], -; AVX1-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[REORDER_SHUFFLE]], +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], +; AVX1-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @ashr_shl_v8i32_const( @@ -260,25 +260,22 @@ ; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX2-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 ; AVX2-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]] ; AVX2-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX2-NEXT: [[AB7:%.*]] 
= shl i32 [[A7]], [[B7]] -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX2-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX2-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 -; AVX2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP4]], i32 1 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 +; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP5]], i32 2 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 +; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 +; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 4 +; AVX2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 +; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP8]], i32 5 ; AVX2-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX2-NEXT: ret <8 x i32> 
[[R7]] @@ -288,25 +285,22 @@ ; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX512-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 ; AVX512-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]] ; AVX512-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX512-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX512-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 -; AVX512-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 +; AVX512-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; AVX512-NEXT: [[R1:%.*]] = 
insertelement <8 x i32> [[R0]], i32 [[TMP4]], i32 1 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 +; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP5]], i32 2 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 +; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 +; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 4 +; AVX512-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 +; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP8]], i32 5 ; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX512-NEXT: ret <8 x i32> [[R7]] @@ -379,26 +373,64 @@ } define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { -; CHECK-LABEL: @sdiv_v8i32_undefs( -; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; CHECK-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 -; CHECK-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; CHECK-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 -; CHECK-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 -; CHECK-NEXT: 
[[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; CHECK-NEXT: ret <8 x i32> [[R7]] +; SSE2-LABEL: @sdiv_v8i32_undefs( +; SSE2-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE2-NEXT: [[TMP1:%.*]] = sdiv <4 x i32> [[REORDER_SHUFFLE]], +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = sdiv <4 x i32> [[TMP2]], +; SSE2-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> +; SSE2-NEXT: ret <8 x i32> [[R7]] +; +; SLM-LABEL: @sdiv_v8i32_undefs( +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SLM-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; SLM-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; SLM-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SLM-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; SLM-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; SLM-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SLM-NEXT: ret <8 x i32> [[R7]] +; +; AVX1-LABEL: @sdiv_v8i32_undefs( +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 5 +; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x 
i32> [[A]], i32 6 +; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = sdiv <4 x i32> [[TMP1]], +; AVX1-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; AVX1-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; AVX1-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX1-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 1 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP6]], i32 4 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @sdiv_v8i32_undefs( +; AVX2-NEXT: [[TMP1:%.*]] = sdiv <8 x i32> [[A:%.*]], +; AVX2-NEXT: ret <8 x i32> [[TMP1]] +; +; AVX512-LABEL: @sdiv_v8i32_undefs( +; AVX512-NEXT: [[TMP1:%.*]] = sdiv <8 x i32> [[A:%.*]], +; AVX512-NEXT: ret <8 x i32> [[TMP1]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll @@ -237,23 +237,22 @@ define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) { ; CHECK-LABEL: @fcmp_ord_uno_v4i32( ; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x 
float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 ; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 ; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 -; CHECK-NEXT: [[B1:%.*]] = load float, float* [[P1]], align 4 -; CHECK-NEXT: [[B2:%.*]] = load float, float* [[P2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 ; CHECK-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] -; CHECK-NEXT: [[C1:%.*]] = fcmp uno float [[B1]], [[A1]] -; CHECK-NEXT: [[C2:%.*]] = fcmp uno float [[B2]], [[A2]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] ; CHECK-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0 -; CHECK-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1 -; CHECK-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[C2]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[TMP6]], i32 2 ; CHECK-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3 ; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -55,35 +55,32 @@ ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ] +; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; AVX-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 -; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer -; AVX-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]] -; AVX-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], -; AVX-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> -; AVX-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], -; 
AVX-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer -; AVX-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> , <2 x float> [[TMP14]] -; AVX-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 -; AVX-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 -; AVX-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[TMP17]], i32 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1 -; AVX-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], -; AVX-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> -; AVX-NEXT: [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], -; AVX-NEXT: [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> , <2 x float> [[TMP21]] +; AVX-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[REORDER_SHUFFLE]], [[TMP3]] +; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], +; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> +; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> , <2 x float> [[TMP10]] +; AVX-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; AVX-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; AVX-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]] +; AVX-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[TMP13]], i32 0 +; AVX-NEXT: 
[[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1 +; AVX-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], +; AVX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> +; AVX-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], +; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 ; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AVX: for.end: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -131,10 +131,9 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double undef, i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll @@ -4,25 +4,7 @@ define i32 @crash_reordering_undefs() { ; CHECK-LABEL: @crash_reordering_undefs( ; 
CHECK-NEXT: entry: -; CHECK-NEXT: [[OR0:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]] -; CHECK-NEXT: [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD1:%.*]] = add i32 undef, [[ADD0]] -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 undef, undef -; CHECK-NEXT: [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD1]], [[ADD2]] -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 undef, undef -; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD5:%.*]] = add i32 [[ADD3]], [[ADD4]] -; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD5]], undef -; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD6]], undef -; CHECK-NEXT: [[ADD8:%.*]] = add i32 [[ADD7]], undef -; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]] -; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[ADD8]], [[ADD9]] -; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD10]], undef -; CHECK-NEXT: ret i32 [[ADD11]] +; CHECK-NEXT: ret i32 undef ; entry: %or0 = or i64 undef, undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -31,15 +31,14 @@ ; CHECK: cond.false66.us: ; CHECK-NEXT: [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, undef ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[ADD_I276_US]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double undef, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> undef, [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* 
[[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> undef, [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; CHECK: cond.true63.us: ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -18,21 +18,16 @@ ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[LOAD_EXTEND]], <4 x double> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 ; CHECK-NEXT: 
[[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], +; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[SHUFFLE]], +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[SHUFFLE1]], ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <4 x double>* +; CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[TMP4]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll @@ -54,14 +54,11 @@ ; CHECK-LABEL: @fextr2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load <4 x double>, <4 x double>* undef -; CHECK-NEXT: [[V0:%.*]] = extractelement <4 x double> [[LD]], i32 0 -; CHECK-NEXT: [[V1:%.*]] = extractelement <4 x double> [[LD]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x double> [[LD]], <4 x double> undef, <2 x i32> ; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x 
double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[P0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[REORDER_SHUFFLE]], +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[P0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -179,13 +179,22 @@ ; SSE-NEXT: ret <4 x double> [[R03]] ; ; SLM-LABEL: @test_v4f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 +; SLM-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] +; SLM-NEXT: [[R1:%.*]] = fadd double [[B0]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = fadd 
double [[A2]], [[A3]] +; SLM-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] +; SLM-NEXT: [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0 +; SLM-NEXT: [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1 +; SLM-NEXT: [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2 +; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3 ; SLM-NEXT: ret <4 x double> [[R03]] ; ; AVX-LABEL: @test_v4f64( @@ -353,13 +362,57 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i32 8 +; SSE-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i32 9 +; SSE-NEXT: [[A10:%.*]] = extractelement <16 x i16> [[A]], i32 10 +; SSE-NEXT: [[A11:%.*]] = extractelement <16 x i16> [[A]], i32 11 +; SSE-NEXT: [[A12:%.*]] = extractelement <16 x i16> [[A]], i32 12 +; SSE-NEXT: [[A13:%.*]] = extractelement <16 x i16> [[A]], i32 13 +; SSE-NEXT: [[A14:%.*]] = extractelement <16 x i16> [[A]], i32 14 +; SSE-NEXT: [[A15:%.*]] = extractelement <16 x i16> [[A]], i32 15 +; SSE-NEXT: [[B8:%.*]] = extractelement <16 x i16> [[B:%.*]], i32 8 +; SSE-NEXT: [[B9:%.*]] = extractelement <16 x i16> [[B]], i32 9 +; SSE-NEXT: [[B10:%.*]] = extractelement <16 x i16> [[B]], i32 10 +; SSE-NEXT: [[B11:%.*]] = extractelement <16 x i16> [[B]], i32 11 +; SSE-NEXT: [[B12:%.*]] = extractelement <16 x i16> [[B]], i32 12 +; SSE-NEXT: [[B13:%.*]] = extractelement <16 x i16> [[B]], i32 13 +; SSE-NEXT: [[B14:%.*]] = extractelement <16 x i16> [[B]], i32 14 +; SSE-NEXT: [[B15:%.*]] = extractelement <16 x i16> [[B]], i32 15 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] -; 
SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[R8:%.*]] = add i16 [[A8]], [[A9]] +; SSE-NEXT: [[R9:%.*]] = add i16 [[A10]], [[A11]] +; SSE-NEXT: [[R10:%.*]] = add i16 [[A12]], [[A13]] +; SSE-NEXT: [[R11:%.*]] = add i16 [[A14]], [[A15]] +; SSE-NEXT: [[R12:%.*]] = add i16 [[B8]], [[B9]] +; SSE-NEXT: [[R13:%.*]] = add i16 [[B10]], [[B11]] +; SSE-NEXT: [[R14:%.*]] = add i16 [[B12]], [[B13]] +; SSE-NEXT: [[R15:%.*]] = add i16 [[B14]], [[B15]] +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[RV0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[RV1:%.*]] = insertelement <16 x i16> [[RV0]], i16 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[RV2:%.*]] = insertelement <16 x i16> [[RV1]], i16 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[RV3:%.*]] = insertelement <16 x i16> [[RV2]], i16 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[RV4:%.*]] = insertelement <16 x i16> [[RV3]], i16 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[TMP11]], i32 7 +; SSE-NEXT: [[RV8:%.*]] = insertelement <16 x i16> [[RV7]], 
i16 [[R8]], i32 8 +; SSE-NEXT: [[RV9:%.*]] = insertelement <16 x i16> [[RV8]], i16 [[R9]], i32 9 +; SSE-NEXT: [[RV10:%.*]] = insertelement <16 x i16> [[RV9]], i16 [[R10]], i32 10 +; SSE-NEXT: [[RV11:%.*]] = insertelement <16 x i16> [[RV10]], i16 [[R11]], i32 11 +; SSE-NEXT: [[RV12:%.*]] = insertelement <16 x i16> [[RV11]], i16 [[R12]], i32 12 +; SSE-NEXT: [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i32 13 +; SSE-NEXT: [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i32 14 +; SSE-NEXT: [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i32 15 ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll @@ -179,13 +179,22 @@ ; SSE-NEXT: ret <4 x double> [[R03]] ; ; SLM-LABEL: @test_v4f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = 
extractelement <4 x double> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 +; SLM-NEXT: [[R0:%.*]] = fsub double [[A0]], [[A1]] +; SLM-NEXT: [[R1:%.*]] = fsub double [[B0]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = fsub double [[A2]], [[A3]] +; SLM-NEXT: [[R3:%.*]] = fsub double [[B2]], [[B3]] +; SLM-NEXT: [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0 +; SLM-NEXT: [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1 +; SLM-NEXT: [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2 +; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3 ; SLM-NEXT: ret <4 x double> [[R03]] ; ; AVX-LABEL: @test_v4f64( @@ -353,13 +362,57 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i32 8 +; SSE-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i32 9 +; SSE-NEXT: [[A10:%.*]] = extractelement <16 x i16> [[A]], i32 10 +; SSE-NEXT: [[A11:%.*]] = extractelement <16 x i16> [[A]], i32 11 +; SSE-NEXT: [[A12:%.*]] = extractelement <16 x i16> [[A]], i32 12 +; SSE-NEXT: [[A13:%.*]] = extractelement <16 x i16> [[A]], i32 13 +; SSE-NEXT: [[A14:%.*]] = extractelement <16 x i16> [[A]], i32 14 +; SSE-NEXT: [[A15:%.*]] = extractelement <16 x i16> [[A]], i32 15 +; SSE-NEXT: [[B8:%.*]] = extractelement <16 x i16> [[B:%.*]], i32 8 +; SSE-NEXT: [[B9:%.*]] = extractelement <16 x i16> [[B]], i32 9 +; SSE-NEXT: [[B10:%.*]] = extractelement <16 x i16> [[B]], i32 10 +; SSE-NEXT: [[B11:%.*]] = extractelement <16 x i16> [[B]], i32 11 +; SSE-NEXT: [[B12:%.*]] = extractelement <16 x i16> [[B]], i32 12 +; SSE-NEXT: [[B13:%.*]] = extractelement <16 x i16> [[B]], i32 13 +; SSE-NEXT: [[B14:%.*]] = extractelement <16 x i16> [[B]], i32 14 +; SSE-NEXT: [[B15:%.*]] = extractelement <16 x i16> [[B]], i32 15 
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[R8:%.*]] = sub i16 [[A8]], [[A9]] +; SSE-NEXT: [[R9:%.*]] = sub i16 [[A10]], [[A11]] +; SSE-NEXT: [[R10:%.*]] = sub i16 [[A12]], [[A13]] +; SSE-NEXT: [[R11:%.*]] = sub i16 [[A14]], [[A15]] +; SSE-NEXT: [[R12:%.*]] = sub i16 [[B8]], [[B9]] +; SSE-NEXT: [[R13:%.*]] = sub i16 [[B10]], [[B11]] +; SSE-NEXT: [[R14:%.*]] = sub i16 [[B12]], [[B13]] +; SSE-NEXT: [[R15:%.*]] = sub i16 [[B14]], [[B15]] +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[RV0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[RV1:%.*]] = insertelement <16 x i16> [[RV0]], i16 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[RV2:%.*]] = insertelement <16 x i16> [[RV1]], i16 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[RV3:%.*]] = insertelement <16 x i16> [[RV2]], i16 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[RV4:%.*]] = insertelement <16 x i16> [[RV3]], i16 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[RV6:%.*]] = insertelement <16 x i16> 
[[RV5]], i16 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[TMP11]], i32 7 +; SSE-NEXT: [[RV8:%.*]] = insertelement <16 x i16> [[RV7]], i16 [[R8]], i32 8 +; SSE-NEXT: [[RV9:%.*]] = insertelement <16 x i16> [[RV8]], i16 [[R9]], i32 9 +; SSE-NEXT: [[RV10:%.*]] = insertelement <16 x i16> [[RV9]], i16 [[R10]], i32 10 +; SSE-NEXT: [[RV11:%.*]] = insertelement <16 x i16> [[RV10]], i16 [[R11]], i32 11 +; SSE-NEXT: [[RV12:%.*]] = insertelement <16 x i16> [[RV11]], i16 [[R12]], i32 12 +; SSE-NEXT: [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i32 13 +; SSE-NEXT: [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i32 14 +; SSE-NEXT: [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i32 15 ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -214,43 +214,64 @@ ; Unused insertelement define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 { +; CHECK-LABEL: @simple_select_no_users( +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> undef, <2 x i32> +; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 +; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 +; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 +; CHECK-NEXT: [[REORDER_SHUFFLE2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: 
[[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 +; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[REORDER_SHUFFLE]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[C3]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[REORDER_SHUFFLE1]], <2 x float> [[REORDER_SHUFFLE2]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[A3]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[B3]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x float> [[TMP7]], <2 x float> [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 +; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP13]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP14]], i32 3 +; CHECK-NEXT: ret <4 x float> [[RD]] +; ; ANY-LABEL: @simple_select_no_users( -; ANY-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 -; ANY-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 +; ANY-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> undef, <2 x i32> ; ANY-NEXT: [[C2:%.*]] = extractelement <4 x i32> 
[[C]], i32 2 ; ANY-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 -; ANY-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; ANY-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; ANY-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> ; ANY-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 ; ANY-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; ANY-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; ANY-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 +; ANY-NEXT: [[REORDER_SHUFFLE2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <2 x i32> ; ANY-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 ; ANY-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; ANY-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0 -; ANY-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; ANY-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; ANY-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 -; ANY-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; ANY-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; ANY-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0 -; ANY-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; ANY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0 -; ANY-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 -; ANY-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] -; ANY-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 -; ANY-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 -; ANY-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 -; ANY-NEXT: [[TMP15:%.*]] = 
insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; ANY-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; ANY-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; ANY-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0 -; ANY-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; ANY-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 -; ANY-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 -; ANY-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2 -; ANY-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 -; ANY-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 +; ANY-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[REORDER_SHUFFLE]], zeroinitializer +; ANY-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 +; ANY-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[C3]], i32 1 +; ANY-NEXT: [[TMP4:%.*]] = icmp ne <2 x i32> [[TMP3]], zeroinitializer +; ANY-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[REORDER_SHUFFLE1]], <2 x float> [[REORDER_SHUFFLE2]] +; ANY-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 +; ANY-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[A3]], i32 1 +; ANY-NEXT: [[TMP8:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 +; ANY-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[B3]], i32 1 +; ANY-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x float> [[TMP7]], <2 x float> [[TMP9]] +; ANY-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; ANY-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0 +; ANY-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 +; ANY-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP12]], i32 1 +; ANY-NEXT: [[TMP13:%.*]] = extractelement <2 x 
float> [[TMP10]], i32 0 +; ANY-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP13]], i32 2 +; ANY-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP10]], i32 1 +; ANY-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP14]], i32 3 ; ANY-NEXT: ret <4 x float> [[RD]] ; %c0 = extractelement <4 x i32> %c, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll @@ -11,7 +11,7 @@ @h = common dso_local global float 0.000000e+00, align 4 define dso_local void @j() local_unnamed_addr { -; CHECK-LABEL: define {{[^@]+}}@j( +; CHECK-LABEL: @j( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** @b, align 8 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 @@ -19,42 +19,42 @@ ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[LOAD_EXTEND1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 13 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[REORDER_SHUFFLE]], [[REORDER_SHUFFLE1]] -; CHECK-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float> -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> , 
[[TMP7]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[LOAD_EXTEND]], [[LOAD_EXTEND1]] +; CHECK-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP5]] to <4 x float> +; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = fsub <4 x float> , [[TMP7]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 ; CHECK-NEXT: store float [[TMP9]], float* @g, align 4 ; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 ; CHECK-NEXT: store float [[TMP11]], float* @c, align 4 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 ; CHECK-NEXT: store float [[TMP12]], float* @d, align 4 ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 ; CHECK-NEXT: store float [[TMP13]], float* @e, align 4 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 ; CHECK-NEXT: store float [[TMP14]], float* @f, align 4 ; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 14 ; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 15 ; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* @a, align 4 ; CHECK-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP15]] to float -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> undef, float [[CONV19]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x float> [[TMP16]], float -1.000000e+00, i32 1 -; CHECK-NEXT: 
[[TMP18:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP17]], float [[TMP18]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float -1.000000e+00, i32 3 -; CHECK-NEXT: [[TMP21:%.*]] = fsub <4 x float> [[TMP10]], [[TMP20]] -; CHECK-NEXT: [[TMP22:%.*]] = fadd <4 x float> [[TMP10]], [[TMP20]] -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x float> [[TMP21]], <4 x float> [[TMP22]], <4 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = fptosi <4 x float> [[TMP23]] to <4 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP24]], <4 x i32>* [[TMP25]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float -1.000000e+00, i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = fadd <4 x float> [[TMP10]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = fsub <4 x float> [[TMP10]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP21]], <4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = fptosi <4 x float> [[TMP22]] to <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[REORDER_SHUFFLE]], <4 x i32>* [[TMP24]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -54,15 +54,16 @@ ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], 
i64 0, i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[X2]], i32 2 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2 +; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[TMP5]], i32 2 +; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[TMP5]], i32 3 ; CHECK-NEXT: ret <4 x float> [[I3]] ; %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0 @@ -84,7 +85,8 @@ ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] 
= shufflevector <2 x float> [[TMP2]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[LOAD_EXTEND]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 ; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP3]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll @@ -15,21 +15,21 @@ define i32 @foo(i32* nocapture %A, i32 %n) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[N:%.*]], 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP1]], 11 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = add nsw i32 [[TMP11]], [[TMP13]] -; CHECK-NEXT: store i32 [[TMP14]], i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw i32 
[[TMP1]], 7 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4 ; CHECK-NEXT: ret i32 undef ; %1 = mul nsw i32 %n, 5 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -266,32 +266,24 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32000 x float], [32000 x float]* @a, i32 0, i32 0), align 16 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: -; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 ; CHECK-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[ARRAYIDX5]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP13]] -; CHECK-NEXT: [[TMP14]] = load float, float* [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i32 3 -; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]] -; CHECK-NEXT: store float [[MUL45]], float* [[ARRAYIDX31]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[ARRAYIDX]] to <8 x float>* +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP5]], i32 4, <8 x i1> , <8 x float> undef) +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> 
undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = fmul <8 x float> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[ARRAYIDX5]] to <8 x float>* +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP10]], i32 4, <8 x i1> ) +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP11]], 31995 +; CHECK-NEXT: [[TMP12]] = extractelement <8 x float> [[TMP6]], i32 4 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll @@ -18,26 +18,23 @@ ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[SHR15]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 undef, i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 undef, i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 undef, i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[TMP7]], undef -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP7]], <4 x i32> undef -; CHECK-NEXT: [[TMP10:%.*]] = sext <4 x i32> [[TMP9]] to <4 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = 
getelementptr inbounds i16*, i16** undef, i64 [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP19]] +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], undef +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP4]], <4 x i32> undef +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[TMP6]] to <4 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 
[[TMP16]] ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -140,49 +140,54 @@ define float @foo3(float* nocapture readonly %A) #0 { ; CHECK-LABEL: @foo3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <8 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP0]], i32 4, <8 x i1> , <8 x float> undef) +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[REORDER_SHUFFLE]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 -; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] -; 
CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = phi <8 x float> [ [[SHUFFLE]], [[ENTRY]] ], [ [[TMP32:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 6 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 5 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>* -; CHECK-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP11]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP13]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP4]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], -; CHECK-NEXT: [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX14]] to <4 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x 
float>* [[TMP6]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x float> undef, float [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE1]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float 8.000000e+00, i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float 7.000000e+00, i32 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> , float [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP3]], i32 4 +; CHECK-NEXT: [[TMP18:%.*]] = fmul <8 x float> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x float> [[TMP2]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP20]], 121 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x float> [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <8 x float> undef, float [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x float> [[TMP19]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <8 x float> [[TMP22]], float [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x float> [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x float> [[TMP24]], float [[TMP25]], i32 2 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[TMP19]], i32 3 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP27]], i32 3 +; CHECK-NEXT: [[TMP29:%.*]] = 
extractelement <8 x float> [[TMP19]], i32 4 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x float> [[TMP28]], float [[TMP29]], i32 4 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x float> [[TMP30]], float [[TMP8]], i32 5 +; CHECK-NEXT: [[TMP32]] = insertelement <8 x float> [[TMP31]], float [[TMP10]], i32 6 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP18]], i32 3 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP18]], i32 2 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP18]], i32 1 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 0 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]] +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[TMP29]], [[TMP27]] +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP25]] +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP23]] +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP21]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll @@ -13,9 +13,10 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[REORDER_SHUFFLE]], i32 0 ; CHECK-NEXT: [[VECIN0:%.*]] = 
insertelement <2 x float> undef, float [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[REORDER_SHUFFLE]], i32 1 ; CHECK-NEXT: [[VECIN1:%.*]] = insertelement <2 x float> [[VECIN0]], float [[TMP5]], i64 1 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 ; CHECK-NEXT: [[VECIN2:%.*]] = insertelement <2 x float> undef, float [[TMP6]], i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -10,46 +10,44 @@ ; CHECK: bb1: ; CHECK-NEXT: ret void ; CHECK: bb2: -; CHECK-NEXT: [[T:%.*]] = select i1 undef, i16 undef, i16 15 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> undef, i16 [[T]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 undef, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32> -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> , [[REORDER_SHUFFLE]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef -; CHECK-NEXT: [[SHUFFLE8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE8]], -; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt <4 x i32> [[TMP5]], [[RDX_SHUF9]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP10]], <4 x i32> [[TMP5]], <4 x i32> [[RDX_SHUF9]] +; CHECK-NEXT: [[TMP:%.*]] = select i1 undef, i16 undef, i16 15 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[TMP]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[TMP0]] to <4 x 
i32> +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], undef +; CHECK-NEXT: [[SHUFFLE8:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE8]], +; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF9]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP10]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF9]] ; CHECK-NEXT: [[RDX_SHUF12:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT11]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP13:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT11]], [[RDX_SHUF12]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT14:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP13]], <4 x i32> [[RDX_MINMAX_SELECT11]], <4 x i32> [[RDX_SHUF12]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT14]], i32 0 -; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef -; CHECK-NEXT: [[T20:%.*]] = icmp sgt i32 [[T19]], 63 -; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <2 x i32> undef, [[TMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP7]], undef -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i32> [[TMP9]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP9]], <4 x i32> [[RDX_SHUF]] +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT14]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef +; CHECK-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP19]], 63 +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> undef, 
[[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[TMP6]], undef +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i32> [[TMP8]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP8]], <4 x i32> [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], undef -; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef -; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA]], undef -; CHECK-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA]], i32 undef -; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[OP_EXTRA4]], undef -; CHECK-NEXT: [[OP_EXTRA5:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA4]], i32 undef -; CHECK-NEXT: [[TMP14:%.*]] = icmp slt i32 [[OP_EXTRA5]], undef -; CHECK-NEXT: [[OP_EXTRA6:%.*]] = select i1 [[TMP14]], i32 [[OP_EXTRA5]], i32 undef -; CHECK-NEXT: [[TMP15:%.*]] = icmp slt i32 [[OP_EXTRA6]], undef -; CHECK-NEXT: [[OP_EXTRA7:%.*]] = select i1 [[TMP15]], i32 [[OP_EXTRA6]], i32 undef -; CHECK-NEXT: [[T45:%.*]] = icmp sgt i32 undef, [[OP_EXTRA7]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = icmp slt i32 [[TMP9]], undef +; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 undef +; CHECK-NEXT: 
[[TMP11:%.*]] = icmp slt i32 [[OP_EXTRA]], undef +; CHECK-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[TMP11]], i32 [[OP_EXTRA]], i32 undef +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA4]], undef +; CHECK-NEXT: [[OP_EXTRA5:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA4]], i32 undef +; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[OP_EXTRA5]], undef +; CHECK-NEXT: [[OP_EXTRA6:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA5]], i32 undef +; CHECK-NEXT: [[TMP14:%.*]] = icmp slt i32 [[OP_EXTRA6]], undef +; CHECK-NEXT: [[OP_EXTRA7:%.*]] = select i1 [[TMP14]], i32 [[OP_EXTRA6]], i32 undef +; CHECK-NEXT: [[TMP45:%.*]] = icmp sgt i32 undef, [[OP_EXTRA7]] ; CHECK-NEXT: unreachable ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll @@ -13,11 +13,12 @@ ; CHECK-NEXT: [[T11:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 1, i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[T14]] to <2 x float>* ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> ; CHECK-NEXT: br label [[T37:%.*]] ; CHECK: t37: -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[TMP3:%.*]] ], [ [[T89:%.*]], [[T37]] ] -; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <2 x float> , [[TMP6]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[LOAD_EXTEND]], [[TMP3:%.*]] ], [ [[REORDER_SHUFFLE:%.*]], [[T37]] ] +; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <4 x float> , [[TMP6]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[T21:%.*]] = getelementptr 
inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 0 ; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 1 ; CHECK-NEXT: [[T31:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 2 @@ -25,7 +26,8 @@ ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[T21]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[SHUFFLE]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[T88:%.*]] = bitcast float* [[T9]] to <2 x float>* -; CHECK-NEXT: [[T89]] = load <2 x float>, <2 x float>* [[T88]], align 4 +; CHECK-NEXT: [[T89:%.*]] = load <2 x float>, <2 x float>* [[T88]], align 4 +; CHECK-NEXT: [[REORDER_SHUFFLE]] = shufflevector <2 x float> [[T89]], <2 x float> undef, <4 x i32> ; CHECK-NEXT: br i1 undef, label [[T37]], label [[T55:%.*]] ; CHECK: t55: ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll @@ -25,39 +25,37 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <2 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], 
[ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[G_031:%.*]] = phi float [ [[TMP1]], [[ENTRY]] ], [ [[ADD9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[R_030:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 7.000000e+00 ; CHECK-NEXT: [[ADD4]] = fadd float [[R_030]], [[MUL]] -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX7]], align 4 -; CHECK-NEXT: [[MUL8:%.*]] = fmul float [[TMP5]], 8.000000e+00 -; CHECK-NEXT: [[ADD9]] = fadd float [[G_031]], [[MUL8]] -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP7]], 9.000000e+00 -; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX7]] to <2 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[REORDER_SHUFFLE1]], +; CHECK-NEXT: [[TMP9]] = fadd <2 x float> [[TMP4]], [[TMP8]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 
[[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], 121 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP10]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]] ; CHECK: for.body.for.body_crit_edge: ; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[ADD9]] -; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 +; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[TMP12]] ; CHECK-NEXT: ret float [[ADD17]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll @@ -10,18 +10,10 @@ define i32 @slp_schedule_bundle() local_unnamed_addr #0 { ; CHECK-LABEL: @slp_schedule_bundle( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([1 x i32]* @b to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], -; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4 -; CHECK-NEXT: [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31 -; CHECK-NEXT: [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1 -; CHECK-NEXT: store i32 
[[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4 -; CHECK-NEXT: [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31 -; CHECK-NEXT: [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1 -; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast ([1 x i32]* @b to <8 x i32>*), i32 4, <8 x i1> , <8 x i32> undef) +; CHECK-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP2]], <8 x i32>* bitcast ([1 x i32]* @a to <8 x i32>*), i32 4, <8 x i1> ) ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/sext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sext.ll @@ -114,23 +114,43 @@ } define <4 x i64> @loadext_4i8_to_4i64(i8* %p0) { -; SSE-LABEL: @loadext_4i8_to_4i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; 
SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE2-LABEL: @loadext_4i8_to_4i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SSE2-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2]] to <2 x i8>* +; SSE2-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* [[TMP3]], align 1 +; SSE2-NEXT: [[TMP5:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> +; SSE2-NEXT: [[TMP6:%.*]] = sext <2 x i8> [[TMP4]] to <2 x i64> +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 +; SSE2-NEXT: ret <4 x i64> [[V3]] +; +; SLM-LABEL: @loadext_4i8_to_4i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 +; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 2 +; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 +; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 +; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 +; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 +; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 +; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i64 +; 
SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i64 +; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 +; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SLM-NEXT: ret <4 x i64> [[V3]] ; ; AVX-LABEL: @loadext_4i8_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i8, i8* [[P0:%.*]], i64 1 @@ -205,30 +225,34 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> 
[[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2]] to <2 x i8>* +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* [[TMP3]], align 1 +; SLM-NEXT: [[TMP5:%.*]] = bitcast i8* [[P4]] to <2 x i8>* +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i8>, <2 x i8>* [[TMP5]], align 1 +; SLM-NEXT: [[TMP7:%.*]] = bitcast i8* [[P6]] to <2 x i8>* +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i8>, <2 x i8>* [[TMP7]], align 1 +; SLM-NEXT: [[TMP9:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16> +; SLM-NEXT: [[TMP10:%.*]] = sext <2 x i8> [[TMP4]] to <2 x i16> +; SLM-NEXT: [[TMP11:%.*]] = sext <2 x i8> [[TMP6]] to <2 x i16> +; SLM-NEXT: [[TMP12:%.*]] = sext <2 x i8> [[TMP8]] to <2 x i16> +; SLM-NEXT: [[TMP13:%.*]] = extractelement <2 x i16> [[TMP9]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP13]], i32 0 +; SLM-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[TMP9]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP14]], i32 1 +; SLM-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP10]], i32 0 +; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP15]], i32 2 +; SLM-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1 +; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP16]], i32 3 +; SLM-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP11]], i32 0 +; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP17]], i32 4 +; SLM-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1 +; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP18]], i32 5 +; SLM-NEXT: [[TMP19:%.*]] = extractelement <2 x i16> [[TMP12]], 
i32 0 +; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP19]], i32 6 +; SLM-NEXT: [[TMP20:%.*]] = extractelement <2 x i16> [[TMP12]], i32 1 +; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP20]], i32 7 ; SLM-NEXT: ret <8 x i16> [[V7]] ; ; AVX-LABEL: @loadext_8i8_to_8i16( @@ -491,54 +515,62 @@ ; SLM-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 ; SLM-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 ; SLM-NEXT: [[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[I8:%.*]] = load i8, i8* [[P8]], align 1 -; SLM-NEXT: [[I9:%.*]] = load i8, i8* [[P9]], align 1 -; SLM-NEXT: [[I10:%.*]] = load i8, i8* [[P10]], align 1 -; SLM-NEXT: [[I11:%.*]] = load i8, i8* [[P11]], align 1 -; SLM-NEXT: [[I12:%.*]] = load i8, i8* [[P12]], align 1 -; SLM-NEXT: [[I13:%.*]] = load i8, i8* [[P13]], align 1 -; SLM-NEXT: [[I14:%.*]] = load i8, i8* [[P14]], align 1 -; SLM-NEXT: [[I15:%.*]] = load i8, i8* [[P15]], align 1 -; SLM-NEXT: [[X0:%.*]] = sext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = sext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = sext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = sext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = sext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = sext i8 [[I7]] to i16 -; SLM-NEXT: [[X8:%.*]] = sext i8 [[I8]] to i16 -; SLM-NEXT: [[X9:%.*]] = sext i8 [[I9]] to i16 -; SLM-NEXT: [[X10:%.*]] = sext i8 [[I10]] to i16 -; SLM-NEXT: [[X11:%.*]] = sext i8 
[[I11]] to i16 -; SLM-NEXT: [[X12:%.*]] = sext i8 [[I12]] to i16 -; SLM-NEXT: [[X13:%.*]] = sext i8 [[I13]] to i16 -; SLM-NEXT: [[X14:%.*]] = sext i8 [[I14]] to i16 -; SLM-NEXT: [[X15:%.*]] = sext i8 [[I15]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8 -; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9 -; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10 -; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11 -; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12 -; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13 -; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14 -; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2]] to <2 x i8>* +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* [[TMP3]], align 1 +; SLM-NEXT: [[TMP5:%.*]] = bitcast i8* [[P4]] to <2 x i8>* +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i8>, <2 x i8>* [[TMP5]], align 1 +; SLM-NEXT: [[TMP7:%.*]] = bitcast i8* [[P6]] to <2 x i8>* +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i8>, <2 x i8>* [[TMP7]], align 1 +; 
SLM-NEXT: [[TMP9:%.*]] = bitcast i8* [[P8]] to <2 x i8>* +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i8>, <2 x i8>* [[TMP9]], align 1 +; SLM-NEXT: [[TMP11:%.*]] = bitcast i8* [[P10]] to <2 x i8>* +; SLM-NEXT: [[TMP12:%.*]] = load <2 x i8>, <2 x i8>* [[TMP11]], align 1 +; SLM-NEXT: [[TMP13:%.*]] = bitcast i8* [[P12]] to <2 x i8>* +; SLM-NEXT: [[TMP14:%.*]] = load <2 x i8>, <2 x i8>* [[TMP13]], align 1 +; SLM-NEXT: [[TMP15:%.*]] = bitcast i8* [[P14]] to <2 x i8>* +; SLM-NEXT: [[TMP16:%.*]] = load <2 x i8>, <2 x i8>* [[TMP15]], align 1 +; SLM-NEXT: [[TMP17:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i16> +; SLM-NEXT: [[TMP18:%.*]] = sext <2 x i8> [[TMP4]] to <2 x i16> +; SLM-NEXT: [[TMP19:%.*]] = sext <2 x i8> [[TMP6]] to <2 x i16> +; SLM-NEXT: [[TMP20:%.*]] = sext <2 x i8> [[TMP8]] to <2 x i16> +; SLM-NEXT: [[TMP21:%.*]] = sext <2 x i8> [[TMP10]] to <2 x i16> +; SLM-NEXT: [[TMP22:%.*]] = sext <2 x i8> [[TMP12]] to <2 x i16> +; SLM-NEXT: [[TMP23:%.*]] = sext <2 x i8> [[TMP14]] to <2 x i16> +; SLM-NEXT: [[TMP24:%.*]] = sext <2 x i8> [[TMP16]] to <2 x i16> +; SLM-NEXT: [[TMP25:%.*]] = extractelement <2 x i16> [[TMP17]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP25]], i32 0 +; SLM-NEXT: [[TMP26:%.*]] = extractelement <2 x i16> [[TMP17]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP26]], i32 1 +; SLM-NEXT: [[TMP27:%.*]] = extractelement <2 x i16> [[TMP18]], i32 0 +; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP27]], i32 2 +; SLM-NEXT: [[TMP28:%.*]] = extractelement <2 x i16> [[TMP18]], i32 1 +; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP28]], i32 3 +; SLM-NEXT: [[TMP29:%.*]] = extractelement <2 x i16> [[TMP19]], i32 0 +; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP29]], i32 4 +; SLM-NEXT: [[TMP30:%.*]] = extractelement <2 x i16> [[TMP19]], i32 1 +; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP30]], i32 5 +; SLM-NEXT: [[TMP31:%.*]] = 
extractelement <2 x i16> [[TMP20]], i32 0 +; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP31]], i32 6 +; SLM-NEXT: [[TMP32:%.*]] = extractelement <2 x i16> [[TMP20]], i32 1 +; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP32]], i32 7 +; SLM-NEXT: [[TMP33:%.*]] = extractelement <2 x i16> [[TMP21]], i32 0 +; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP33]], i32 8 +; SLM-NEXT: [[TMP34:%.*]] = extractelement <2 x i16> [[TMP21]], i32 1 +; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP34]], i32 9 +; SLM-NEXT: [[TMP35:%.*]] = extractelement <2 x i16> [[TMP22]], i32 0 +; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP35]], i32 10 +; SLM-NEXT: [[TMP36:%.*]] = extractelement <2 x i16> [[TMP22]], i32 1 +; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP36]], i32 11 +; SLM-NEXT: [[TMP37:%.*]] = extractelement <2 x i16> [[TMP23]], i32 0 +; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP37]], i32 12 +; SLM-NEXT: [[TMP38:%.*]] = extractelement <2 x i16> [[TMP23]], i32 1 +; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP38]], i32 13 +; SLM-NEXT: [[TMP39:%.*]] = extractelement <2 x i16> [[TMP24]], i32 0 +; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP39]], i32 14 +; SLM-NEXT: [[TMP40:%.*]] = extractelement <2 x i16> [[TMP24]], i32 1 +; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP40]], i32 15 ; SLM-NEXT: ret <16 x i16> [[V15]] ; ; AVX-LABEL: @loadext_16i8_to_16i16( @@ -768,23 +800,43 @@ } define <4 x i64> @loadext_4i16_to_4i64(i16* %p0) { -; SSE-LABEL: @loadext_4i16_to_4i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i16, 
i16* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE2-LABEL: @loadext_4i16_to_4i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 +; SSE2-NEXT: [[TMP3:%.*]] = bitcast i16* [[P2]] to <2 x i16>* +; SSE2-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* [[TMP3]], align 1 +; SSE2-NEXT: [[TMP5:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> +; SSE2-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP4]] to <2 x i64> +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 +; SSE2-NEXT: ret <4 x i64> [[V3]] +; +; SLM-LABEL: 
@loadext_4i16_to_4i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 +; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 2 +; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 +; SLM-NEXT: [[I0:%.*]] = load i16, i16* [[P0]], align 1 +; SLM-NEXT: [[I1:%.*]] = load i16, i16* [[P1]], align 1 +; SLM-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 +; SLM-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 +; SLM-NEXT: [[X0:%.*]] = sext i16 [[I0]] to i64 +; SLM-NEXT: [[X1:%.*]] = sext i16 [[I1]] to i64 +; SLM-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 +; SLM-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SLM-NEXT: ret <4 x i64> [[V3]] ; ; AVX-LABEL: @loadext_4i16_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i16, i16* [[P0:%.*]], i64 1 @@ -985,23 +1037,43 @@ } define <4 x i64> @loadext_4i32_to_4i64(i32* %p0) { -; SSE-LABEL: @loadext_4i32_to_4i64( -; SSE-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 -; SSE-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 -; SSE-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 -; SSE-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 -; SSE-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 -; SSE-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; SSE-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; SSE-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 -; SSE-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 -; SSE-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 -; SSE-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 -; SSE-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 -; SSE-NEXT: [[V1:%.*]] = insertelement <4 
x i64> [[V0]], i64 [[X1]], i32 1 -; SSE-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; SSE-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 -; SSE-NEXT: ret <4 x i64> [[V3]] +; SSE2-LABEL: @loadext_4i32_to_4i64( +; SSE2-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SSE2-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; SSE2-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; SSE2-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* +; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 +; SSE2-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2]] to <2 x i32>* +; SSE2-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 1 +; SSE2-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> +; SSE2-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64> +; SSE2-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; SSE2-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 +; SSE2-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; SSE2-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 +; SSE2-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; SSE2-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 +; SSE2-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; SSE2-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 +; SSE2-NEXT: ret <4 x i64> [[V3]] +; +; SLM-LABEL: @loadext_4i32_to_4i64( +; SLM-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 +; SLM-NEXT: [[P2:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 2 +; SLM-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 +; SLM-NEXT: [[I0:%.*]] = load i32, i32* [[P0]], align 1 +; SLM-NEXT: [[I1:%.*]] = load i32, i32* [[P1]], align 1 +; SLM-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 +; SLM-NEXT: [[I3:%.*]] = load i32, i32* 
[[P3]], align 1 +; SLM-NEXT: [[X0:%.*]] = sext i32 [[I0]] to i64 +; SLM-NEXT: [[X1:%.*]] = sext i32 [[I1]] to i64 +; SLM-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 +; SLM-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 +; SLM-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[X0]], i32 0 +; SLM-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[X1]], i32 1 +; SLM-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 +; SLM-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; SLM-NEXT: ret <4 x i64> [[V3]] ; ; AVX-LABEL: @loadext_4i32_to_4i64( ; AVX-NEXT: [[P1:%.*]] = getelementptr inbounds i32, i32* [[P0:%.*]], i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -5,18 +5,20 @@ ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] -; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] -; CHECK-NEXT: [[SUB22:%.*]] = fsub double 
undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[MUL18]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP4]], undef +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], undef +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x double> [[TMP2]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP9]], align 8 ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll @@ -7,10 +7,11 @@ ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[REORDER_SHUFFLE]], <2 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[LOAD_EXTEND]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = 
shufflevector <8 x i32> [[REORDER_SHUFFLE]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A2:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 @@ -67,12 +68,13 @@ ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[LOAD_EXTEND]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[REORDER_SHUFFLE]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A4:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A2:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A3:%.*]], i32 3 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], 
i32 [[A5:%.*]], i32 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 @@ -131,12 +133,13 @@ ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[LOAD_EXTEND]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[REORDER_SHUFFLE]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A3:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A4:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A2:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A1:%.*]], i32 3 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll 
--- a/llvm/test/Transforms/SLPVectorizer/X86/zext.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/zext.ll @@ -233,30 +233,34 @@ ; SLM-NEXT: [[P5:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 5 ; SLM-NEXT: [[P6:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 6 ; SLM-NEXT: [[P7:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 7 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[X7]], i32 7 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2]] to <2 x i8>* +; SLM-NEXT: [[TMP4:%.*]] = load <2 x 
i8>, <2 x i8>* [[TMP3]], align 1 +; SLM-NEXT: [[TMP5:%.*]] = bitcast i8* [[P4]] to <2 x i8>* +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i8>, <2 x i8>* [[TMP5]], align 1 +; SLM-NEXT: [[TMP7:%.*]] = bitcast i8* [[P6]] to <2 x i8>* +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i8>, <2 x i8>* [[TMP7]], align 1 +; SLM-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i16> +; SLM-NEXT: [[TMP10:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i16> +; SLM-NEXT: [[TMP11:%.*]] = zext <2 x i8> [[TMP6]] to <2 x i16> +; SLM-NEXT: [[TMP12:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i16> +; SLM-NEXT: [[TMP13:%.*]] = extractelement <2 x i16> [[TMP9]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <8 x i16> undef, i16 [[TMP13]], i32 0 +; SLM-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[TMP9]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <8 x i16> [[V0]], i16 [[TMP14]], i32 1 +; SLM-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP10]], i32 0 +; SLM-NEXT: [[V2:%.*]] = insertelement <8 x i16> [[V1]], i16 [[TMP15]], i32 2 +; SLM-NEXT: [[TMP16:%.*]] = extractelement <2 x i16> [[TMP10]], i32 1 +; SLM-NEXT: [[V3:%.*]] = insertelement <8 x i16> [[V2]], i16 [[TMP16]], i32 3 +; SLM-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP11]], i32 0 +; SLM-NEXT: [[V4:%.*]] = insertelement <8 x i16> [[V3]], i16 [[TMP17]], i32 4 +; SLM-NEXT: [[TMP18:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1 +; SLM-NEXT: [[V5:%.*]] = insertelement <8 x i16> [[V4]], i16 [[TMP18]], i32 5 +; SLM-NEXT: [[TMP19:%.*]] = extractelement <2 x i16> [[TMP12]], i32 0 +; SLM-NEXT: [[V6:%.*]] = insertelement <8 x i16> [[V5]], i16 [[TMP19]], i32 6 +; SLM-NEXT: [[TMP20:%.*]] = extractelement <2 x i16> [[TMP12]], i32 1 +; SLM-NEXT: [[V7:%.*]] = insertelement <8 x i16> [[V6]], i16 [[TMP20]], i32 7 ; SLM-NEXT: ret <8 x i16> [[V7]] ; ; AVX-LABEL: @loadext_8i8_to_8i16( @@ -519,54 +523,62 @@ ; SLM-NEXT: [[P13:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 13 ; SLM-NEXT: [[P14:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 14 ; SLM-NEXT: 
[[P15:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 15 -; SLM-NEXT: [[I0:%.*]] = load i8, i8* [[P0]], align 1 -; SLM-NEXT: [[I1:%.*]] = load i8, i8* [[P1]], align 1 -; SLM-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; SLM-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; SLM-NEXT: [[I4:%.*]] = load i8, i8* [[P4]], align 1 -; SLM-NEXT: [[I5:%.*]] = load i8, i8* [[P5]], align 1 -; SLM-NEXT: [[I6:%.*]] = load i8, i8* [[P6]], align 1 -; SLM-NEXT: [[I7:%.*]] = load i8, i8* [[P7]], align 1 -; SLM-NEXT: [[I8:%.*]] = load i8, i8* [[P8]], align 1 -; SLM-NEXT: [[I9:%.*]] = load i8, i8* [[P9]], align 1 -; SLM-NEXT: [[I10:%.*]] = load i8, i8* [[P10]], align 1 -; SLM-NEXT: [[I11:%.*]] = load i8, i8* [[P11]], align 1 -; SLM-NEXT: [[I12:%.*]] = load i8, i8* [[P12]], align 1 -; SLM-NEXT: [[I13:%.*]] = load i8, i8* [[P13]], align 1 -; SLM-NEXT: [[I14:%.*]] = load i8, i8* [[P14]], align 1 -; SLM-NEXT: [[I15:%.*]] = load i8, i8* [[P15]], align 1 -; SLM-NEXT: [[X0:%.*]] = zext i8 [[I0]] to i16 -; SLM-NEXT: [[X1:%.*]] = zext i8 [[I1]] to i16 -; SLM-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i16 -; SLM-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i16 -; SLM-NEXT: [[X4:%.*]] = zext i8 [[I4]] to i16 -; SLM-NEXT: [[X5:%.*]] = zext i8 [[I5]] to i16 -; SLM-NEXT: [[X6:%.*]] = zext i8 [[I6]] to i16 -; SLM-NEXT: [[X7:%.*]] = zext i8 [[I7]] to i16 -; SLM-NEXT: [[X8:%.*]] = zext i8 [[I8]] to i16 -; SLM-NEXT: [[X9:%.*]] = zext i8 [[I9]] to i16 -; SLM-NEXT: [[X10:%.*]] = zext i8 [[I10]] to i16 -; SLM-NEXT: [[X11:%.*]] = zext i8 [[I11]] to i16 -; SLM-NEXT: [[X12:%.*]] = zext i8 [[I12]] to i16 -; SLM-NEXT: [[X13:%.*]] = zext i8 [[I13]] to i16 -; SLM-NEXT: [[X14:%.*]] = zext i8 [[I14]] to i16 -; SLM-NEXT: [[X15:%.*]] = zext i8 [[I15]] to i16 -; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[X0]], i32 0 -; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[X1]], i32 1 -; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[X2]], i32 2 -; SLM-NEXT: [[V3:%.*]] = 
insertelement <16 x i16> [[V2]], i16 [[X3]], i32 3 -; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[X4]], i32 4 -; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[X5]], i32 5 -; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[X6]], i32 6 -; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[X7]], i32 7 -; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[X8]], i32 8 -; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[X9]], i32 9 -; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[X10]], i32 10 -; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[X11]], i32 11 -; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[X12]], i32 12 -; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[X13]], i32 13 -; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[X14]], i32 14 -; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[X15]], i32 15 +; SLM-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* +; SLM-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 +; SLM-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2]] to <2 x i8>* +; SLM-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* [[TMP3]], align 1 +; SLM-NEXT: [[TMP5:%.*]] = bitcast i8* [[P4]] to <2 x i8>* +; SLM-NEXT: [[TMP6:%.*]] = load <2 x i8>, <2 x i8>* [[TMP5]], align 1 +; SLM-NEXT: [[TMP7:%.*]] = bitcast i8* [[P6]] to <2 x i8>* +; SLM-NEXT: [[TMP8:%.*]] = load <2 x i8>, <2 x i8>* [[TMP7]], align 1 +; SLM-NEXT: [[TMP9:%.*]] = bitcast i8* [[P8]] to <2 x i8>* +; SLM-NEXT: [[TMP10:%.*]] = load <2 x i8>, <2 x i8>* [[TMP9]], align 1 +; SLM-NEXT: [[TMP11:%.*]] = bitcast i8* [[P10]] to <2 x i8>* +; SLM-NEXT: [[TMP12:%.*]] = load <2 x i8>, <2 x i8>* [[TMP11]], align 1 +; SLM-NEXT: [[TMP13:%.*]] = bitcast i8* [[P12]] to <2 x i8>* +; SLM-NEXT: [[TMP14:%.*]] = load <2 x i8>, <2 x i8>* [[TMP13]], align 1 +; SLM-NEXT: [[TMP15:%.*]] = bitcast i8* [[P14]] to <2 x i8>* +; SLM-NEXT: 
[[TMP16:%.*]] = load <2 x i8>, <2 x i8>* [[TMP15]], align 1 +; SLM-NEXT: [[TMP17:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i16> +; SLM-NEXT: [[TMP18:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i16> +; SLM-NEXT: [[TMP19:%.*]] = zext <2 x i8> [[TMP6]] to <2 x i16> +; SLM-NEXT: [[TMP20:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i16> +; SLM-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP10]] to <2 x i16> +; SLM-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i16> +; SLM-NEXT: [[TMP23:%.*]] = zext <2 x i8> [[TMP14]] to <2 x i16> +; SLM-NEXT: [[TMP24:%.*]] = zext <2 x i8> [[TMP16]] to <2 x i16> +; SLM-NEXT: [[TMP25:%.*]] = extractelement <2 x i16> [[TMP17]], i32 0 +; SLM-NEXT: [[V0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP25]], i32 0 +; SLM-NEXT: [[TMP26:%.*]] = extractelement <2 x i16> [[TMP17]], i32 1 +; SLM-NEXT: [[V1:%.*]] = insertelement <16 x i16> [[V0]], i16 [[TMP26]], i32 1 +; SLM-NEXT: [[TMP27:%.*]] = extractelement <2 x i16> [[TMP18]], i32 0 +; SLM-NEXT: [[V2:%.*]] = insertelement <16 x i16> [[V1]], i16 [[TMP27]], i32 2 +; SLM-NEXT: [[TMP28:%.*]] = extractelement <2 x i16> [[TMP18]], i32 1 +; SLM-NEXT: [[V3:%.*]] = insertelement <16 x i16> [[V2]], i16 [[TMP28]], i32 3 +; SLM-NEXT: [[TMP29:%.*]] = extractelement <2 x i16> [[TMP19]], i32 0 +; SLM-NEXT: [[V4:%.*]] = insertelement <16 x i16> [[V3]], i16 [[TMP29]], i32 4 +; SLM-NEXT: [[TMP30:%.*]] = extractelement <2 x i16> [[TMP19]], i32 1 +; SLM-NEXT: [[V5:%.*]] = insertelement <16 x i16> [[V4]], i16 [[TMP30]], i32 5 +; SLM-NEXT: [[TMP31:%.*]] = extractelement <2 x i16> [[TMP20]], i32 0 +; SLM-NEXT: [[V6:%.*]] = insertelement <16 x i16> [[V5]], i16 [[TMP31]], i32 6 +; SLM-NEXT: [[TMP32:%.*]] = extractelement <2 x i16> [[TMP20]], i32 1 +; SLM-NEXT: [[V7:%.*]] = insertelement <16 x i16> [[V6]], i16 [[TMP32]], i32 7 +; SLM-NEXT: [[TMP33:%.*]] = extractelement <2 x i16> [[TMP21]], i32 0 +; SLM-NEXT: [[V8:%.*]] = insertelement <16 x i16> [[V7]], i16 [[TMP33]], i32 8 +; SLM-NEXT: [[TMP34:%.*]] = extractelement <2 x i16> 
[[TMP21]], i32 1 +; SLM-NEXT: [[V9:%.*]] = insertelement <16 x i16> [[V8]], i16 [[TMP34]], i32 9 +; SLM-NEXT: [[TMP35:%.*]] = extractelement <2 x i16> [[TMP22]], i32 0 +; SLM-NEXT: [[V10:%.*]] = insertelement <16 x i16> [[V9]], i16 [[TMP35]], i32 10 +; SLM-NEXT: [[TMP36:%.*]] = extractelement <2 x i16> [[TMP22]], i32 1 +; SLM-NEXT: [[V11:%.*]] = insertelement <16 x i16> [[V10]], i16 [[TMP36]], i32 11 +; SLM-NEXT: [[TMP37:%.*]] = extractelement <2 x i16> [[TMP23]], i32 0 +; SLM-NEXT: [[V12:%.*]] = insertelement <16 x i16> [[V11]], i16 [[TMP37]], i32 12 +; SLM-NEXT: [[TMP38:%.*]] = extractelement <2 x i16> [[TMP23]], i32 1 +; SLM-NEXT: [[V13:%.*]] = insertelement <16 x i16> [[V12]], i16 [[TMP38]], i32 13 +; SLM-NEXT: [[TMP39:%.*]] = extractelement <2 x i16> [[TMP24]], i32 0 +; SLM-NEXT: [[V14:%.*]] = insertelement <16 x i16> [[V13]], i16 [[TMP39]], i32 14 +; SLM-NEXT: [[TMP40:%.*]] = extractelement <2 x i16> [[TMP24]], i32 1 +; SLM-NEXT: [[V15:%.*]] = insertelement <16 x i16> [[V14]], i16 [[TMP40]], i32 15 ; SLM-NEXT: ret <16 x i16> [[V15]] ; ; AVX-LABEL: @loadext_16i8_to_16i16(