diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -27,6 +27,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" @@ -90,6 +91,7 @@ #include #include #include +#include #include #include #include @@ -195,20 +197,14 @@ /// \returns true if all of the instructions in \p VL are in the same block or /// false otherwise. -static bool allSameBlock(ArrayRef VL) { - Instruction *I0 = dyn_cast(VL[0]); - if (!I0) - return false; +template static bool allSameBlock(T &&VL) { + if (empty(VL)) + return true; + auto *I0 = cast(*VL.begin()); BasicBlock *BB = I0->getParent(); - for (int I = 1, E = VL.size(); I < E; I++) { - auto *II = dyn_cast(VL[I]); - if (!II) - return false; - - if (BB != II->getParent()) - return false; - } - return true; + return all_of(drop_begin(VL, 1), [BB](Value *V) { + return BB == cast(V)->getParent(); + }); } /// \returns True if all of the values in \p VL are constants (but not @@ -397,9 +393,16 @@ /// could be vectorized even if its structure is diverse. static InstructionsState getSameOpcode(ArrayRef VL, unsigned BaseIndex = 0) { - // Make sure these are all Instructions. - if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) + // Make sure these are all Instructions or UndefValues. 
+ if (llvm::any_of(VL, + [](Value *V) { + return !isa(V) && !isa(V); + }) || + llvm::all_of(VL, [](Value *V) { return isa(V); })) return InstructionsState(VL[BaseIndex], nullptr, nullptr); + BaseIndex = + std::distance(VL.begin(), llvm::find_if(llvm::drop_begin(VL, BaseIndex), + Instruction::classof)); bool IsCastOp = isa(VL[BaseIndex]); bool IsBinOp = isa(VL[BaseIndex]); @@ -410,6 +413,8 @@ // Check for one alternate opcode from another BinaryOperator. // TODO - generalize to support all operators (types, calls etc.). for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { + if (isa(VL[Cnt])) + continue; unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); if (IsBinOp && isa(VL[Cnt])) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) @@ -527,9 +532,10 @@ SmallVectorImpl &Mask) { Mask.clear(); const unsigned E = Indices.size(); - Mask.resize(E, E + 1); + Mask.resize(E, UndefMaskElem); for (unsigned I = 0; I < E; ++I) - Mask[Indices[I]] = I; + if (Indices[I] != E + 1) + Mask[Indices[I]] = I; } namespace slpvectorizer { @@ -851,6 +857,7 @@ /// accessing a consecutive address. These strategies are summarized in the /// 'ReorderingMode' enumerator. enum class ReorderingMode { + Unknown, ///< Mode is not defined yet Load, ///< Matching loads to consecutive memory addresses Opcode, ///< Matching instructions based on opcode (same or alternate) Constant, ///< Matching constants @@ -866,6 +873,7 @@ const DataLayout &DL; ScalarEvolution &SE; const BoUpSLP &R; + Instruction &VL0; /// \returns the operand data at \p OpIdx and \p Lane. OperandData &getData(unsigned OpIdx, unsigned Lane) { @@ -1175,6 +1183,8 @@ break; case ReorderingMode::Failed: return None; + case ReorderingMode::Unknown: + llvm_unreachable("Unknown mode is not expected here."); } } @@ -1215,10 +1225,17 @@ // a map. Instead we can simply count the number of operands that // correspond to one of them (in this case the 'true' APO), and calculate // the other by subtracting it from the total number of operands. 
- for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) - if (getData(OpIdx, Lane).APO) + unsigned UndefsCnt = 0; + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + const OperandData &OpData = getData(OpIdx, Lane); + if (isa(OpData.V)) { + ++UndefsCnt; + continue; + } + if (OpData.APO) ++CntTrue; - unsigned CntFalse = NumOperands - CntTrue; + } + unsigned CntFalse = NumOperands - CntTrue - UndefsCnt; return std::max(CntTrue, CntFalse); } @@ -1227,13 +1244,18 @@ assert(!VL.empty() && "Bad VL"); assert((empty() || VL.size() == getNumLanes()) && "Expected same number of lanes"); - assert(isa(VL[0]) && "Expected instruction"); - unsigned NumOperands = cast(VL[0])->getNumOperands(); + unsigned NumOperands = VL0.getNumOperands(); OpsVec.resize(NumOperands); unsigned NumLanes = VL.size(); for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { OpsVec[OpIdx].resize(NumLanes); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + if (isa(VL[Lane])) { + OpsVec[OpIdx][Lane] = { + UndefValue::get(VL0.getOperand(OpIdx)->getType()), false, + false}; + continue; + } assert(isa(VL[Lane]) && "Expected instruction"); // Our tree has just 3 nodes: the root and two operands. // It is therefore trivial to get the APO. We only need to check the @@ -1298,9 +1320,9 @@ public: /// Initialize with all the operands of the instruction vector \p RootVL. - VLOperands(ArrayRef RootVL, const DataLayout &DL, + VLOperands(Instruction &VL0, ArrayRef RootVL, const DataLayout &DL, ScalarEvolution &SE, const BoUpSLP &R) - : DL(DL), SE(SE), R(R) { + : DL(DL), SE(SE), R(R), VL0(VL0) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -1325,7 +1347,8 @@ // Each operand has its own mode. We are using this mode to help us select // the instructions for each lane, so that they match best with the ones // we have selected so far. 
- SmallVector ReorderingModes(NumOperands); + SmallVector ReorderingModes(NumOperands, + ReorderingMode::Unknown); // This is a greedy single-pass algorithm. We are going over each lane // once and deciding on the best order right away with no back-tracking. @@ -1419,6 +1442,8 @@ #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) { switch (RMode) { + case ReorderingMode::Unknown: + return "Unknown"; case ReorderingMode::Load: return "Load"; case ReorderingMode::Opcode: @@ -1506,7 +1531,8 @@ /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. int getGatherCost(FixedVectorType *Ty, - const DenseSet &ShuffledIndices) const; + const DenseSet &ShuffledIndices, + const SparseBitVector<> &IgnoredIndices) const; /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -1526,12 +1552,10 @@ /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. - static void reorderInputsAccordingToOpcode(ArrayRef VL, - SmallVectorImpl &Left, - SmallVectorImpl &Right, - const DataLayout &DL, - ScalarEvolution &SE, - const BoUpSLP &R); + static void reorderInputsAccordingToOpcode( + Instruction &VL0, ArrayRef VL, SmallVectorImpl &Left, + SmallVectorImpl &Right, const DataLayout &DL, + ScalarEvolution &SE, const BoUpSLP &R); struct TreeEntry { using VecTreeTy = SmallVector, 8>; TreeEntry(VecTreeTy &Container) : Container(Container) {} @@ -1601,15 +1625,19 @@ } /// Set the operands of this bundle in their original order. 
- void setOperandsInOrder() { + void setOperandsInOrder(Instruction *I0) { assert(Operands.empty() && "Already initialized?"); - auto *I0 = cast(Scalars[0]); Operands.resize(I0->getNumOperands()); unsigned NumLanes = Scalars.size(); for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); OpIdx != NumOperands; ++OpIdx) { Operands[OpIdx].resize(NumLanes); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + if (isa(Scalars[Lane])) { + Operands[OpIdx][Lane] = + UndefValue::get(I0->getOperand(OpIdx)->getType()); + continue; + } auto *I = cast(Scalars[Lane]); assert(I->getNumOperands() == NumOperands && "Expected same number of operands"); @@ -1788,8 +1816,9 @@ ReuseShuffleIndices.end()); Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end()); Last->setOperations(S); + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); if (Last->State != TreeEntry::NeedToGather) { - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { assert(!getTreeEntry(V) && "Scalar already in tree!"); ScalarToTreeEntry[V] = Last; } @@ -1801,10 +1830,12 @@ BundleMember->Lane = Lane; ++Lane; } - assert((!Bundle.getValue() || Lane == VL.size()) && + assert((!Bundle.getValue() || + Lane == std::distance(InstructionsOnly.begin(), + InstructionsOnly.end())) && "Bundle and VL out of sync"); } else { - MustGather.insert(VL.begin(), VL.end()); + MustGather.insert(InstructionsOnly.begin(), InstructionsOnly.end()); } if (UserTreeIdx.UserTE) @@ -2529,6 +2560,8 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (isa(Scalar)) + continue; int FoundLane = Lane; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = @@ -2552,9 +2585,12 @@ // Skip in-tree scalars that become vectors if (TreeEntry *UseEntry = getTreeEntry(U)) { - Value *UseScalar = UseEntry->Scalars[0]; + auto *It = llvm::find_if(UseEntry->Scalars, Instruction::classof); + assert(It != UseEntry->Scalars.end() && + "At 
least single instruction is expected."); + Value *UseScalar = *It; // Some in-tree scalars will remain as scalar in vectorized - // instructions. If that is the case, the one in Lane 0 will + // instructions. If that is the case, the one in the first lane will // be used. if (UseScalar != U || !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) { @@ -2602,8 +2638,10 @@ return; } + auto InitialInstructionsOnly = make_filter_range(VL, Instruction::classof); // If all of the operands are identical or constant we have a simple solution. - if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) { + if (allConstant(VL) || isSplat(VL) || + !allSameBlock(InitialInstructionsOnly) || !S.getOpcode()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); return; @@ -2613,7 +2651,7 @@ // the same block. // Don't vectorize ephemeral values. - for (Value *V : VL) { + for (Value *V : InitialInstructionsOnly) { if (EphValues.count(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is ephemeral.\n"); @@ -2639,11 +2677,8 @@ } // Check that none of the instructions in the bundle are already in the tree. - for (Value *V : VL) { - auto *I = dyn_cast(V); - if (!I) - continue; - if (getTreeEntry(I)) { + for (Value *V : InitialInstructionsOnly) { + if (getTreeEntry(V)) { LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); @@ -2654,7 +2689,7 @@ // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. 
- for (Value *V : VL) { + for (Value *V : InitialInstructionsOnly) { if (MustGather.count(V) || is_contained(UserIgnoreList, V)) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); @@ -2680,6 +2715,11 @@ SmallVector UniqueValues; DenseMap UniquePositions; for (Value *V : VL) { + if (isa(V)) { + ReuseShuffleIndicies.emplace_back(UniqueValues.size()); + UniqueValues.emplace_back(V); + continue; + } auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); if (Res.second) @@ -2690,14 +2730,23 @@ ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (NumUniqueScalarValues <= 1 || - !llvm::isPowerOf2_32(NumUniqueScalarValues)) { - LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx); - return; - } + UniqueValues.append(VL.size() - UniqueValues.size(), + UndefValue::get(VL0->getType())); VL = UniqueValues; } + const unsigned NumberOfInstructions = + llvm::count_if(VL, Instruction::classof); + if (NumberOfInstructions <= 1) { + assert(NumberOfInstructions != 0 && + "At least one instruction is expected."); + LLVM_DEBUG( + dbgs() + << "SLP: Gathering due to vectorization of single instruction.\n"); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + return; + } + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); auto &BSRef = BlocksSchedules[BB]; if (!BSRef) @@ -2724,10 +2773,10 @@ auto *PH = cast(VL0); // Check for terminator values (e.g. invoke). 
- for (Value *V : VL) + for (Value *V : InstructionsOnly) for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { - Instruction *Term = dyn_cast( - cast(V)->getIncomingValueForBlock( + auto *Term = + dyn_cast(cast(V)->getIncomingValueForBlock( PH->getIncomingBlock(I))); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() @@ -2749,8 +2798,10 @@ ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast(V)->getIncomingValueForBlock( - PH->getIncomingBlock(I))); + Operands.emplace_back( + isa(V) ? UndefValue::get(V->getType()) + : cast(V)->getIncomingValueForBlock( + PH->getIncomingBlock(I))); TE->setOperand(I, Operands); OperandsVec.push_back(Operands); } @@ -2821,9 +2872,11 @@ // Make sure all loads in the bundle are simple - we can't vectorize // atomic or volatile loads. - SmallVector PointerOps(VL.size()); - auto POIter = PointerOps.begin(); + SmallVector PointerOps(NumberOfInstructions); + auto *POIter = PointerOps.begin(); for (Value *V : VL) { + if (isa(V)) + continue; auto *L = cast(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); @@ -2854,20 +2907,22 @@ dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { + if (Diff && Diff->getAPInt() == (NumberOfInstructions - 1) * Size) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { + CurrentOrder.append(VL.size() - NumberOfInstructions, + VL.size() + 1); // Need to reorder. 
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies, CurrentOrder); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); findRootOrder(CurrentOrder); ++NumOpsWantToKeepOrder[CurrentOrder]; @@ -2877,7 +2932,10 @@ // Vectorizing non-consecutive loads with `llvm.masked.gather`. TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S, UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); + PointerOps.append( + VL.size() - NumberOfInstructions, + UndefValue::get(cast(VL0)->getPointerOperandType())); buildTree_rec(PointerOps, Depth + 1, {TE, 0}); LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n"); return; @@ -2902,7 +2960,7 @@ case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { Type *Ty = cast(V)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); @@ -2917,12 +2975,14 @@ ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast(V)->getOperand(i)); + Operands.push_back(isa(V) + ? 
UndefValue::get(SrcTy) + : cast(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2934,8 +2994,8 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); Type *ComparedTy = VL0->getOperand(0)->getType(); - for (Value *V : VL) { - CmpInst *Cmp = cast(V); + for (Value *V : InstructionsOnly) { + auto *Cmp = cast(V); if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); @@ -2956,10 +3016,15 @@ // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE, *this); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { + if (isa(V)) { + Left.push_back(UndefValue::get(VL0->getOperand(0)->getType())); + Right.push_back(UndefValue::get(VL0->getOperand(1)->getType())); + continue; + } auto *Cmp = cast(V); Value *LHS = Cmp->getOperand(0); Value *RHS = Cmp->getOperand(1); @@ -3003,7 +3068,7 @@ // have the same opcode. if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -3011,20 +3076,46 @@ return; } - TE->setOperandsInOrder(); + SmallVector OperandsVec; for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. 
- for (Value *V : VL) - Operands.push_back(cast(V)->getOperand(i)); - - buildTree_rec(Operands, Depth + 1, {TE, i}); + Value *LastDefined = nullptr; + for (Value *V : VL) { + Value *OpV; + if (isa(V)) { + if (BinaryOperator::isIntDivRem(ShuffleOrOp)) { + if (LastDefined) + OpV = LastDefined; + else + OpV = ConstantInt::get(VL0->getOperand(i)->getType(), 1); + } else { + OpV = UndefValue::get(VL0->getOperand(i)->getType()); + } + } else { + OpV = cast(V)->getOperand(i); + if (isa(OpV) && + BinaryOperator::isIntDivRem(ShuffleOrOp)) { + if (LastDefined) + OpV = LastDefined; + else + OpV = ConstantInt::get(VL0->getOperand(i)->getType(), 1); + } else { + LastDefined = OpV; + } + } + Operands.push_back(OpV); + } + TE->setOperand(i, Operands); + OperandsVec.push_back(Operands); } + for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) + buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx}); return; } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { if (cast(V)->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); @@ -3037,7 +3128,7 @@ // We can't combine several GEPs into one vector if they operate on // different types. Type *Ty0 = VL0->getOperand(0)->getType(); - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { Type *CurTy = cast(V)->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() @@ -3051,8 +3142,8 @@ // We don't combine GEPs with non-constant indexes. 
Type *Ty1 = VL0->getOperand(1)->getType(); - for (Value *V : VL) { - auto Op = cast(V)->getOperand(1); + for (Value *V : InstructionsOnly) { + auto *Op = cast(V)->getOperand(1); if (!isa(Op) || (Op->getType() != Ty1 && Op->getType()->getScalarSizeInBits() > @@ -3070,12 +3161,15 @@ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast(V)->getOperand(i)); + Operands.push_back( + isa(V) + ? UndefValue::get(VL0->getOperand(i)->getType()) + : cast(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -3086,11 +3180,16 @@ llvm::Type *ScalarTy = cast(VL0)->getValueOperand()->getType(); // Make sure all stores in the bundle are simple - we can't vectorize // atomic or volatile stores. - SmallVector PointerOps(VL.size()); + SmallVector PointerOps(NumberOfInstructions); ValueList Operands(VL.size()); auto POIter = PointerOps.begin(); auto OIter = Operands.begin(); for (Value *V : VL) { + if (isa(V)) { + *OIter = UndefValue::get(VL0->getOperand(0)->getType()); + ++OIter; + continue; + } auto *SI = cast(V); if (!SI->isSimple()) { BS.cancelScheduling(VL, VL0); @@ -3106,46 +3205,50 @@ } OrdersType CurrentOrder; + if (!llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); + return; + } // Check the order of pointer operands. 
- if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) { - Value *Ptr0; - Value *PtrN; + Value *Ptr0; + Value *PtrN; + if (CurrentOrder.empty()) { + Ptr0 = PointerOps.front(); + PtrN = PointerOps.back(); + } else { + Ptr0 = PointerOps[CurrentOrder.front()]; + PtrN = PointerOps[CurrentOrder.back()]; + } + const SCEV *Scev0 = SE->getSCEV(Ptr0); + const SCEV *ScevN = SE->getSCEV(PtrN); + const auto *Diff = dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); + uint64_t Size = DL->getTypeAllocSize(ScalarTy); + // Check that the sorted pointer operands are consecutive. + if (Diff && Diff->getAPInt() == (NumberOfInstructions - 1) * Size) { if (CurrentOrder.empty()) { - Ptr0 = PointerOps.front(); - PtrN = PointerOps.back(); + // Original stores are consecutive and does not require reordering. + ++NumOpsWantToKeepOriginalOrder; + TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, + UserTreeIdx, ReuseShuffleIndicies); + TE->setOperandsInOrder(VL0); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); } else { - Ptr0 = PointerOps[CurrentOrder.front()]; - PtrN = PointerOps[CurrentOrder.back()]; - } - const SCEV *Scev0 = SE->getSCEV(Ptr0); - const SCEV *ScevN = SE->getSCEV(PtrN); - const auto *Diff = - dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); - uint64_t Size = DL->getTypeAllocSize(ScalarTy); - // Check that the sorted pointer operands are consecutive. - if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) { - if (CurrentOrder.empty()) { - // Original stores are consecutive and does not require reordering. 
- ++NumOpsWantToKeepOriginalOrder; - TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, - UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); - buildTree_rec(Operands, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); - } else { - TreeEntry *TE = - newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndicies, CurrentOrder); - TE->setOperandsInOrder(); - buildTree_rec(Operands, Depth + 1, {TE, 0}); - LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); - findRootOrder(CurrentOrder); - ++NumOpsWantToKeepOrder[CurrentOrder]; - } - return; + CurrentOrder.append(VL.size() - NumberOfInstructions, VL.size() + 1); + TreeEntry *TE = + newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies, CurrentOrder); + TE->setOperandsInOrder(VL0); + buildTree_rec(Operands, Depth + 1, {TE, 0}); + LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n"); + findRootOrder(CurrentOrder); + ++NumOpsWantToKeepOrder[CurrentOrder]; } + return; } - BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); @@ -3177,6 +3280,13 @@ if (hasVectorInstrinsicScalarOpd(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); for (Value *V : VL) { + if (isa(V)) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); + return; + } CallInst *CI2 = dyn_cast(V); if (!CI2 || CI2->getCalledFunction() != F || getVectorIntrinsicIDForCall(CI2, TLI) != ID || @@ -3186,8 +3296,8 @@ BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V - << "\n"); + LLVM_DEBUG(dbgs() + << "SLP: mismatched calls:" << *CI << "!=" << *V << "\n"); return; } // Some intrinsics have scalar arguments and should be same in order for @@ -3222,7 
+3332,7 @@ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. @@ -3251,7 +3361,7 @@ // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this); + reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE, *this); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -3259,12 +3369,15 @@ return; } - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. for (Value *V : VL) - Operands.push_back(cast(V)->getOperand(i)); + Operands.push_back( + isa(V) + ? UndefValue::get(VL0->getOperand(i)->getType()) + : cast(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -3337,9 +3450,6 @@ NElts = cast(Vec->getType())->getNumElements(); } - if (NElts != VL.size()) - return false; - // Check that all of the indices extract from the correct offset. bool ShouldKeepOrder = true; unsigned E = VL.size(); @@ -3349,8 +3459,10 @@ // consecutive access in the extract instructions, by checking that no // element of CurrentOrder still has value E + 1. CurrentOrder.assign(E, E + 1); - unsigned I = 0; - for (; I < E; ++I) { + unsigned I = 0, End = std::min(NElts, E); + for (; I < End; ++I) { + if (isa(VL[I])) + continue; auto *Inst = cast(VL[I]); if (Inst->getOperand(0) != Vec) break; @@ -3369,10 +3481,15 @@ CurrentOrder[I] = I; } } - if (I < E) { + // Gather if not all extracts are from the same vector/aggregate. 
+ if (I < End || (NElts < E && !all_of(drop_begin(VL, NElts), [](Value *V) { + return isa(V); + }))) { CurrentOrder.clear(); return false; } + if (NElts != E) + return false; return ShouldKeepOrder; } @@ -3416,20 +3533,31 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef VL = E->Scalars; + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); + const unsigned NumOfInstructions = + std::distance(InstructionsOnly.begin(), InstructionsOnly.end()); + Value *V0; Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (CmpInst *CI = dyn_cast(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); + FixedVectorType *VecTy; + if (!llvm::empty(InstructionsOnly)) { + V0 = *InstructionsOnly.begin(); + if (StoreInst *SI = dyn_cast(V0)) + ScalarTy = SI->getValueOperand()->getType(); + else if (CmpInst *CI = dyn_cast(V0)) + ScalarTy = CI->getOperand(0)->getType(); + VecTy = FixedVectorType::get(ScalarTy, VL.size()); + + // If we have computed a smaller type for the expression, update VecTy so + // that the costs will be accurate. + if (MinBWs.count(V0)) { + VecTy = FixedVectorType::get( + IntegerType::get(F->getContext(), MinBWs[V0].first), VL.size()); + } + } else { + VecTy = FixedVectorType::get(ScalarTy, VL.size()); + } TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - // If we have computed a smaller type for the expression, update VecTy so - // that the costs will be accurate. 
- if (MinBWs.count(VL[0])) - VecTy = FixedVectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); - unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); int ReuseShuffleCost = 0; @@ -3444,21 +3572,27 @@ return ReuseShuffleCost + TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); } - if (E->getOpcode() == Instruction::ExtractElement && - allSameType(VL) && allSameBlock(VL)) { - Optional ShuffleKind = isShuffle(VL); - if (ShuffleKind.hasValue()) { - int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); - for (auto *V : VL) { + if (E->getOpcode() == Instruction::ExtractElement && allSameType(VL) && + allSameBlock(InstructionsOnly)) { + Optional ShuffleKind = + NumOfInstructions > 1 + ? isShuffle(llvm::to_vector<4>(InstructionsOnly)) + : None; + if (NumOfInstructions == 1 || ShuffleKind) { + int Cost = NumOfInstructions > 1 + ? TTI->getShuffleCost(*ShuffleKind, VecTy) + : 0; + for (Value *V : InstructionsOnly) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the // vectorized tree. 
if (areAllUsersVectorized(cast(V)) && !ScalarToTreeEntry.count(V)) { - auto *IO = cast( - cast(V)->getIndexOperand()); - Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, + auto *EE = cast(V); + auto *IO = cast(EE->getIndexOperand()); + Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, + EE->getVectorOperandType(), IO->getZExtValue()); } } @@ -3470,7 +3604,8 @@ assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && "Unhandled state"); - assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL"); + assert(E->getOpcode() && allSameType(VL) && allSameBlock(InstructionsOnly) && + "Invalid VL"); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); @@ -3484,12 +3619,14 @@ if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { + if (isa(VL[I])) + continue; if (ShuffleOrOp == Instruction::ExtractElement) { - auto *IO = cast( - cast(VL[I])->getIndexOperand()); + auto *EE = cast(VL[I]); + auto *IO = cast(EE->getIndexOperand()); Idx = IO->getZExtValue(); ReuseShuffleCost -= TTI->getVectorInstrCost( - Instruction::ExtractElement, VecTy, Idx); + Instruction::ExtractElement, EE->getVectorOperandType(), Idx); } else { ReuseShuffleCost -= TTI->getVectorInstrCost( Instruction::ExtractElement, VecTy, Idx); @@ -3497,24 +3634,38 @@ } } Idx = ReuseShuffleNumbers; - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { if (ShuffleOrOp == Instruction::ExtractElement) { - auto *IO = cast( - cast(V)->getIndexOperand()); + auto *EE = cast(V); + auto *IO = cast(EE->getIndexOperand()); Idx = IO->getZExtValue(); + ReuseShuffleCost += TTI->getVectorInstrCost( + Instruction::ExtractElement, EE->getVectorOperandType(), Idx); } else { --Idx; + ReuseShuffleCost += TTI->getVectorInstrCost( + Instruction::ExtractElement, VecTy, Idx); } - ReuseShuffleCost += - 
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } DeadCost = ReuseShuffleCost; } else if (!E->ReorderIndices.empty()) { DeadCost = TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); } +#ifndef NDEBUG + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); + assert(Reuse && E->ReorderIndices.empty() || + (!Reuse && CurrentOrder.size() == E->ReorderIndices.size() && + std::equal(CurrentOrder.begin(), CurrentOrder.end(), + E->ReorderIndices.begin())) && + "The sequence of extract elements must be reused or shuffled " + "with the same mask."); +#endif for (unsigned I = 0, E = VL.size(); I < E; ++I) { - Instruction *EI = cast(VL[I]); + if (isa(VL[I])) + continue; + auto *EI = cast(VL[I]); // If all users are going to be vectorized, instruction can be // considered as dead. // The same, if have only one user, it will be vectorized for sure. @@ -3536,8 +3687,16 @@ continue; } } - DeadCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *EE = cast(EI); + auto *IO = cast(EE->getIndexOperand()); + unsigned Idx = IO->getZExtValue(); + DeadCost -= TTI->getVectorInstrCost( + Instruction::ExtractElement, EE->getVectorOperandType(), Idx); + } else { + DeadCost -= + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); + } } } return DeadCost; @@ -3559,11 +3718,12 @@ TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, TTI::getCastContextHint(VL0), CostKind, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } // Calculate the cost of this instruction. 
- int ScalarCost = VL.size() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size()); int VecCost = 0; @@ -3585,10 +3745,11 @@ TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(), CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size()); - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; // Check if all entries in VL are either compares or selects with compares // as condition that have the same predicates. @@ -3663,24 +3824,27 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. - ConstantInt *CInt0 = nullptr; + Constant *C0 = nullptr; for (unsigned i = 0, e = VL.size(); i < e; ++i) { + if (isa(VL[i])) + continue; const Instruction *I = cast(VL[i]); unsigned OpIdx = isa(I) ? 1 : 0; ConstantInt *CInt = dyn_cast(I->getOperand(OpIdx)); - if (!CInt) { + Constant *UV = dyn_cast(I->getOperand(OpIdx)); + if (!CInt && !UV) { Op2VK = TargetTransformInfo::OK_AnyValue; Op2VP = TargetTransformInfo::OP_None; break; } if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) + (UV || !cast(CInt)->getValue().isPowerOf2())) Op2VP = TargetTransformInfo::OP_None; if (i == 0) { - CInt0 = CInt; + C0 = CInt ? CInt : UV; continue; } - if (CInt0 != CInt) + if (C0 != (CInt ? 
CInt : UV)) Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } @@ -3689,9 +3853,10 @@ E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; int VecCost = TTI->getArithmeticInstrCost( E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0); @@ -3708,9 +3873,10 @@ TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; int VecCost = TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind, Op1VK, Op2VK); @@ -3724,20 +3890,59 @@ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } - int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarLdCost = NumOfInstructions * ScalarEltCost; int VecLdCost; + const auto *FirstInstr = llvm::find_if(VL, Instruction::classof); + const auto LastInstr = + llvm::find_if(llvm::reverse(VL), Instruction::classof); + unsigned InstrDist = std::distance(FirstInstr, LastInstr.base()); + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(InstrDist); + bool ShuffledLoadInstructions = false; if (E->State == TreeEntry::Vectorize) { - VecLdCost = TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, - CostKind, VL0); + if 
(!IsPowOf2NumOfInstructions) { + IntrinsicCostAttributes Attrs( + Intrinsic::masked_load, VecTy, + {VecTy->getPointerTo(), Builder.getInt32Ty(), + FixedVectorType::get(Builder.getInt1Ty(), + VecTy->getNumElements()), + VecTy}); + VecLdCost = TTI->getIntrinsicInstrCost(Attrs, CostKind); + } else { + auto *LoadVecTy = VecTy; + if (InstrDist != VL.size()) + LoadVecTy = FixedVectorType::get(ScalarTy, InstrDist); + VecLdCost = TTI->getMemoryOpCost(Instruction::Load, LoadVecTy, + alignment, 0, CostKind, VL0); + if (!NeedToShuffleReuses && InstrDist != VL.size()) { + VecLdCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + ShuffledLoadInstructions = true; + } + } } else { assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); + Align CommonAlignment = alignment; + for (Value *V : InstructionsOnly) + CommonAlignment = + commonAlignment(CommonAlignment, cast(V)->getAlign()); + + unsigned NormalizedSz = llvm::PowerOf2Ceil(InstrDist); VecLdCost = TTI->getGatherScatterOpCost( - Instruction::Load, VecTy, cast(VL0)->getPointerOperand(), - /*VariableMask=*/false, alignment, CostKind, VL0); + Instruction::Load, FixedVectorType::get(ScalarTy, NormalizedSz), + cast(VL0)->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, CostKind, VL0); + // Cost of resizing the loaded elements to the size of the vector. 
+ if (!NeedToShuffleReuses && NormalizedSz != VL.size()) { + VecLdCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + ShuffledLoadInstructions = true; + } } - if (!NeedToShuffleReuses && !E->ReorderIndices.empty()) + if (!NeedToShuffleReuses && !E->ReorderIndices.empty() && + !ShuffledLoadInstructions) VecLdCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost)); @@ -3752,9 +3957,28 @@ int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0); - int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - int VecStCost = TTI->getMemoryOpCost(Instruction::Store, - VecTy, Alignment, 0, CostKind, VL0); + int ScalarStCost = NumOfInstructions * ScalarEltCost; + int VecStCost; + const auto *FirstInstr = llvm::find_if(VL, Instruction::classof); + const auto LastInstr = + llvm::find_if(llvm::reverse(VL), Instruction::classof); + unsigned InstrDist = std::distance(FirstInstr, LastInstr.base()); + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(InstrDist); + if (!IsPowOf2NumOfInstructions) { + IntrinsicCostAttributes Attrs( + Intrinsic::masked_store, Builder.getVoidTy(), + {VecTy, VecTy->getPointerTo(), Builder.getInt32Ty(), + FixedVectorType::get(Builder.getInt1Ty(), + VecTy->getNumElements())}); + VecStCost = TTI->getIntrinsicInstrCost(Attrs, CostKind); + } else if (InstrDist == VL.size()) { + VecStCost = TTI->getMemoryOpCost(Instruction::Store, VecTy, Alignment, + 0, CostKind, VL0); + } else { + auto *StoreVecTy = FixedVectorType::get(ScalarTy, InstrDist); + VecStCost = TTI->getMemoryOpCost(Instruction::Store, StoreVecTy, + Alignment, 0, CostKind, VL0); + } if (IsReorder) VecStCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); @@ -3769,9 +3993,10 @@ IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1); int ScalarEltCost =
TTI->getIntrinsicInstrCost(CostAttrs, CostKind); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } - int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCallCost = NumOfInstructions * ScalarEltCost; auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI); int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second); @@ -3792,16 +4017,18 @@ int ScalarCost = 0; if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { - Instruction *I = cast(VL[Idx]); + if (isa(VL[Idx])) + continue; + auto *I = cast(VL[Idx]); ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind); } - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { Instruction *I = cast(V); ReuseShuffleCost += TTI->getInstructionCost(I, CostKind); } } - for (Value *V : VL) { - Instruction *I = cast(V); + for (Value *V : InstructionsOnly) { + auto *I = cast(V); assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost(I, CostKind); } @@ -4105,12 +4332,16 @@ } int BoUpSLP::getGatherCost(FixedVectorType *Ty, - const DenseSet &ShuffledIndices) const { + const DenseSet &ShuffledIndices, + const SparseBitVector<> &IgnoredIndices) const { unsigned NumElts = Ty->getNumElements(); APInt DemandedElts = APInt::getNullValue(NumElts); - for (unsigned I = 0; I < NumElts; ++I) + for (unsigned I = 0; I < NumElts; ++I) { + if (IgnoredIndices.test(I)) + continue; if (!ShuffledIndices.count(I)) DemandedElts.setBit(I); + } int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true, /*Extract*/ false); if (!ShuffledIndices.empty()) @@ -4128,27 +4359,30 @@ // Check if the same elements are inserted several times and count them as // shuffle candidates. 
DenseSet ShuffledElements; + SparseBitVector<> IgnoredElements; DenseSet UniqueElements; // Iterate in reverse order to consider insert elements with the high cost. for (unsigned I = VL.size(); I > 0; --I) { unsigned Idx = I - 1; + if (isa(VL[Idx])) { + IgnoredElements.set(Idx); + continue; + } if (!UniqueElements.insert(VL[Idx]).second) ShuffledElements.insert(Idx); } - return getGatherCost(VecTy, ShuffledElements); + return getGatherCost(VecTy, ShuffledElements, IgnoredElements); } // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. -void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef VL, - SmallVectorImpl &Left, - SmallVectorImpl &Right, - const DataLayout &DL, - ScalarEvolution &SE, - const BoUpSLP &R) { +void BoUpSLP::reorderInputsAccordingToOpcode( + Instruction &VL0, ArrayRef VL, SmallVectorImpl &Left, + SmallVectorImpl &Right, const DataLayout &DL, ScalarEvolution &SE, + const BoUpSLP &R) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE, R); + VLOperands Ops(VL0, VL, DL, SE, R); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); @@ -4156,15 +4390,17 @@ } void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) { + auto InstructionsOnly = make_filter_range(E->Scalars, Instruction::classof); + if (llvm::empty(InstructionsOnly)) + return; // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(E->Scalars.begin(), E->Scalars.end()), - [=](Value *V) -> bool { - auto *I = cast(V); - return !E->isOpcodeOrAlt(I) || I->getParent() == BB; - })); + assert(llvm::all_of(InstructionsOnly, [=](Value *V) -> bool { + auto *I = cast(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB; + })); // The last instruction in the bundle in program order. 
Instruction *LastInst = nullptr; @@ -4174,8 +4410,8 @@ // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back())); + auto *Bundle = BlocksSchedules[BB]->getScheduleData( + E->isOneOf(*llvm::reverse(InstructionsOnly).begin())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -4201,7 +4437,8 @@ // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). if (!LastInst) { - SmallPtrSet Bundle(E->Scalars.begin(), E->Scalars.end()); + SmallPtrSet Bundle(InstructionsOnly.begin(), + InstructionsOnly.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I)) LastInst = &I; @@ -4224,6 +4461,10 @@ Value *Vec = UndefValue::get(VecTy); unsigned InsIndex = 0; for (Value *Val : VL) { + if (isa(Val)) { + ++InsIndex; + continue; + } Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++)); auto *InsElt = dyn_cast(Vec); if (!InsElt) @@ -4250,26 +4491,27 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { InstructionsState S = getSameOpcode(VL); if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { - if (E->isSame(VL)) { - Value *V = vectorizeTree(E); - if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { - // We need to get the vectorized value but without shuffle. - if (auto *SV = dyn_cast(V)) { - V = SV->getOperand(0); - } else { - // Reshuffle to get only unique values. 
- SmallVector UniqueIdxs; - SmallSet UsedIdxs; - for (int Idx : E->ReuseShuffleIndices) - if (UsedIdxs.insert(Idx).second) - UniqueIdxs.emplace_back(Idx); - V = Builder.CreateShuffleVector(V, UniqueIdxs); - } - } - return V; + // Check that every instruction appears once in this bundle. + SmallVector UniqueValues; + DenseMap UniquePositions; + UniqueValues.reserve(VL.size()); + for (Value *V : VL) { + if (isa(V)) { + UniqueValues.emplace_back(V); + continue; } + auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + if (Res.second) + UniqueValues.emplace_back(V); + } + if (UniqueValues.size() != VL.size()) { + LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); + UniqueValues.append(VL.size() - UniqueValues.size(), + UndefValue::get(VL[0]->getType())); } + if (TreeEntry *E = getTreeEntry(S.OpValue)) + if (E->isSame(UniqueValues)) + return vectorizeTree(E); } // Check that every instruction appears once in this bundle. @@ -4338,7 +4580,8 @@ SmallVector NewMask(SubMask.size(), SubMask.size()); int TermValue = std::min(Mask.size(), SubMask.size()); for (int I = 0, E = SubMask.size(); I < E; ++I) { - if (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue) { + if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem || + Mask[SubMask[I]] >= TermValue) { NewMask[I] = E; continue; } @@ -4370,6 +4613,14 @@ } ShuffleInstructionBuilder ShuffleBuilder(Builder); + Instruction *VL0 = E->getMainOp(); + Type *ScalarTy = VL0->getType(); + if (auto *Store = dyn_cast(VL0)) + ScalarTy = Store->getValueOperand()->getType(); + auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); + if (isa(VL0)) + return UndefValue::get(VecTy); + bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { setInsertPointAfterBundle(E); @@ -4389,13 +4640,9 @@ assert((E->State == TreeEntry::Vectorize || E->State == TreeEntry::ScatterVectorize) && "Unhandled state"); + auto InstructionsOnly = make_filter_range(E->Scalars, 
Instruction::classof); unsigned ShuffleOrOp = E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode(); - Instruction *VL0 = E->getMainOp(); - Type *ScalarTy = VL0->getType(); - if (auto *Store = dyn_cast(VL0)) - ScalarTy = Store->getValueOperand()->getType(); - auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size()); switch (ShuffleOrOp) { case Instruction::PHI: { auto *PH = cast(VL0); @@ -4442,12 +4689,12 @@ return V; } case Instruction::ExtractValue: { - auto *LI = cast(E->getSingleOperand(0)); + auto *LI = cast(VL0->getOperand(0)); Builder.SetInsertPoint(LI); auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign()); - Value *NewV = propagateMetadata(V, E->Scalars); + Value *NewV = propagateMetadata(V, llvm::to_vector<4>(InstructionsOnly)); ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); NewV = ShuffleBuilder.finalize(NewV); @@ -4583,7 +4830,7 @@ RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); + V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly)); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -4602,31 +4849,86 @@ setInsertPointAfterBundle(E); LoadInst *LI = cast(VL0); - Instruction *NewLI; unsigned AS = LI->getPointerAddressSpace(); Value *PO = LI->getPointerOperand(); + auto *FirstInstr = llvm::find_if(E->Scalars, Instruction::classof); + auto *LastInstr = + llvm::find_if(llvm::reverse(E->Scalars), Instruction::classof).base(); + unsigned NumOfInstructions = std::distance(FirstInstr, LastInstr); + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(NumOfInstructions); + Value *VecPtr; + Instruction *VecLI; + Value *V; if (E->State == TreeEntry::Vectorize) { - - Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); - + if 
(IsPowOf2NumOfInstructions) { + VecPtr = Builder.CreateBitCast( + PO, FixedVectorType::get(ScalarTy, NumOfInstructions) + ->getPointerTo(AS)); + VecLI = Builder.CreateAlignedLoad( + FixedVectorType::get(ScalarTy, NumOfInstructions), VecPtr, + LI->getAlign()); + V = propagateMetadata(VecLI, llvm::to_vector<4>(InstructionsOnly)); + if (PowerOf2Ceil(NumOfInstructions) != E->Scalars.size()) { + SmallVector ExtendedIndices(E->Scalars.size(), + NumOfInstructions); + const unsigned Dist = std::distance(E->Scalars.begin(), FirstInstr); + auto *LI = FirstInstr; + for (unsigned I = 0; I < NumOfInstructions; ++I, ++LI) { + if (!isa(*LI)) + ExtendedIndices[I] = Dist + I; + } + ShuffleBuilder.addMask(ExtendedIndices); + } + } else { + VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS)); + SmallVector Mask; + Mask.reserve(E->Scalars.size()); + for (auto *V : E->Scalars) + Mask.emplace_back(Builder.getInt1(!isa(V))); + SmallVector Passthrough( + E->Scalars.size(), UndefValue::get(LI->getType())); + VecLI = Builder.CreateMaskedLoad(VecPtr, LI->getAlign(), + ConstantVector::get(Mask), + ConstantVector::get(Passthrough)); + V = propagateMetadata(VecLI, llvm::to_vector<4>(InstructionsOnly)); + } // The pointer operand uses an in-tree scalar so we add the new BitCast // to ExternalUses list to make sure that an extract will be generated // in the future. if (getTreeEntry(PO)) ExternalUses.emplace_back(PO, cast(VecPtr), 0); - - NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign()); } else { assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state"); + unsigned NormalizedSz = llvm::PowerOf2Ceil(NumOfInstructions); Value *VecPtr = vectorizeTree(E->getOperand(0)); + if (NormalizedSz != E->Scalars.size()) { + // Reduce the original vector to optimize masked gather. 
+ SmallVector RedMask(NormalizedSz, 0); + std::iota(RedMask.begin(), RedMask.end(), 0); + VecPtr = Builder.CreateShuffleVector(VecPtr, RedMask); + } // Use the minimum alignment of the gathered loads. Align CommonAlignment = LI->getAlign(); - for (Value *V : E->Scalars) + for (Value *V : InstructionsOnly) CommonAlignment = commonAlignment(CommonAlignment, cast(V)->getAlign()); - NewLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment); + SmallVector Mask; + Mask.reserve(E->Scalars.size()); + Mask.append(NumOfInstructions, Builder.getInt1(/*V=*/true)); + Mask.append(NormalizedSz - NumOfInstructions, + Builder.getInt1(/*V=*/false)); + SmallVector Passthrough(NormalizedSz, + UndefValue::get(LI->getType())); + VecLI = Builder.CreateMaskedGather(VecPtr, CommonAlignment, + ConstantVector::get(Mask), + ConstantVector::get(Passthrough)); + V = propagateMetadata(VecLI, llvm::to_vector<4>(InstructionsOnly)); + if (NormalizedSz != E->Scalars.size()) { + SmallVector Mask(E->Scalars.size(), NormalizedSz); + std::iota(Mask.begin(), Mask.begin() + NumOfInstructions, 0); + ShuffleBuilder.addMask(Mask); + } } - Value *V = propagateMetadata(NewLI, E->Scalars); ShuffleBuilder.addInversedMask(E->ReorderIndices); ShuffleBuilder.addMask(E->ReuseShuffleIndices); @@ -4648,10 +4950,42 @@ VecValue = ShuffleBuilder.finalize(VecValue); Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast( - ScalarPtr, VecValue->getType()->getPointerTo(AS)); - StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr, - SI->getAlign()); + + auto *FirstInstr = llvm::find_if(E->Scalars, Instruction::classof); + auto *LastInstr = + llvm::find_if(llvm::reverse(E->Scalars), Instruction::classof).base(); + unsigned NumOfInstructions = std::distance(FirstInstr, LastInstr); + if (PowerOf2Ceil(NumOfInstructions) != E->Scalars.size()) { + SmallVector ExtendedIndices(NumOfInstructions, + NumOfInstructions); + const unsigned Dist = std::distance(E->Scalars.begin(), FirstInstr); + auto 
*VI = FirstInstr; + for (unsigned I = 0; I < NumOfInstructions; ++I, ++VI) { + if (!isa(*VI)) + ExtendedIndices[I] = Dist + I; + } + VecValue = Builder.CreateShuffleVector( + VecValue, UndefValue::get(VecValue->getType()), ExtendedIndices, + "values.extend"); + } + Value *VecPtr; + Instruction *VecSI; + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(NumOfInstructions); + if (IsPowOf2NumOfInstructions) { + VecPtr = Builder.CreateBitCast( + ScalarPtr, FixedVectorType::get(ScalarTy, NumOfInstructions) + ->getPointerTo(AS)); + VecSI = Builder.CreateAlignedStore(VecValue, VecPtr, SI->getAlign()); + } else { + VecPtr = Builder.CreateBitCast(ScalarPtr, + VecValue->getType()->getPointerTo(AS)); + SmallVector Mask; + Mask.reserve(E->Scalars.size()); + for (auto *V : E->Scalars) + Mask.emplace_back(Builder.getInt1(!isa(V))); + VecSI = Builder.CreateMaskedStore(VecValue, VecPtr, SI->getAlign(), + ConstantVector::get(Mask)); + } // The pointer operand uses an in-tree scalar, so add the new BitCast to // ExternalUses to make sure that an extract will be generated in the @@ -4659,7 +4993,7 @@ if (getTreeEntry(ScalarPtr)) ExternalUses.push_back(ExternalUser(ScalarPtr, cast(VecPtr), 0)); - Value *V = propagateMetadata(ST, E->Scalars); + Value *V = propagateMetadata(VecSI, llvm::to_vector<4>(InstructionsOnly)); E->VectorizedValue = V; ++NumVectorInstructions; @@ -4684,6 +5018,8 @@ ->getPointerOperandType() ->getScalarType()); for (Value *&V : VL) { + if (isa(V)) + continue; auto *CI = cast(V); V = ConstantExpr::getIntegerCast(CI, Ty, CI->getValue().isSignBitSet()); @@ -4695,7 +5031,7 @@ Value *V = Builder.CreateGEP( cast(VL0)->getSourceElementType(), Op0, OpVecs); if (Instruction *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); + V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly)); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -4810,6 +5146,11 @@ unsigned e = E->Scalars.size(); SmallVector Mask(e); for (unsigned i = 0; 
i < e; ++i) { + if (isa(E->Scalars[i])) { + Mask[i] = i; + OpScalars.push_back(E->Scalars[i]); + continue; + } auto *OpInst = cast(E->Scalars[i]); assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); if (OpInst->getOpcode() == E->getAltOpcode()) { @@ -4826,7 +5167,7 @@ Value *V = Builder.CreateShuffleVector(V0, V1, Mask); if (Instruction *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); + V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly)); ShuffleBuilder.addMask(E->ReuseShuffleIndices); V = ShuffleBuilder.finalize(V); @@ -4976,6 +5317,8 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (isa(Scalar)) + continue; #ifndef NDEBUG Type *Ty = Scalar->getType(); @@ -5101,14 +5444,15 @@ bool ReSchedule = false; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); // Make sure that the scheduling region contains all // instructions of the bundle. - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { if (!extendSchedulingRegion(V, S)) return None; } - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); @@ -5187,7 +5531,9 @@ LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); - assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && + assert(Bundle->isSchedulingEntity() && + (Bundle->isPartOfBundle() || + llvm::count_if(VL, Instruction::classof) == 1) && "tried to unbundle something which is not a bundle"); // Un-bundle: make single instructions out of the bundle. 
@@ -5494,7 +5840,9 @@ I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst) != nullptr) && + (getTreeEntry(SD->Inst) != nullptr && + llvm::count_if(getTreeEntry(SD->Inst)->Scalars, + Instruction::classof) > 1) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { @@ -5957,6 +6305,37 @@ return Changed; } +/// Order may have elements assigned special value (size + 1) which is out of +/// bounds. Such indices only appear on places which correspond to undef values +/// (see canReuseExtract for details) and are used so that undef values have no +/// effect on operands ordering. +/// The first loop below simply finds all unused indices and then the next loop +/// nest assigns these indices to the positions of undef values. +/// As an example, Order below has two undef positions and they are assigned +/// values 3 and 7 respectively: +/// before: 6 9 5 4 9 2 1 0 +/// after: 6 3 5 4 7 2 1 0 +static void fixupOrderingIndicies(SmallVectorImpl &Order) { + const unsigned Sz = Order.size(); + SmallBitVector UsedIndices(Sz); + const unsigned BoundVal = Sz + 1; + for (unsigned I : Order) + if (I != BoundVal) + UsedIndices[I] = true; + unsigned Idx = 0; + for (unsigned &I : Order) { + if (I == BoundVal) { + // Find first non-used index. + for (; Idx != Sz; ++Idx) + if (!UsedIndices[Idx]) + break; + // Set correct index.
+ I = Idx; + ++Idx; + } + } +} + bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, unsigned Idx) { LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size() @@ -5965,9 +6344,19 @@ const unsigned MinVF = R.getMinVecRegSize() / Sz; unsigned VF = Chain.size(); - if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF) + unsigned NewSize = PowerOf2Ceil(VF); + if (!isPowerOf2_32(Sz) || VF < 2 || NewSize < MinVF) return false; + SmallVector FixedChain; + if (NewSize != VF) { + FixedChain.reserve(NewSize); + FixedChain.append(Chain.begin(), Chain.end()); + FixedChain.append(NewSize - Chain.size(), + UndefValue::get(Chain[0]->getType())); + Chain = FixedChain; + VF = NewSize; + } LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx << "\n"); @@ -5975,9 +6364,11 @@ Optional> Order = R.bestOrder(); // TODO: Handle orders of size less than number of elements in the vector. if (Order && Order->size() == Chain.size()) { + SmallVector NewOrder(Order->begin(), Order->end()); + fixupOrderingIndicies(NewOrder); // TODO: reorder tree nodes without tree rebuilding. SmallVector ReorderedOps(Chain.rbegin(), Chain.rend()); - llvm::transform(*Order, ReorderedOps.begin(), + llvm::transform(NewOrder, ReorderedOps.begin(), [Chain](const unsigned Idx) { return Chain[Idx]; }); R.buildTree(ReorderedOps); } @@ -6074,8 +6465,9 @@ // register size is a power-of-2? 
unsigned StartIdx = 0; for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) { - for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) { - ArrayRef Slice = makeArrayRef(Operands).slice(Cnt, Size); + for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + 2 <= E;) { + ArrayRef Slice = + makeArrayRef(Operands).slice(Cnt, std::min(Size, E - Cnt)); if (!VectorizedStores.count(Slice.front()) && !VectorizedStores.count(Slice.back()) && vectorizeStoreChain(Slice, R, Cnt)) { @@ -6177,9 +6569,9 @@ } } - unsigned Sz = R.getVectorElementSize(I0); - unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); - unsigned MaxVF = std::max(PowerOf2Floor(VL.size()), MinVF); + unsigned Pow2VL = PowerOf2Ceil(VL.size()); + unsigned MinVF = 2; + unsigned MaxVF = Pow2VL; if (MaxVF < 2) { R.getORE()->emit([&]() { return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) @@ -6216,7 +6608,7 @@ else OpsWidth = VF; - if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) + if (OpsWidth < 2 || (VF > MinVF && OpsWidth <= VF / 2)) break; ArrayRef Ops = VL.slice(I, OpsWidth); @@ -6229,6 +6621,17 @@ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n"); + SmallVector FixedChain; + if (OpsWidth != VF && (CompensateUseCost || !AllowReorder)) { + unsigned NewSize = VF; + FixedChain.reserve(NewSize); + FixedChain.append(Ops.begin(), Ops.end()); + FixedChain.append(NewSize - Ops.size(), + UndefValue::get(Ops[0]->getType())); + Ops = FixedChain; + } + assert(Ops.size() == VF && + "Operations must have same size as vectorization factor."); R.buildTree(Ops); Optional> Order = R.bestOrder(); @@ -7052,9 +7455,11 @@ assert(Order->size() == VL.size() && "Order size must be the same as number of vectorized " "instructions."); + SmallVector NewOrder(Order->begin(), Order->end()); + fixupOrderingIndicies(NewOrder); // TODO: reorder tree nodes without tree rebuilding. 
SmallVector ReorderedOps(VL.size()); - llvm::transform(*Order, ReorderedOps.begin(), + llvm::transform(NewOrder, ReorderedOps.begin(), [VL](const unsigned Idx) { return VL[Idx]; }); V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); } @@ -7156,6 +7561,14 @@ return VectorizedTree != nullptr; } + SmallVector getCopyOfExtraArgValues() const { + SmallVector Args(ExtraArgs.size()); + std::transform( + ExtraArgs.begin(), ExtraArgs.end(), Args.begin(), + [](const std::pair &P) { return P.second; }); + return Args; + } + unsigned numReductionValues() const { return ReducedVals.size(); } @@ -7502,6 +7915,15 @@ // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. P = nullptr; + // Try to vectorize ExtraArgs. + // Continue analysis for the instruction from the same basic block + // only to save compile time. + if (++Level < RecursionMaxDepth) + for (auto *Op : HorRdx.getCopyOfExtraArgValues()) + if (VisitedInstrs.insert(Op).second) + if (auto *I = dyn_cast(Op)) + if (!isa(I) && I->getParent() == BB) + Stack.emplace_back(I, Level); continue; } } diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll @@ -3,16 +3,17 @@ define void @f1(<2 x i16> %x, i16* %a) { ; CHECK-LABEL: @f1( -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[X:%.*]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[X]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = 
getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2 -; CHECK-NEXT: ret void +; CHECK-NEXT: store i16 [[T2]], i16* [[A:%.*]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR3]] ; %t2 = extractelement <2 x i16> %x, i32 0 %t3 = extractelement <2 x i16> %x, i32 1 @@ -35,15 +36,17 @@ ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[A]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR3]] ; 
CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] @@ -82,15 +85,17 @@ ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]], align 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: store i16 [[T3]], i16* [[A]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR3]] ; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/ext-trunc.ll @@ -72,14 +72,18 @@ ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i64, i64* 
[[P]], i64 [[A1]] ; CHECK-NEXT: [[LOAD1:%.*]] = load i64, i64* [[GEP1]] ; CHECK-NEXT: [[E2:%.*]] = extractelement <4 x i32> [[SUB0]], i32 2 -; CHECK-NEXT: [[S2:%.*]] = sext i32 [[E2]] to i64 -; CHECK-NEXT: [[A2:%.*]] = add i64 [[S2]], [[C2:%.*]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A2]] -; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]] ; CHECK-NEXT: [[E3:%.*]] = extractelement <4 x i32> [[SUB0]], i32 3 -; CHECK-NEXT: [[S3:%.*]] = sext i32 [[E3]] to i64 -; CHECK-NEXT: [[A3:%.*]] = add i64 [[S3]], [[C3:%.*]] -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[A3]] +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> undef, i32 [[E2]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[E3]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i32> [[TMP1]] to <2 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> undef, i64 [[C2:%.*]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[C3:%.*]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP6]] +; CHECK-NEXT: [[LOAD2:%.*]] = load i64, i64* [[GEP2]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i64, i64* [[P]], i64 [[TMP7]] ; CHECK-NEXT: [[LOAD3:%.*]] = load i64, i64* [[GEP3]] ; CHECK-NEXT: call void @foo(i64 [[LOAD0]], i64 [[LOAD1]], i64 [[LOAD2]], i64 [[LOAD3]]) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll --- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -203,12 +203,10 @@ ; ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: -; MAX-COST-NEXT: [[TMP0:%.*]] = load 
<2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer -; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 -; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 +; MAX-COST-NEXT: [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 +; MAX-COST-NEXT: [[P1:%.*]] = icmp eq i8 [[P0]], 0 +; MAX-COST-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2) to <4 x i8>*), i32 2, <4 x i1> , <4 x i8> undef) +; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <4 x i8> [[TMP0]], ; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 ; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 ; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 @@ -217,22 +215,23 @@ ; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0 ; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 ; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0 +; MAX-COST-NEXT: [[TMP2:%.*]] = insertelement <4 x i1> undef, i1 [[P1]], i32 0 ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0 -; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; MAX-COST-NEXT: [[TMP5:%.*]] 
= insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1 -; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2 -; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3 -; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP2]], i1 [[TMP4]], i32 1 +; MAX-COST-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[TMP6]], i32 2 +; MAX-COST-NEXT: [[TMP8:%.*]] = insertelement <4 x i1> [[TMP7]], i1 [[TMP3]], i32 3 +; MAX-COST-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> , <4 x i32> ; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] -; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] -; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 +; MAX-COST-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P27]] +; MAX-COST-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], [[P29]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP12]], -5 ; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] ; MAX-COST-NEXT: [[P33:%.*]] = select i1 [[P15]], i32 -720, i32 -80 diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll @@ -215,16 
+215,16 @@ ; ; GFX8-LABEL: @uadd_sat_v3i16( ; GFX8-NEXT: bb: -; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2 -; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2 -; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> undef, <2 x i32> -; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]]) +; GFX8-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <3 x i16> [[ARG0:%.*]], <3 x i16> undef, <2 x i32> +; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0]], i64 2 +; GFX8-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <3 x i16> [[ARG1:%.*]], <3 x i16> undef, <2 x i32> +; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1]], i64 2 +; GFX8-NEXT: [[TMP0:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[REORDER_SHUFFLE]], <2 x i16> [[REORDER_SHUFFLE1]]) ; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]]) -; GFX8-NEXT: [[TMP3:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 -; GFX8-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[TMP3]], i64 0 -; GFX8-NEXT: [[TMP4:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1 -; GFX8-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP4]], i64 1 +; GFX8-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[TMP0]], i32 0 +; GFX8-NEXT: [[INS_0:%.*]] = insertelement <3 x i16> undef, i16 [[TMP1]], i64 0 +; GFX8-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[TMP0]], i32 1 +; GFX8-NEXT: [[INS_1:%.*]] = insertelement <3 x i16> [[INS_0]], i16 [[TMP2]], i64 1 ; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[INS_1]], i16 [[ADD_2]], i64 2 ; GFX8-NEXT: ret <3 x i16> [[INS_2]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -7,8 +7,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = phi <8 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) @@ -47,17 +47,17 @@ ; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> undef, i32 [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> undef, i32 [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP15]] = insertelement <8 x i32> [[TMP13]], i32 [[TMP14]], i32 1 ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( ; FORCE_REDUCTION-NEXT: entry: ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> +; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <4 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> ; 
FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 ; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], ; FORCE_REDUCTION-NEXT: [[VAL_20:%.*]] = add i32 [[TMP2]], 1496 @@ -96,13 +96,13 @@ ; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 ; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA27]], [[VAL_39]] ; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 -; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_41]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 14910, i32 1 -; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> +; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[VAL_40]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 14910, i32 1 +; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> undef, i32 [[VAL_41]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP2]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP8]], [[TMP10]] +; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP8]], [[TMP10]] +; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -4,11 +4,11 @@ define void 
@mainTest(i32 %param, i32 * %vals, i32 %len) { ; CHECK-LABEL: @mainTest( ; CHECK-NEXT: bci_15.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[PARAM:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> , i32 [[PARAM:%.*]], i32 1 ; CHECK-NEXT: br label [[BCI_15:%.*]] ; CHECK: bci_15: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = phi <16 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 ; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4 @@ -16,8 +16,8 @@ ; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP4]]) ; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[V44]], i32 0 -; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> undef, i32 [[V44]], i32 0 +; CHECK-NEXT: [[TMP7]] = insertelement <16 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 ; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]] ; CHECK: loopexit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -8,26 +8,24 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE-LABEL: @sitofp_uitofp( -; SSE-NEXT: 
[[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> ; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float -; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float -; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float +; SSE-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[REORDER_SHUFFLE]] to <4 x float> ; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float ; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float ; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float ; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement 
<8 x float> [[R2]], float [[TMP5]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 @@ -81,26 +79,24 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE-LABEL: @fptosi_fptoui( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> ; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; SSE-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; SSE-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; SSE-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; SSE-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[REORDER_SHUFFLE]] to <4 x i32> ; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 +; SSE-NEXT: 
[[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 @@ -135,26 +131,24 @@ ; SLM-NEXT: ret <8 x i32> [[R7]] ; ; AVX-LABEL: @fptosi_fptoui( -; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; AVX-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> ; AVX-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 ; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; AVX-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; AVX-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; AVX-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; AVX-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; AVX-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; AVX-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; AVX-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[REORDER_SHUFFLE]] to <4 x i32> ; AVX-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; AVX-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; AVX-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; AVX-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; AVX-NEXT: [[R1:%.*]] = insertelement 
<8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 ; AVX-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; AVX-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -110,22 +110,22 @@ ; AVX1-LABEL: @ashr_shl_v8i32( ; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 ; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 -; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 -; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 ; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] -; 
AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] -; AVX1-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] +; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <8 x i32> +; AVX1-NEXT: [[TMP5:%.*]] = shl <4 x i32> [[TMP1]], [[TMP2]] +; AVX1-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <8 x i32> +; AVX1-NEXT: [[TMP7:%.*]] = shl <8 x i32> [[A]], [[B]] ; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> +; AVX1-NEXT: [[R3:%.*]] = shufflevector <8 x i32> [[R1]], <8 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: [[R5:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP6]], <8 x i32> +; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R5]], <8 x i32> [[TMP7]], <8 x i32> ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @ashr_shl_v8i32( @@ -177,19 +177,19 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-LABEL: @ashr_shl_v8i32_const( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], -; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = ashr <4 x i32> 
[[REORDER_SHUFFLE]], +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], +; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; AVX1-LABEL: @ashr_shl_v8i32_const( -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], -; AVX1-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[REORDER_SHUFFLE]], +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], +; AVX1-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @ashr_shl_v8i32_const( @@ -293,25 +293,18 @@ ; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX2-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 ; AVX2-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]] ; AVX2-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX2-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX2-NEXT: [[TMP6:%.*]] = 
extractelement <4 x i32> [[TMP3]], i32 0 -; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX2-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX2-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 -; AVX2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 +; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP3]], i32 2 +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 +; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP4]], i32 3 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 +; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP5]], i32 4 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 +; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP6]], i32 5 ; AVX2-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX2-NEXT: ret <8 x i32> [[R7]] @@ -321,25 +314,18 @@ ; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX512-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 ; AVX512-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = 
shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]] ; AVX512-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX512-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX512-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 -; AVX512-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 +; AVX512-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 +; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP3]], i32 2 +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 +; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP4]], i32 3 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 +; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP5]], i32 4 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 +; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP6]], 
i32 5 ; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX512-NEXT: ret <8 x i32> [[R7]] @@ -412,26 +398,56 @@ } define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { -; CHECK-LABEL: @sdiv_v8i32_undefs( -; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; CHECK-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 -; CHECK-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; CHECK-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 -; CHECK-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[AB2]], i32 2 -; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[AB3]], i32 3 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; CHECK-NEXT: ret <8 x i32> [[R7]] +; SSE-LABEL: @sdiv_v8i32_undefs( +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SSE-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; SSE-NEXT: 
[[AB2:%.*]] = sdiv i32 [[A2]], 8 +; SSE-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SSE-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; SSE-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; SSE-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; SSE-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 +; SSE-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SSE-NEXT: ret <8 x i32> [[R7]] +; +; AVX1-LABEL: @sdiv_v8i32_undefs( +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX1-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; AVX1-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = sdiv <4 x i32> [[TMP1]], +; AVX1-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 +; AVX1-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 +; AVX1-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 +; AVX1-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[TMP6]], i32 6 +; 
AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @sdiv_v8i32_undefs( +; AVX2-NEXT: [[TMP1:%.*]] = sdiv <8 x i32> [[A:%.*]], +; AVX2-NEXT: ret <8 x i32> [[TMP1]] +; +; AVX512-LABEL: @sdiv_v8i32_undefs( +; AVX512-NEXT: [[TMP1:%.*]] = sdiv <8 x i32> [[A:%.*]], +; AVX512-NEXT: ret <8 x i32> [[TMP1]] ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cmp_commute.ll @@ -237,23 +237,22 @@ define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) { ; CHECK-LABEL: @fcmp_ord_uno_v4i32( ; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 ; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 ; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 -; CHECK-NEXT: [[B1:%.*]] = load float, float* [[P1]], align 4 -; CHECK-NEXT: [[B2:%.*]] = load float, float* [[P2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 ; CHECK-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] -; CHECK-NEXT: [[C1:%.*]] = fcmp uno float [[B1]], [[A1]] -; CHECK-NEXT: [[C2:%.*]] = fcmp uno float [[B2]], [[A2]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> 
+; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] ; CHECK-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0 -; CHECK-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1 -; CHECK-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[C2]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[TMP6]], i32 2 ; CHECK-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3 ; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -55,35 +55,32 @@ ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ] +; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; AVX-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 
-; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 -; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer -; AVX-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]] -; AVX-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], -; AVX-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> -; AVX-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], -; AVX-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer -; AVX-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> , <2 x float> [[TMP14]] -; AVX-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 -; AVX-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 -; AVX-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[TMP17]], i32 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1 -; AVX-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], -; AVX-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> -; AVX-NEXT: [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], -; AVX-NEXT: [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> , <2 x float> [[TMP21]] +; AVX-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> 
[[REORDER_SHUFFLE]], [[TMP3]] +; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], +; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> +; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> , <2 x float> [[TMP10]] +; AVX-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; AVX-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; AVX-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]] +; AVX-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[TMP13]], i32 0 +; AVX-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1 +; AVX-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], +; AVX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> +; AVX-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], +; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 ; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AVX: for.end: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -131,10 +131,9 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double undef, i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = fadd 
<2 x double> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll @@ -4,25 +4,13 @@ define i32 @crash_reordering_undefs() { ; CHECK-LABEL: @crash_reordering_undefs( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[OR0:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]] -; CHECK-NEXT: [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD1:%.*]] = add i32 undef, [[ADD0]] -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 undef, undef -; CHECK-NEXT: [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD1]], [[ADD2]] -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 undef, undef -; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD5:%.*]] = add i32 [[ADD3]], [[ADD4]] -; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD5]], undef -; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD6]], undef -; CHECK-NEXT: [[ADD8:%.*]] = add i32 [[ADD7]], undef -; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]] -; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[ADD8]], [[ADD9]] -; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD10]], undef -; CHECK-NEXT: ret i32 [[ADD11]] +; CHECK-NEXT: [[TMP0:%.*]] = call i32 
@llvm.vector.reduce.add.v4i32(<4 x i32> ) +; CHECK-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP0]], undef +; CHECK-NEXT: [[OP_EXTRA1:%.*]] = add i32 [[OP_EXTRA]], undef +; CHECK-NEXT: [[OP_EXTRA2:%.*]] = add i32 [[OP_EXTRA1]], undef +; CHECK-NEXT: [[OP_EXTRA3:%.*]] = add i32 [[OP_EXTRA2]], undef +; CHECK-NEXT: [[OP_EXTRA4:%.*]] = add i32 [[OP_EXTRA3]], undef +; CHECK-NEXT: ret i32 [[OP_EXTRA4]] ; entry: %or0 = or i64 undef, undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -31,15 +31,14 @@ ; CHECK: cond.false66.us: ; CHECK-NEXT: [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, undef ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[ADD_I276_US]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double undef, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], -; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> undef, [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], +; CHECK-NEXT: [[TMP4:%.*]] = fmul <2 x double> undef, [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[AGG_TMP99208_SROA_0_0_IDX]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP6]], align 8 -; CHECK-NEXT: [[TMP7:%.*]] = bitcast double* [[AGG_TMP101211_SROA_0_0_IDX]] to <2 x double>* -; CHECK-NEXT: store <2 x 
double> [[TMP5]], <2 x double>* [[TMP7]], align 8 ; CHECK-NEXT: unreachable ; CHECK: cond.true63.us: ; CHECK-NEXT: unreachable diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -18,21 +18,15 @@ ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], +; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[SHUFFLE]], +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[SHUFFLE1]], ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>* -; CHECK-NEXT: store <2 x 
double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <4 x double>* +; CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[TMP4]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/extract.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract.ll @@ -54,14 +54,11 @@ ; CHECK-LABEL: @fextr2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load <4 x double>, <4 x double>* undef -; CHECK-NEXT: [[V0:%.*]] = extractelement <4 x double> [[LD]], i32 0 -; CHECK-NEXT: [[V1:%.*]] = extractelement <4 x double> [[LD]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x double> [[LD]], <4 x double> undef, <2 x i32> ; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[P0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[REORDER_SHUFFLE]], +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[P0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/geps-non-pow-2.ll @@ -11,36 +11,33 @@ ; CHECK-NEXT: [[TOBOOL_NOT19:%.*]] = icmp eq i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_NOT19]], label [[WHILE_END:%.*]], label [[WHILE_BODY:%.*]] ; CHECK: while.body: -; 
CHECK-NEXT: [[C_022:%.*]] = phi i32* [ [[C_022_BE:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32*> [ [[TMP14:%.*]], [[WHILE_BODY_BACKEDGE]] ], [ undef, [[ENTRY]] ] -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 1 -; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint i32* [[C_022]] to i64 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: switch i32 [[TMP3]], label [[WHILE_BODY_BACKEDGE]] [ +; CHECK-NEXT: [[TMP1:%.*]] = phi <4 x i32*> [ [[TMP16:%.*]], [[WHILE_BODY_BACKEDGE:%.*]] ], [ undef, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32*> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = ptrtoint i32* [[TMP2]] to i64 +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x i32*> [[TMP1]], <4 x i64> +; CHECK-NEXT: switch i32 [[TMP4]], label [[WHILE_BODY_BACKEDGE]] [ ; CHECK-NEXT: i32 2, label [[SW_BB:%.*]] ; CHECK-NEXT: i32 4, label [[SW_BB6:%.*]] ; CHECK-NEXT: ] ; CHECK: sw.bb: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = ptrtoint i32* [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 1 -; CHECK-NEXT: store i32 [[TMP7]], i32* [[TMP9]], align 4 -; CHECK-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32*> [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = ptrtoint i32* [[TMP6]] to i64 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[TMP7]] to i32 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32*> [[TMP5]], i32 2 +; CHECK-NEXT: store i32 [[TMP8]], i32* [[TMP9]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, <4 
x i32*> [[TMP1]], <4 x i64> ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: sw.bb6: -; CHECK-NEXT: [[INCDEC_PTR8:%.*]] = getelementptr inbounds i32, i32* [[C_022]], i64 2 -; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint i32* [[INCDEC_PTR]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, <2 x i32*> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i32*> [[TMP4]], i32 0 -; CHECK-NEXT: store i32 [[TMP11]], i32* [[TMP13]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32*> [[TMP5]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint i32* [[TMP11]] to i64 +; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[TMP12]] to i32 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, <4 x i32*> [[TMP1]], <4 x i64> +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32*> [[TMP5]], i32 1 +; CHECK-NEXT: store i32 [[TMP13]], i32* [[TMP15]], align 4 ; CHECK-NEXT: br label [[WHILE_BODY_BACKEDGE]] ; CHECK: while.body.backedge: -; CHECK-NEXT: [[C_022_BE]] = phi i32* [ [[INCDEC_PTR]], [[WHILE_BODY]] ], [ [[INCDEC_PTR8]], [[SW_BB6]] ], [ [[INCDEC_PTR5]], [[SW_BB]] ] -; CHECK-NEXT: [[TMP14]] = phi <2 x i32*> [ [[TMP4]], [[WHILE_BODY]] ], [ [[TMP12]], [[SW_BB6]] ], [ [[TMP8]], [[SW_BB]] ] +; CHECK-NEXT: [[TMP16]] = phi <4 x i32*> [ [[TMP5]], [[WHILE_BODY]] ], [ [[TMP14]], [[SW_BB6]] ], [ [[TMP10]], [[SW_BB]] ] ; CHECK-NEXT: br label [[WHILE_BODY]] ; CHECK: while.end: ; CHECK-NEXT: ret i32 undef diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -218,42 +218,33 @@ ; Unused insertelement define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 { ; CHECK-LABEL: @simple_select_no_users( -; CHECK-NEXT: [[C0:%.*]] = 
extractelement <4 x i32> [[C:%.*]], i32 0 -; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> undef, <2 x i32> ; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 ; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> ; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 ; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <2 x i32> ; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 ; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x 
float> [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[REORDER_SHUFFLE]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[C3]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[REORDER_SHUFFLE1]], <2 x float> [[REORDER_SHUFFLE2]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[A3]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[B3]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x float> [[TMP7]], <2 x float> [[TMP9]] +; CHECK-NEXT: 
[[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 +; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP13]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP14]], i32 3 ; CHECK-NEXT: ret <4 x float> [[RD]] ; %c0 = extractelement <4 x i32> %c, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/jumbled_store_crash.ll @@ -19,22 +19,24 @@ ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[ARRAYIDX]] to <2 x i32>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 4 +; CHECK-NEXT: [[LOAD_EXTEND1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 13 ; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[ARRAYIDX1]] to <2 x i32>* ; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = sitofp <2 x i32> [[TMP5]] to <2 x float> -; CHECK-NEXT: [[TMP7:%.*]] = fmul <2 x float> [[TMP6]], -; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x float> , [[TMP7]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> 
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[LOAD_EXTEND]], [[LOAD_EXTEND1]] +; CHECK-NEXT: [[TMP6:%.*]] = sitofp <4 x i32> [[TMP5]] to <4 x float> +; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[TMP6]], +; CHECK-NEXT: [[TMP8:%.*]] = fsub <4 x float> , [[TMP7]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 ; CHECK-NEXT: store float [[TMP9]], float* @g, align 4 -; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x float> [[SHUFFLE]], +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 ; CHECK-NEXT: store float [[TMP11]], float* @c, align 4 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 ; CHECK-NEXT: store float [[TMP12]], float* @d, align 4 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x float> [[TMP10]], i32 3 ; CHECK-NEXT: store float [[TMP13]], float* @e, align 4 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x float> [[TMP10]], i32 0 ; CHECK-NEXT: store float [[TMP14]], float* @f, align 4 @@ -42,16 +44,17 @@ ; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 15 ; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* @a, align 4 ; CHECK-NEXT: [[CONV19:%.*]] = sitofp i32 [[TMP15]] to float -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = fadd <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = fsub <4 x float> [[TMP10]], [[TMP18]] -; CHECK-NEXT: 
[[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = fptosi <4 x float> [[TMP21]] to <4 x i32> -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP22]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[REORDER_SHUFFLE]], <4 x i32>* [[TMP23]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> , float [[CONV19]], i32 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <4 x float> [[TMP16]], float [[TMP17]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x float> [[TMP18]], float -1.000000e+00, i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = fadd <4 x float> [[TMP10]], [[TMP19]] +; CHECK-NEXT: [[TMP21:%.*]] = fsub <4 x float> [[TMP10]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP21]], <4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = fptosi <4 x float> [[TMP22]] to <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP23]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[ARRAYIDX1]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[REORDER_SHUFFLE]], <4 x i32>* [[TMP24]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -54,15 +54,16 @@ ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* -; 
CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[X2]], i32 2 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2 +; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[TMP5]], i32 2 +; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[TMP5]], i32 3 ; CHECK-NEXT: ret <4 x float> [[I3]] ; %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi_user.ll @@ -15,21 +15,21 @@ define i32 @foo(i32* nocapture %A, i32 %n) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: [[TMP1:%.*]] = mul nsw i32 [[N:%.*]], 5 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP1]], i32 1 -; CHECK-NEXT: 
[[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP1]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP5]], -; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[A:%.*]] to <4 x i32>* -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[A]] to <4 x i32>* -; CHECK-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* [[TMP10]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP1]], 11 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 4 -; CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[TMP12]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = add nsw i32 [[TMP11]], [[TMP13]] -; CHECK-NEXT: store i32 [[TMP14]], i32* [[TMP12]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = add nsw i32 [[TMP1]], 7 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[A:%.*]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw i32 [[TMP2]], [[TMP3]] +; CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP1]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i32>, <4 x i32>* [[TMP11]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* +; CHECK-NEXT: store <4 x i32> [[TMP13]], <4 x i32>* [[TMP14]], align 4 ; CHECK-NEXT: ret i32 undef ; %1 = mul nsw i32 %n, 5 diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -264,32 +264,24 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32000 x float], [32000 x float]* @a, i32 0, i32 0), align 16 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: -; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[ARRAYIDX5]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4 ; CHECK-NEXT: 
[[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP13]] -; CHECK-NEXT: [[TMP14]] = load float, float* [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i32 3 -; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]] -; CHECK-NEXT: store float [[MUL45]], float* [[ARRAYIDX31]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[ARRAYIDX]] to <8 x float>* +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP5]], i32 4, <8 x i1> , <8 x float> undef) +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = fmul <8 x float> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[ARRAYIDX5]] to <8 x float>* +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP10]], i32 4, <8 x i1> ) +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP11]], 31995 +; CHECK-NEXT: [[TMP12]] = extractelement <8 x float> [[TMP6]], i32 4 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/partail.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/partail.ll @@ -18,26 +18,23 @@ ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = icmp 
sgt <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[SHR15]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 undef, i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 undef, i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 undef, i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[TMP7]], undef -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP7]], <4 x i32> undef -; CHECK-NEXT: [[TMP10:%.*]] = sext <4 x i32> [[TMP9]] to <4 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP19]] +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], undef +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP4]], <4 x i32> undef +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[TMP6]] to <4 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = 
trunc <4 x i64> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP16]] ; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/phi.ll @@ -140,49 +140,53 @@ define float @foo3(float* nocapture readonly %A) #0 { ; CHECK-LABEL: @foo3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <8 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP0]], i32 4, <8 x i1> , <8 x float> undef) 
+; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 -; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] -; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = phi <8 x float> [ [[SHUFFLE]], [[ENTRY]] ], [ [[TMP32:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 6 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 5 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>* -; CHECK-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP11]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> 
undef, float [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP13]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP4]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], -; CHECK-NEXT: [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX14]] to <4 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP6]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[SHUFFLE1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x float> undef, float [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[SHUFFLE1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[SHUFFLE1]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float 8.000000e+00, i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float 7.000000e+00, i32 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x float> , float [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <8 x float> [[TMP16]], float [[TMP3]], i32 4 +; CHECK-NEXT: [[TMP18:%.*]] = fmul <8 x float> [[TMP15]], [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = fadd <8 x float> [[TMP2]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = trunc i64 
[[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP20]], 121 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x float> [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <8 x float> undef, float [[TMP21]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x float> [[TMP19]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <8 x float> [[TMP22]], float [[TMP23]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x float> [[TMP19]], i32 2 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x float> [[TMP24]], float [[TMP25]], i32 2 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[TMP19]], i32 3 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP27]], i32 3 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x float> [[TMP19]], i32 4 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <8 x float> [[TMP28]], float [[TMP29]], i32 4 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <8 x float> [[TMP30]], float [[TMP8]], i32 5 +; CHECK-NEXT: [[TMP32]] = insertelement <8 x float> [[TMP31]], float [[TMP10]], i32 6 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP18]], i32 3 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP18]], i32 2 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP18]], i32 1 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 0 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]] +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[TMP29]], [[TMP27]] +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP25]] +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP23]] +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP21]] ; CHECK-NEXT: ret 
float [[ADD31]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll @@ -13,9 +13,10 @@ ; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>* ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = fadd fast <4 x float> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP3]], i32 0 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[REORDER_SHUFFLE]], i32 0 ; CHECK-NEXT: [[VECIN0:%.*]] = insertelement <2 x float> undef, float [[TMP4]], i64 0 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[REORDER_SHUFFLE]], i32 1 ; CHECK-NEXT: [[VECIN1:%.*]] = insertelement <2 x float> [[VECIN0]], float [[TMP5]], i64 1 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP3]], i32 2 ; CHECK-NEXT: [[VECIN2:%.*]] = insertelement <2 x float> undef, float [[TMP6]], i64 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -219,28 +219,28 @@ ; AVX2-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 ; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 ; AVX2-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0 -; AVX2-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX2-NEXT: [[TMP9:%.*]] = call <4 x i32> 
@llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX2-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX2-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX2-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX2-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX2-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX2-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX2-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP7:%.*]] = load i32, i32* [[TMP6]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 2 +; AVX2-NEXT: store i32 [[TMP8]], i32* [[TMP5]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP11:%.*]] = load i32, i32* [[TMP10]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 3 +; 
AVX2-NEXT: store i32 [[TMP12]], i32* [[TMP9]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 15 +; AVX2-NEXT: [[TMP15:%.*]] = load i32, i32* [[TMP14]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 4 +; AVX2-NEXT: store i32 [[TMP16]], i32* [[TMP13]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP18:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0 +; AVX2-NEXT: [[TMP19:%.*]] = shufflevector <4 x i32*> [[TMP18]], <4 x i32*> undef, <4 x i32> zeroinitializer +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr i32, <4 x i32*> [[TMP19]], <4 x i64> +; AVX2-NEXT: [[TMP21:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP20]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] +; AVX2-NEXT: [[TMP22:%.*]] = add <4 x i32> [[TMP21]], +; AVX2-NEXT: [[TMP23:%.*]] = bitcast i32* [[TMP17]] to <4 x i32>* +; AVX2-NEXT: store <4 x i32> [[TMP22]], <4 x i32>* [[TMP23]], align 4, [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_3( @@ -248,28 +248,13 @@ ; AVX512-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 ; AVX512-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP0:%.*]], i64 1 ; AVX512-NEXT: store i32 [[TMP4]], i32* [[TMP0]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32*> [[TMP6]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP8:%.*]] = getelementptr i32, <4 x i32*> [[TMP7]], <4 x i64> -; AVX512-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP8]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX512-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP9]], -; AVX512-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 5 -; AVX512-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP10]], <4 
x i32>* [[TMP12]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 9 -; AVX512-NEXT: [[TMP14:%.*]] = load i32, i32* [[TMP13]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP15:%.*]] = add i32 [[TMP14]], 2 -; AVX512-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 6 -; AVX512-NEXT: store i32 [[TMP15]], i32* [[TMP11]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 6 -; AVX512-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP17]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP19:%.*]] = add i32 [[TMP18]], 3 -; AVX512-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], i64 7 -; AVX512-NEXT: store i32 [[TMP19]], i32* [[TMP16]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i64 21 -; AVX512-NEXT: [[TMP22:%.*]] = load i32, i32* [[TMP21]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP23:%.*]] = add i32 [[TMP22]], 4 -; AVX512-NEXT: store i32 [[TMP23]], i32* [[TMP20]], align 4, [[TBAA0]] +; AVX512-NEXT: [[TMP6:%.*]] = insertelement <2 x i32*> undef, i32* [[TMP1]], i32 0 +; AVX512-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32*> [[TMP6]], <2 x i32*> undef, <8 x i32> +; AVX512-NEXT: [[TMP7:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64> +; AVX512-NEXT: [[TMP8:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP7]], i32 4, <8 x i1> , <8 x i32> undef), [[TBAA0]] +; AVX512-NEXT: [[TMP9:%.*]] = add <8 x i32> [[TMP8]], +; AVX512-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP9]], <8 x i32>* [[TMP10]], i32 4, <8 x i1> ), [[TBAA0]] ; AVX512-NEXT: ret void ; %3 = load i32, i32* %1, align 4, !tbaa !2 @@ -398,60 +383,45 @@ ; ; AVX2-LABEL: @gather_load_4( ; AVX2-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> undef, i32* [[T1:%.*]], i32 0 
+; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i32, i32* [[T1:%.*]], i64 11 +; AVX2-NEXT: [[T9:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 2 +; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 4 +; AVX2-NEXT: [[T13:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 3 +; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 15 +; AVX2-NEXT: [[T17:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 4 +; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> undef, i32* [[T1]], i32 0 ; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX2-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX2-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX2-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> ; AVX2-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] +; AVX2-NEXT: [[T7:%.*]] = load i32, i32* [[T6]], align 4, [[TBAA0]] +; AVX2-NEXT: [[T11:%.*]] = load i32, i32* [[T10]], align 4, [[TBAA0]] +; AVX2-NEXT: [[T15:%.*]] = load i32, i32* [[T14]], align 4, [[TBAA0]] ; AVX2-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX2-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX2-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] ; AVX2-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX2-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX2-NEXT: 
[[T24:%.*]] = add i32 [[T23]], 2 -; AVX2-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX2-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 +; AVX2-NEXT: [[T8:%.*]] = add i32 [[T7]], 2 +; AVX2-NEXT: [[T12:%.*]] = add i32 [[T11]], 3 +; AVX2-NEXT: [[T16:%.*]] = add i32 [[T15]], 4 +; AVX2-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], ; AVX2-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* +; AVX2-NEXT: store i32 [[T8]], i32* [[T5]], align 4, [[TBAA0]] +; AVX2-NEXT: store i32 [[T12]], i32* [[T9]], align 4, [[TBAA0]] +; AVX2-NEXT: store i32 [[T16]], i32* [[T13]], align 4, [[TBAA0]] +; AVX2-NEXT: [[TMP6:%.*]] = bitcast i32* [[T17]] to <4 x i32>* ; AVX2-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX2-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512-LABEL: @gather_load_4( ; AVX512-NEXT: [[T5:%.*]] = getelementptr inbounds i32, i32* [[T0:%.*]], i64 1 -; AVX512-NEXT: [[TMP1:%.*]] = insertelement <4 x i32*> undef, i32* [[T1:%.*]], i32 0 -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32*> [[TMP1]], <4 x i32*> undef, <4 x i32> zeroinitializer -; AVX512-NEXT: [[TMP3:%.*]] = getelementptr i32, <4 x i32*> [[TMP2]], <4 x i64> -; AVX512-NEXT: [[T21:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 5 -; AVX512-NEXT: [[T22:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 9 -; AVX512-NEXT: [[T25:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 6 -; AVX512-NEXT: [[T26:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 6 -; AVX512-NEXT: [[T29:%.*]] = getelementptr inbounds i32, i32* [[T0]], i64 7 -; AVX512-NEXT: [[T30:%.*]] = getelementptr inbounds i32, i32* [[T1]], i64 21 +; AVX512-NEXT: [[TMP1:%.*]] = insertelement <2 x i32*> undef, i32* [[T1:%.*]], i32 0 +; AVX512-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x 
i32*> [[TMP1]], <2 x i32*> undef, <8 x i32> +; AVX512-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x i32*> [[SHUFFLE]], <8 x i64> ; AVX512-NEXT: [[T3:%.*]] = load i32, i32* [[T1]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP3]], i32 4, <4 x i1> , <4 x i32> undef), [[TBAA0]] -; AVX512-NEXT: [[T23:%.*]] = load i32, i32* [[T22]], align 4, [[TBAA0]] -; AVX512-NEXT: [[T27:%.*]] = load i32, i32* [[T26]], align 4, [[TBAA0]] -; AVX512-NEXT: [[T31:%.*]] = load i32, i32* [[T30]], align 4, [[TBAA0]] +; AVX512-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> [[TMP2]], i32 4, <8 x i1> , <8 x i32> undef), [[TBAA0]] ; AVX512-NEXT: [[T4:%.*]] = add i32 [[T3]], 1 -; AVX512-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP4]], -; AVX512-NEXT: [[T24:%.*]] = add i32 [[T23]], 2 -; AVX512-NEXT: [[T28:%.*]] = add i32 [[T27]], 3 -; AVX512-NEXT: [[T32:%.*]] = add i32 [[T31]], 4 +; AVX512-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP3]], ; AVX512-NEXT: store i32 [[T4]], i32* [[T0]], align 4, [[TBAA0]] -; AVX512-NEXT: [[TMP6:%.*]] = bitcast i32* [[T5]] to <4 x i32>* -; AVX512-NEXT: store <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T24]], i32* [[T21]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T28]], i32* [[T25]], align 4, [[TBAA0]] -; AVX512-NEXT: store i32 [[T32]], i32* [[T29]], align 4, [[TBAA0]] +; AVX512-NEXT: [[TMP5:%.*]] = bitcast i32* [[T5]] to <8 x i32>* +; AVX512-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP4]], <8 x i32>* [[TMP5]], i32 4, <8 x i1> ), [[TBAA0]] ; AVX512-NEXT: ret void ; %t5 = getelementptr inbounds i32, i32* %t0, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -11,31 +11,30 @@ 
; CHECK-NEXT: ret void ; CHECK: bb2: ; CHECK-NEXT: [[T:%.*]] = select i1 undef, i16 undef, i16 15 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> undef, i16 [[T]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 undef, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> , [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef -; CHECK-NEXT: [[SHUFFLE5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE5]], -; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP5]]) -; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[T]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], undef +; CHECK-NEXT: [[SHUFFLE5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE5]], +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP4]]) +; CHECK-NEXT: [[T19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef ; CHECK-NEXT: [[T20:%.*]] = icmp sgt i32 [[T19]], 63 -; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <2 x i32> undef, [[TMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP7]], undef -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[SHUFFLE]], -; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP9]]) -; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], undef -; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef -; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA]], undef -; CHECK-NEXT: [[OP_EXTRA1:%.*]] = 
select i1 [[TMP12]], i32 [[OP_EXTRA]], i32 undef -; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[OP_EXTRA1]], undef -; CHECK-NEXT: [[OP_EXTRA2:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA1]], i32 undef -; CHECK-NEXT: [[TMP14:%.*]] = icmp slt i32 [[OP_EXTRA2]], undef -; CHECK-NEXT: [[OP_EXTRA3:%.*]] = select i1 [[TMP14]], i32 [[OP_EXTRA2]], i32 undef -; CHECK-NEXT: [[TMP15:%.*]] = icmp slt i32 [[OP_EXTRA3]], undef -; CHECK-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[TMP15]], i32 [[OP_EXTRA3]], i32 undef +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[TMP6]], undef +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = icmp slt i32 [[TMP9]], undef +; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 undef +; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[OP_EXTRA]], undef +; CHECK-NEXT: [[OP_EXTRA1:%.*]] = select i1 [[TMP11]], i32 [[OP_EXTRA]], i32 undef +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA1]], undef +; CHECK-NEXT: [[OP_EXTRA2:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA1]], i32 undef +; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[OP_EXTRA2]], undef +; CHECK-NEXT: [[OP_EXTRA3:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA2]], i32 undef +; CHECK-NEXT: [[TMP14:%.*]] = icmp slt i32 [[OP_EXTRA3]], undef +; CHECK-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[TMP14]], i32 [[OP_EXTRA3]], i32 undef ; CHECK-NEXT: [[T45:%.*]] = icmp sgt i32 undef, [[OP_EXTRA4]] ; CHECK-NEXT: unreachable ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/resched.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resched.ll @@ -19,63 +19,52 @@ ; CHECK-NEXT: [[ARRAYIDX_I_I7_5_I_I:%.*]] = getelementptr 
inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 5 ; CHECK-NEXT: [[ARRAYIDX_I_I7_6_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 6 ; CHECK-NEXT: [[ARRAYIDX_I_I7_7_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 7 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> undef, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[CONV31_I]], i32 4 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[CONV31_I]], i32 5 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[CONV31_I]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[CONV31_I]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = lshr <8 x i32> [[TMP8]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_8_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 8 ; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9 ; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10 ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], 
i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 -; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 ; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 -; CHECK-NEXT: [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14 ; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 -; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP31:%.*]] = 
insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15 -; CHECK-NEXT: [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8> -; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i8> [[TMP43]], +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = lshr <16 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[TMP2]], i32 14 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> [[TMP4]], i32 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <16 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x i32> [[TMP6]], i32 [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> [[TMP8]], i32 [[TMP9]], i32 3 +; CHECK-NEXT: 
[[TMP11:%.*]] = extractelement <16 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> [[TMP10]], i32 [[TMP11]], i32 4 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP2]], i32 4 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP13]], i32 5 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i32> [[TMP2]], i32 5 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP15]], i32 6 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <16 x i32> [[TMP2]], i32 6 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP17]], i32 7 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i32> [[TMP2]], i32 7 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP19]], i32 8 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <16 x i32> [[TMP2]], i32 8 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP21]], i32 9 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <16 x i32> [[TMP2]], i32 9 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP23]], i32 10 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i32> [[TMP2]], i32 10 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[TMP25]], i32 11 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <16 x i32> [[TMP2]], i32 11 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[TMP27]], i32 12 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i32> [[TMP2]], i32 12 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[TMP29]], i32 13 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <16 x i32> [[TMP2]], i32 13 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[TMP31]], i32 14 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[TMP3]], i32 15 +; CHECK-NEXT: [[TMP34:%.*]] = trunc <16 x i32> [[TMP33]] to <16 x i8> +; CHECK-NEXT: [[TMP35:%.*]] = and <16 x i8> [[TMP34]], ; CHECK-NEXT: 
[[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 -; CHECK-NEXT: [[TMP45:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* -; CHECK-NEXT: store <16 x i8> [[TMP44]], <16 x i8>* [[TMP45]], align 1 +; CHECK-NEXT: [[TMP36:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP35]], <16 x i8>* [[TMP36]], align 1 ; CHECK-NEXT: unreachable ; CHECK: if.end50.i: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll @@ -13,11 +13,12 @@ ; CHECK-NEXT: [[T11:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 1, i64 1 ; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[T14]] to <2 x float>* ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4 +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> undef, <4 x i32> ; CHECK-NEXT: br label [[T37:%.*]] ; CHECK: t37: -; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[TMP3:%.*]] ], [ [[T89:%.*]], [[T37]] ] -; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <2 x float> , [[TMP6]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[LOAD_EXTEND]], [[TMP3:%.*]] ], [ [[REORDER_SHUFFLE:%.*]], [[T37]] ] +; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <4 x float> , [[TMP6]] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <4 x i32> ; CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 0 ; CHECK-NEXT: [[T25:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 1 ; CHECK-NEXT: [[T31:%.*]] = getelementptr 
inbounds [[STRUCT_S]], %struct.S* [[T4]], i64 0, i32 2, i64 2 @@ -25,7 +26,8 @@ ; CHECK-NEXT: [[TMP8:%.*]] = bitcast float* [[T21]] to <4 x float>* ; CHECK-NEXT: store <4 x float> [[SHUFFLE]], <4 x float>* [[TMP8]], align 4 ; CHECK-NEXT: [[T88:%.*]] = bitcast float* [[T9]] to <2 x float>* -; CHECK-NEXT: [[T89]] = load <2 x float>, <2 x float>* [[T88]], align 4 +; CHECK-NEXT: [[T89:%.*]] = load <2 x float>, <2 x float>* [[T88]], align 4 +; CHECK-NEXT: [[REORDER_SHUFFLE]] = shufflevector <2 x float> [[T89]], <2 x float> undef, <4 x i32> ; CHECK-NEXT: br i1 undef, label [[T37]], label [[T55:%.*]] ; CHECK: t55: ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/rgb_phi.ll @@ -25,39 +25,37 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <2 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[G_031:%.*]] = 
phi float [ [[TMP1]], [[ENTRY]] ], [ [[ADD9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[R_030:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 7.000000e+00 ; CHECK-NEXT: [[ADD4]] = fadd float [[R_030]], [[MUL]] -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX7]], align 4 -; CHECK-NEXT: [[MUL8:%.*]] = fmul float [[TMP5]], 8.000000e+00 -; CHECK-NEXT: [[ADD9]] = fadd float [[G_031]], [[MUL8]] -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP7]], 9.000000e+00 -; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX7]] to <2 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[REORDER_SHUFFLE1]], +; CHECK-NEXT: [[TMP9]] = fadd <2 x float> [[TMP4]], [[TMP8]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], 121 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP10]], 121 ; 
CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]] ; CHECK: for.body.for.body_crit_edge: ; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[ADD9]] -; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 +; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[TMP12]] ; CHECK-NEXT: ret float [[ADD17]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/schedule-bundle.ll @@ -10,18 +10,10 @@ define i32 @slp_schedule_bundle() local_unnamed_addr #0 { ; CHECK-LABEL: @slp_schedule_bundle( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([1 x i32]* @b to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], -; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4 -; CHECK-NEXT: [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31 -; CHECK-NEXT: [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1 -; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4 -; 
CHECK-NEXT: [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31 -; CHECK-NEXT: [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1 -; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast ([1 x i32]* @b to <8 x i32>*), i32 4, <8 x i1> , <8 x i32> undef) +; CHECK-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i32> [[TMP1]], +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP2]], <8 x i32>* bitcast ([1 x i32]* @a to <8 x i32>*), i32 4, <8 x i1> ) ; CHECK-NEXT: ret i32 undef ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll @@ -27,7 +27,6 @@ ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T28:%.*]] = add nsw i32 [[T15]], [[T9]] ; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 @@ -42,13 +41,19 @@ ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[T50:%.*]] = add nsw i32 [[T40]], [[T48]] -; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[T28]], i32 0 -; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[T50]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[T15]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> undef, i32 [[T9]], i32 0 
+; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i32> [[TMP5]], i32 0 +; CHECK-NEXT: [[T65:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i32> [[TMP5]], i32 1 +; CHECK-NEXT: [[T66:%.*]] = insertelement <8 x i32> [[T65]], i32 [[TMP7]], i32 1 ; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[T66]], i32 [[T32]], i32 2 ; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 -; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[T28]], i32 4 -; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[T50]], i32 5 +; CHECK-NEXT: [[T69:%.*]] = insertelement <8 x i32> [[T68]], i32 [[TMP6]], i32 4 +; CHECK-NEXT: [[T70:%.*]] = insertelement <8 x i32> [[T69]], i32 [[TMP7]], i32 5 ; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T70]], i32 [[T34]], i32 6 ; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll --- a/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll @@ -7,15 +7,15 @@ ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A7:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A8:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> 
[[TMP3]], i32 [[A1:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A2:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A3:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A4:%.*]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A5:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A6:%.*]], i32 7 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A2:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]] @@ -57,15 +57,15 @@ ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A6:%.*]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> 
undef, i32 [[A4:%.*]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A4:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A5:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A8:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A2:%.*]], i32 5 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A2:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A3:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]] @@ -111,15 +111,15 @@ ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A4:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A6:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A5:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A8:%.*]], i32 3 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 
[[A2:%.*]], i32 4 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A7:%.*]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A1:%.*]], i32 6 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A3:%.*]], i32 7 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A3:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A4:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A2:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A1:%.*]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[A8:%.*]], i32 7 ; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i32> [[SHUFFLE]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> [[TMP10]]) ; CHECK-NEXT: ret i32 [[TMP11]] diff --git a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll --- a/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll +++ b/llvm/test/Transforms/SLPVectorizer/slp-max-phi-size.ll @@ -126,12 +126,12 @@ ; MAX256-NEXT: bb: ; MAX256-NEXT: br label [[BB1:%.*]] ; MAX256: bb1: -; MAX256-NEXT: [[TMP0:%.*]] = insertelement <4 x half> undef, half [[HVAL:%.*]], i32 0 -; MAX256-NEXT: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[HVAL]], i32 1 -; MAX256-NEXT: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[HVAL]], i32 2 -; MAX256-NEXT: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[HVAL]], i32 3 -; MAX256-NEXT: [[TMP4:%.*]] = fpext <4 x half> [[TMP3]] to <4 x 
float> -; MAX256-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <8 x i32> +; MAX256-NEXT: [[TMP0:%.*]] = insertelement <8 x half> undef, half [[HVAL:%.*]], i32 0 +; MAX256-NEXT: [[TMP1:%.*]] = insertelement <8 x half> [[TMP0]], half [[HVAL]], i32 1 +; MAX256-NEXT: [[TMP2:%.*]] = insertelement <8 x half> [[TMP1]], half [[HVAL]], i32 2 +; MAX256-NEXT: [[TMP3:%.*]] = insertelement <8 x half> [[TMP2]], half [[HVAL]], i32 3 +; MAX256-NEXT: [[TMP4:%.*]] = fpext <8 x half> [[TMP3]] to <8 x float> +; MAX256-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> undef, <8 x i32> ; MAX256-NEXT: [[TMP5:%.*]] = insertelement <8 x float> undef, float [[FVAL:%.*]], i32 0 ; MAX256-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[FVAL]], i32 1 ; MAX256-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[FVAL]], i32 2 @@ -302,12 +302,12 @@ ; MAX1024-NEXT: bb: ; MAX1024-NEXT: br label [[BB1:%.*]] ; MAX1024: bb1: -; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <4 x half> undef, half [[HVAL:%.*]], i32 0 -; MAX1024-NEXT: [[TMP1:%.*]] = insertelement <4 x half> [[TMP0]], half [[HVAL]], i32 1 -; MAX1024-NEXT: [[TMP2:%.*]] = insertelement <4 x half> [[TMP1]], half [[HVAL]], i32 2 -; MAX1024-NEXT: [[TMP3:%.*]] = insertelement <4 x half> [[TMP2]], half [[HVAL]], i32 3 -; MAX1024-NEXT: [[TMP4:%.*]] = fpext <4 x half> [[TMP3]] to <4 x float> -; MAX1024-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> undef, <32 x i32> +; MAX1024-NEXT: [[TMP0:%.*]] = insertelement <32 x half> undef, half [[HVAL:%.*]], i32 0 +; MAX1024-NEXT: [[TMP1:%.*]] = insertelement <32 x half> [[TMP0]], half [[HVAL]], i32 1 +; MAX1024-NEXT: [[TMP2:%.*]] = insertelement <32 x half> [[TMP1]], half [[HVAL]], i32 2 +; MAX1024-NEXT: [[TMP3:%.*]] = insertelement <32 x half> [[TMP2]], half [[HVAL]], i32 3 +; MAX1024-NEXT: [[TMP4:%.*]] = fpext <32 x half> [[TMP3]] to <32 x float> +; MAX1024-NEXT: [[SHUFFLE:%.*]] = shufflevector <32 x 
float> [[TMP4]], <32 x float> undef, <32 x i32> ; MAX1024-NEXT: [[TMP5:%.*]] = insertelement <32 x float> undef, float [[FVAL:%.*]], i32 0 ; MAX1024-NEXT: [[TMP6:%.*]] = insertelement <32 x float> [[TMP5]], float [[FVAL]], i32 1 ; MAX1024-NEXT: [[TMP7:%.*]] = insertelement <32 x float> [[TMP6]], float [[FVAL]], i32 2