Index: lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1131,8 +1131,8 @@ MachineFunction &MF = B.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo(); - const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || - MI.getOpcode() == AMDGPU::G_FMAXNUM_IEEE; + const bool IsIEEEOp = MI.getOpcode() == TargetOpcode::G_FMINNUM_IEEE || + MI.getOpcode() == TargetOpcode::G_FMAXNUM_IEEE; // With ieee_mode disabled, the instructions have the correct behavior // already for G_FMINNUM/G_FMAXNUM Index: lib/Transforms/Vectorize/SLPVectorizer.cpp =================================================================== --- lib/Transforms/Vectorize/SLPVectorizer.cpp +++ lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -184,6 +184,8 @@ return false; BasicBlock *BB = I0->getParent(); for (int i = 1, e = VL.size(); i < e; i++) { + if (isa(VL[i])) + continue; Instruction *I = dyn_cast(VL[i]); if (!I) return false; @@ -359,9 +361,18 @@ /// could be vectorized even if its structure is diverse. static InstructionsState getSameOpcode(ArrayRef VL, unsigned BaseIndex = 0) { - // Make sure these are all Instructions. - if (llvm::any_of(VL, [](Value *V) { return !isa(V); })) + // Make sure these are all Instructions or UndefValues. + if (llvm::any_of(VL, + [](Value *V) { + return !isa(V) && !isa(V); + }) || + llvm::all_of(VL, [](Value *V) { return isa(V); })) return InstructionsState(VL[BaseIndex], nullptr, nullptr); + for (unsigned I = BaseIndex, E = VL.size(); I < E; I++) + if (isa(VL[I])) { + BaseIndex = I; + break; + } bool IsCastOp = isa(VL[BaseIndex]); bool IsBinOp = isa(VL[BaseIndex]); @@ -372,6 +383,8 @@ // Check for one alternate opcode from another BinaryOperator. // TODO - generalize to support all operators (types, calls etc.). for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) { + if (isa(VL[Cnt])) + continue; unsigned InstOpcode = cast(VL[Cnt])->getOpcode(); if (IsBinOp && isa(VL[Cnt])) { if (InstOpcode == Opcode || InstOpcode == AltOpcode) @@ -695,6 +708,7 @@ /// accessing a consecutive address. These strategies are summarized in the /// 'ReorderingMode' enumerator. enum class ReorderingMode { + Unknown, ///< Mode is not defined yet Load, ///< Matching loads to consecutive memory addresses Opcode, ///< Matching instructions based on opcode (same or alternate) Constant, ///< Matching constants @@ -709,6 +723,7 @@ const DataLayout &DL; ScalarEvolution &SE; + Instruction &VL0; /// \returns the operand data at \p OpIdx and \p Lane. OperandData &getData(unsigned OpIdx, unsigned Lane) { @@ -823,6 +838,8 @@ break; case ReorderingMode::Failed: return None; + case ReorderingMode::Unknown: + llvm_unreachable("Unknown mode is not expected here."); } } @@ -863,10 +880,17 @@ // a map. Instead we can simply count the number of operands that // correspond to one of them (in this case the 'true' APO), and calculate // the other by subtracting it from the total number of operands. - for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) - if (getData(OpIdx, Lane).APO) + unsigned UndefsCnt = 0; + for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { + const OperandData &OpData = getData(OpIdx, Lane); + if (isa(OpData.V)) { + ++UndefsCnt; + continue; + } + if (OpData.APO) ++CntTrue; - unsigned CntFalse = NumOperands - CntTrue; + } + unsigned CntFalse = NumOperands - CntTrue - UndefsCnt; return std::max(CntTrue, CntFalse); } @@ -875,13 +899,18 @@ assert(!VL.empty() && "Bad VL"); assert((empty() || VL.size() == getNumLanes()) && "Expected same number of lanes"); - assert(isa(VL[0]) && "Expected instruction"); - unsigned NumOperands = cast(VL[0])->getNumOperands(); + unsigned NumOperands = VL0.getNumOperands(); OpsVec.resize(NumOperands); unsigned NumLanes = VL.size(); for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) { OpsVec[OpIdx].resize(NumLanes); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + if (isa(VL[Lane])) { + OpsVec[OpIdx][Lane] = { + UndefValue::get(VL0.getOperand(OpIdx)->getType()), false, + false}; + continue; + } assert(isa(VL[Lane]) && "Expected instruction"); // Our tree has just 3 nodes: the root and two operands. // It is therefore trivial to get the APO. We only need to check the @@ -946,9 +975,9 @@ public: /// Initialize with all the operands of the instruction vector \p RootVL. - VLOperands(ArrayRef RootVL, const DataLayout &DL, + VLOperands(Instruction &VL0, ArrayRef RootVL, const DataLayout &DL, ScalarEvolution &SE) - : DL(DL), SE(SE) { + : DL(DL), SE(SE), VL0(VL0) { // Append all the operands of RootVL. appendOperandsOfVL(RootVL); } @@ -973,7 +1002,8 @@ // Each operand has its own mode. We are using this mode to help us select // the instructions for each lane, so that they match best with the ones // we have selected so far. - SmallVector ReorderingModes(NumOperands); + SmallVector ReorderingModes(NumOperands, + ReorderingMode::Unknown); // This is a greedy single-pass algorithm. We are going over each lane // once and deciding on the best order right away with no back-tracking. @@ -1067,6 +1097,8 @@ #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) { switch (RMode) { + case ReorderingMode::Unknown: + return "Unknown"; case ReorderingMode::Load: return "Load"; case ReorderingMode::Opcode: @@ -1145,7 +1177,8 @@ /// \returns the scalarization cost for this type. Scalarization in this /// context means the creation of vectors from a group of scalars. - int getGatherCost(Type *Ty, const DenseSet &ShuffledIndices) const; + int getGatherCost(Type *Ty, const DenseSet &ShuffledIndices, + const DenseSet &IgnoredIndices) const; /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -1166,7 +1199,8 @@ /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. - static void reorderInputsAccordingToOpcode(ArrayRef VL, + static void reorderInputsAccordingToOpcode(Instruction &VL0, + ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right, const DataLayout &DL, @@ -1233,15 +1267,19 @@ } /// Set the operands of this bundle in their original order. - void setOperandsInOrder() { + void setOperandsInOrder(Instruction *I0) { assert(Operands.empty() && "Already initialized?"); - auto *I0 = cast(Scalars[0]); Operands.resize(I0->getNumOperands()); unsigned NumLanes = Scalars.size(); for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands(); OpIdx != NumOperands; ++OpIdx) { Operands[OpIdx].resize(NumLanes); for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + if (isa(Scalars[Lane])) { + Operands[OpIdx][Lane] = + UndefValue::get(I0->getOperand(OpIdx)->getType()); + continue; + } auto *I = cast(Scalars[Lane]); assert(I->getNumOperands() == NumOperands && "Expected same number of operands"); @@ -1319,11 +1357,15 @@ Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); Last->ReorderIndices = ReorderIndices; - if (Vectorized) { - for (int i = 0, e = VL.size(); i != e; ++i) { - assert(!getTreeEntry(VL[i]) && "Scalar already in tree!"); - ScalarToTreeEntry[VL[i]] = Last; + for (Value *V : make_filter_range(VL, Instruction::classof)) { + if (Vectorized) { + assert(!getTreeEntry(V) && "Scalar already in tree!"); + ScalarToTreeEntry[V] = Last; + } else { + MustGather.insert(V); } + } + if (Vectorized) { // Update the scheduler bundle to point to this TreeEntry. unsigned Lane = 0; for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember; @@ -1332,10 +1374,9 @@ BundleMember->Lane = Lane; ++Lane; } - assert((!Bundle.getValue() || Lane == VL.size()) && + assert((!Bundle.getValue() || + Lane == llvm::count_if(VL, Instruction::classof)) && "Bundle and VL out of sync"); - } else { - MustGather.insert(VL.begin(), VL.end()); } if (UserTreeIdx.UserTE) @@ -2020,6 +2061,8 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (isa(Scalar)) + continue; int FoundLane = Lane; if (!Entry->ReuseShuffleIndices.empty()) { FoundLane = @@ -2043,7 +2086,8 @@ // Skip in-tree scalars that become vectors if (TreeEntry *UseEntry = getTreeEntry(U)) { - Value *UseScalar = UseEntry->Scalars[0]; + Value *UseScalar = + *llvm::find_if(UseEntry->Scalars, Instruction::classof); // Some in-tree scalars will remain as scalar in vectorized // instructions. If that is the case, the one in Lane 0 will // be used. @@ -2104,9 +2148,9 @@ // the same block. // Don't vectorize ephemeral values. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (EphValues.count(VL[i])) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (Value *V : VL) { + if (!isa(V) && EphValues.count(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is ephemeral.\n"); newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx); return; @@ -2130,12 +2174,9 @@ } // Check that none of the instructions in the bundle are already in the tree. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - auto *I = dyn_cast(VL[i]); - if (!I) - continue; - if (getTreeEntry(I)) { - LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] + for (Value *V : VL) { + if (!isa(V) && getTreeEntry(V)) { + LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is already in tree.\n"); newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx); return; @@ -2145,8 +2186,9 @@ // If any of the scalars is marked as a value that needs to stay scalar, then // we need to gather the scalars. // The reduction nodes (stored in UserIgnoreList) also should stay scalar. - for (unsigned i = 0, e = VL.size(); i != e; ++i) { - if (MustGather.count(VL[i]) || is_contained(UserIgnoreList, VL[i])) { + for (Value *V : VL) { + if (!isa(V) && + (MustGather.count(V) || is_contained(UserIgnoreList, V))) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n"); newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx); return; @@ -2171,6 +2213,11 @@ SmallVector UniqueValues; DenseMap UniquePositions; for (Value *V : VL) { + if (isa(V)) { + ReuseShuffleIndicies.emplace_back(UniqueValues.size()); + UniqueValues.emplace_back(V); + continue; + } auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); ReuseShuffleIndicies.emplace_back(Res.first->second); if (Res.second) @@ -2180,13 +2227,21 @@ ReuseShuffleIndicies.clear(); } else { LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (UniqueValues.size() <= 1 || !llvm::isPowerOf2_32(UniqueValues.size())) { - LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx); - return; - } + UniqueValues.append(VL.size() - UniqueValues.size(), + UndefValue::get(VL0->getType())); VL = UniqueValues; } + const unsigned NumberOfInstructions = + llvm::count_if(VL, Instruction::classof); + if (NumberOfInstructions <= 1) { + LLVM_DEBUG( + dbgs() + << "SLP: Gathering due to vectorization of single instruction.\n"); + newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx, + ReuseShuffleIndicies); + return; + } + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); auto &BSRef = BlocksSchedules[BB]; if (!BSRef) @@ -2213,11 +2268,11 @@ PHINode *PH = dyn_cast(VL0); // Check for terminator values (e.g. invoke). - for (unsigned j = 0; j < VL.size(); ++j) - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { - Instruction *Term = dyn_cast( - cast(VL[j])->getIncomingValueForBlock( - PH->getIncomingBlock(i))); + for (Value *V : InstructionsOnly) + for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { + auto *Term = + dyn_cast(cast(V)->getIncomingValueForBlock( + PH->getIncomingBlock(I))); if (Term && Term->isTerminator()) { LLVM_DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (terminator use).\n"); @@ -2234,13 +2289,16 @@ // Keeps the reordered operands to avoid code duplication. SmallVector OperandsVec; - for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) { + for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getIncomingValueForBlock( - PH->getIncomingBlock(i))); - TE->setOperand(i, Operands); + for (Value *V : VL) { + Operands.emplace_back( + isa(V) ? UndefValue::get(V->getType()) + : cast(V)->getIncomingValueForBlock( + PH->getIncomingBlock(I))); + } + TE->setOperand(I, Operands); OperandsVec.push_back(Operands); } for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx) @@ -2312,9 +2370,22 @@ // Make sure all loads in the bundle are simple - we can't vectorize // atomic or volatile loads. - SmallVector PointerOps(VL.size()); + SmallVector PointerOps(NumberOfInstructions); auto POIter = PointerOps.begin(); + bool TypeIsSimple = VL0->getType()->isIntegerTy() || + VL0->getType()->isFloatingPointTy() || + VL0->getType()->isPointerTy(); for (Value *V : VL) { + if (isa(V)) { + if (!TypeIsSimple) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n"); + return; + } + continue; + } auto *L = cast(V); if (!L->isSimple()) { BS.cancelScheduling(VL, VL0); @@ -2345,21 +2416,27 @@ dyn_cast(SE->getMinusSCEV(ScevN, Scev0)); uint64_t Size = DL->getTypeAllocSize(ScalarTy); // Check that the sorted loads are consecutive. - if (Diff && Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size) { + if (Diff && + ((NumberOfInstructions < VL.size() && + Diff->getAPInt().getZExtValue() <= (VL.size() - 1) * Size) || + (NumberOfInstructions == VL.size() && + Diff->getAPInt().getZExtValue() == (VL.size() - 1) * Size))) { if (CurrentOrder.empty()) { // Original loads are consecutive and does not require reordering. ++NumOpsWantToKeepOriginalOrder; TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); } else { + CurrentOrder.append(VL.size() - NumberOfInstructions, + VL.size() + 1); // Need to reorder. auto I = NumOpsWantToKeepOrder.try_emplace(CurrentOrder).first; ++I->getSecond(); TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, UserTreeIdx, ReuseShuffleIndicies, I->getFirst()); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n"); } return; @@ -2385,8 +2462,10 @@ case Instruction::FPTrunc: case Instruction::BitCast: { Type *SrcTy = VL0->getOperand(0)->getType(); - for (unsigned i = 0; i < VL.size(); ++i) { - Type *Ty = cast(VL[i])->getOperand(0)->getType(); + for (Value *V : VL) { + if (isa(V)) + continue; + Type *Ty = cast(V)->getOperand(0)->getType(); if (Ty != SrcTy || !isValidElementType(Ty)) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx, @@ -2400,12 +2479,15 @@ ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n"); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *V : VL) { + Operands.push_back(isa(V) + ? UndefValue::get(SrcTy) + : cast(V)->getOperand(i)); + } buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2417,8 +2499,10 @@ CmpInst::Predicate P0 = cast(VL0)->getPredicate(); CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0); Type *ComparedTy = VL0->getOperand(0)->getType(); - for (unsigned i = 1, e = VL.size(); i < e; ++i) { - CmpInst *Cmp = cast(VL[i]); + for (Value *V : VL) { + if (isa(V)) + continue; + auto *Cmp = cast(V); if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) || Cmp->getOperand(0)->getType() != ComparedTy) { BS.cancelScheduling(VL, VL0); @@ -2439,10 +2523,15 @@ // Commutative predicate - collect + sort operands of the instructions // so that each side is more likely to have the same opcode. assert(P0 == SwapP0 && "Commutative Predicate mismatch"); - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE); } else { // Collect operands - commute if it uses the swapped predicate. for (Value *V : VL) { + if (isa(V)) { + Left.push_back(UndefValue::get(VL0->getOperand(0)->getType())); + Right.push_back(UndefValue::get(VL0->getOperand(1)->getType())); + continue; + } auto *Cmp = cast(V); Value *LHS = Cmp->getOperand(0); Value *RHS = Cmp->getOperand(1); @@ -2486,7 +2575,7 @@ // have the same opcode. if (isa(VL0) && VL0->isCommutative()) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -2494,12 +2583,16 @@ return; } - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *V : VL) { + Operands.push_back( + isa(V) + ? UndefValue::get(VL0->getOperand(i)->getType()) + : cast(V)->getOperand(i)); + } buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2507,8 +2600,10 @@ } case Instruction::GetElementPtr: { // We don't combine GEPs with complicated (nested) indexing. - for (unsigned j = 0; j < VL.size(); ++j) { - if (cast(VL[j])->getNumOperands() != 2) { + for (Value *V : VL) { + if (isa(V)) + continue; + if (cast(V)->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n"); BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx, @@ -2520,8 +2615,10 @@ // We can't combine several GEPs into one vector if they operate on // different types. Type *Ty0 = VL0->getOperand(0)->getType(); - for (unsigned j = 0; j < VL.size(); ++j) { - Type *CurTy = cast(VL[j])->getOperand(0)->getType(); + for (Value *V : VL) { + if (isa(V)) + continue; + Type *CurTy = cast(V)->getOperand(0)->getType(); if (Ty0 != CurTy) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n"); @@ -2533,8 +2630,10 @@ } // We don't combine GEPs with non-constant indexes. - for (unsigned j = 0; j < VL.size(); ++j) { - auto Op = cast(VL[j])->getOperand(1); + for (Value *V : VL) { + if (isa(V)) + continue; + auto *Op = cast(V)->getOperand(1); if (!isa(Op)) { LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n"); @@ -2548,12 +2647,16 @@ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n"); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = 2; i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *V : VL) { + Operands.push_back( + isa(V) + ? UndefValue::get(VL0->getOperand(i)->getType()) + : cast(V)->getOperand(i)); + } buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2561,23 +2664,38 @@ } case Instruction::Store: { // Check if the stores are consecutive or of we need to swizzle them. - for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) - if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) { + for (auto I = InstructionsOnly.begin(), E = InstructionsOnly.end(); + std::next(I) != E; ++I) { + if (!isConsecutiveAccess(*I, *std::next(I), *DL, *SE)) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); return; } + } + if (NumberOfInstructions != VL.size() && + !(VL0->getOperand(0)->getType()->isIntegerTy() || + VL0->getOperand(0)->getType()->isFloatingPointTy() || + VL0->getOperand(0)->getType()->isPointerTy())) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n"); + return; + } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n"); ValueList Operands; - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(0)); - TE->setOperandsInOrder(); + for (Value *V : VL) { + Operands.push_back(isa(V) + ? UndefValue::get(VL0->getOperand(0)->getType()) + : cast(V)->getOperand(0)); + } + TE->setOperandsInOrder(VL0); buildTree_rec(Operands, Depth + 1, {TE, 0}); return; } @@ -2600,16 +2718,23 @@ for (unsigned j = 0; j != NumArgs; ++j) if (hasVectorInstrinsicScalarOpd(ID, j)) ScalarArgs[j] = CI->getArgOperand(j); - for (unsigned i = 1, e = VL.size(); i != e; ++i) { - CallInst *CI2 = dyn_cast(VL[i]); + for (Value *V : VL) { + if (isa(V)) { + BS.cancelScheduling(VL, VL0); + newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx, + ReuseShuffleIndicies); + LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n"); + return; + } + CallInst *CI2 = dyn_cast(V); if (!CI2 || CI2->getCalledFunction() != Int || getVectorIntrinsicIDForCall(CI2, TLI) != ID || !CI->hasIdenticalOperandBundleSchema(*CI2)) { BS.cancelScheduling(VL, VL0); newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx, ReuseShuffleIndicies); - LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i] - << "\n"); + LLVM_DEBUG(dbgs() + << "SLP: mismatched calls:" << *CI << "!=" << *V << "\n"); return; } // Some intrinsics have scalar arguments and should be same in order for @@ -2637,19 +2762,19 @@ newTreeEntry(VL, None /*not vectorized*/, UserTreeIdx, ReuseShuffleIndicies); LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" - << *CI << "!=" << *VL[i] << '\n'); + << *CI << "!=" << *V << '\n'); return; } } TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, UserTreeIdx, ReuseShuffleIndicies); - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) { - CallInst *CI2 = dyn_cast(j); + for (Value *V : VL) { + auto *CI2 = dyn_cast(V); Operands.push_back(CI2->getArgOperand(i)); } buildTree_rec(Operands, Depth + 1, {TE, i}); @@ -2673,7 +2798,7 @@ // Reorder operands if reordering would enable vectorization. if (isa(VL0)) { ValueList Left, Right; - reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE); + reorderInputsAccordingToOpcode(*VL0, VL, Left, Right, *DL, *SE); TE->setOperand(0, Left); TE->setOperand(1, Right); buildTree_rec(Left, Depth + 1, {TE, 0}); @@ -2681,12 +2806,15 @@ return; } - TE->setOperandsInOrder(); + TE->setOperandsInOrder(VL0); for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { ValueList Operands; // Prepare the operand vector. - for (Value *j : VL) - Operands.push_back(cast(j)->getOperand(i)); + for (Value *V : VL) + Operands.push_back( + isa(V) + ? UndefValue::get(VL0->getOperand(i)->getType()) + : cast(V)->getOperand(i)); buildTree_rec(Operands, Depth + 1, {TE, i}); } @@ -2753,9 +2881,6 @@ NElts = Vec->getType()->getVectorNumElements(); } - if (NElts != VL.size()) - return false; - // Check that all of the indices extract from the correct offset. bool ShouldKeepOrder = true; unsigned E = VL.size(); @@ -2765,8 +2890,10 @@ // consecutive access in the extract instructions, by checking that no // element of CurrentOrder still has value E + 1. CurrentOrder.assign(E, E + 1); - unsigned I = 0; - for (; I < E; ++I) { + unsigned I = 0, End = std::min(NElts, E); + for (; I < End; ++I) { + if (isa(VL[I])) + continue; auto *Inst = cast(VL[I]); if (Inst->getOperand(0) != Vec) break; @@ -2785,10 +2912,12 @@ CurrentOrder[I] = I; } } - if (I < E) { + if (I < End) { CurrentOrder.clear(); return false; } + if (NElts > E) + return false; return ShouldKeepOrder; } @@ -2803,18 +2932,29 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { ArrayRef VL = E->Scalars; + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); + const unsigned NumOfInstructions = + llvm::count_if(InstructionsOnly, [](Value *V) { return true; }); + Value *V0; Type *ScalarTy = VL[0]->getType(); - if (StoreInst *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (CmpInst *CI = dyn_cast(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); - - // If we have computed a smaller type for the expression, update VecTy so - // that the costs will be accurate. - if (MinBWs.count(VL[0])) - VecTy = VectorType::get( - IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size()); + VectorType *VecTy; + if (!llvm::empty(InstructionsOnly)) { + V0 = *InstructionsOnly.begin(); + if (StoreInst *SI = dyn_cast(V0)) + ScalarTy = SI->getValueOperand()->getType(); + else if (CmpInst *CI = dyn_cast(V0)) + ScalarTy = CI->getOperand(0)->getType(); + VecTy = VectorType::get(ScalarTy, VL.size()); + + // If we have computed a smaller type for the expression, update VecTy so + // that the costs will be accurate. + if (MinBWs.count(V0)) { + VecTy = VectorType::get( + IntegerType::get(F->getContext(), MinBWs[V0].first), VL.size()); + } + } else { + VecTy = VectorType::get(ScalarTy, VL.size()); + } unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size(); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -2832,10 +2972,15 @@ } if (getSameOpcode(VL).getOpcode() == Instruction::ExtractElement && allSameType(VL) && allSameBlock(VL)) { - Optional ShuffleKind = isShuffle(VL); - if (ShuffleKind.hasValue()) { - int Cost = TTI->getShuffleCost(ShuffleKind.getValue(), VecTy); - for (auto *V : VL) { + Optional ShuffleKind = + NumOfInstructions > 1 + ? isShuffle(llvm::to_vector<4>(InstructionsOnly)) + : None; + if (NumOfInstructions == 1 || ShuffleKind) { + int Cost = NumOfInstructions > 1 + ? TTI->getShuffleCost(*ShuffleKind, VecTy) + : 0; + for (Value *V : InstructionsOnly) { // If all users of instruction are going to be vectorized and this // instruction itself is not going to be vectorized, consider this // instruction as dead and remove its cost from the final cost of the @@ -2844,8 +2989,10 @@ !ScalarToTreeEntry.count(V)) { auto *IO = cast( cast(V)->getIndexOperand()); - Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, - IO->getZExtValue()); + Cost -= TTI->getVectorInstrCost( + Instruction::ExtractElement, + cast(V)->getVectorOperandType(), + IO->getZExtValue()); } } return ReuseShuffleCost + Cost; @@ -2863,16 +3010,19 @@ return 0; case Instruction::ExtractValue: - case Instruction::ExtractElement: + case Instruction::ExtractElement: { if (NeedToShuffleReuses) { unsigned Idx = 0; for (unsigned I : E->ReuseShuffleIndices) { + if (isa(VL[I])) + continue; if (ShuffleOrOp == Instruction::ExtractElement) { auto *IO = cast( cast(VL[I])->getIndexOperand()); Idx = IO->getZExtValue(); ReuseShuffleCost -= TTI->getVectorInstrCost( - Instruction::ExtractElement, VecTy, Idx); + Instruction::ExtractElement, + cast(VL[I])->getVectorOperandType(), Idx); } else { ReuseShuffleCost -= TTI->getVectorInstrCost( Instruction::ExtractElement, VecTy, Idx); @@ -2881,53 +3031,77 @@ } Idx = ReuseShuffleNumbers; for (Value *V : VL) { + if (isa(V)) + continue; if (ShuffleOrOp == Instruction::ExtractElement) { auto *IO = cast( cast(V)->getIndexOperand()); Idx = IO->getZExtValue(); + ReuseShuffleCost += TTI->getVectorInstrCost( + Instruction::ExtractElement, + cast(V)->getVectorOperandType(), Idx); } else { --Idx; + ReuseShuffleCost += TTI->getVectorInstrCost( + Instruction::ExtractElement, VecTy, Idx); } - ReuseShuffleCost += - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx); } } - if (!E->NeedToGather) { - int DeadCost = ReuseShuffleCost; - if (!E->ReorderIndices.empty()) { - // TODO: Merge this shuffle with the ReuseShuffleCost. - DeadCost += TTI->getShuffleCost( - TargetTransformInfo::SK_PermuteSingleSrc, VecTy); - } - for (unsigned i = 0, e = VL.size(); i < e; ++i) { - Instruction *E = cast(VL[i]); - // If all users are going to be vectorized, instruction can be - // considered as dead. - // The same, if have only one user, it will be vectorized for sure. - if (areAllUsersVectorized(E)) { - // Take credit for instruction that will become dead. - if (E->hasOneUse()) { - Instruction *Ext = E->user_back(); - if ((isa(Ext) || isa(Ext)) && - all_of(Ext->users(), - [](User *U) { return isa(U); })) { - // Use getExtractWithExtendCost() to calculate the cost of - // extractelement/ext pair. - DeadCost -= TTI->getExtractWithExtendCost( - Ext->getOpcode(), Ext->getType(), VecTy, i); - // Add back the cost of s|zext which is subtracted separately. - DeadCost += TTI->getCastInstrCost( - Ext->getOpcode(), Ext->getType(), E->getType(), Ext); - continue; - } +#ifndef NDEBUG + OrdersType CurrentOrder; + bool Reuse = canReuseExtract(VL, VL0, CurrentOrder); + assert(Reuse && E->ReorderIndices.empty() || + (!Reuse && CurrentOrder.size() == E->ReorderIndices.size() && + std::equal(CurrentOrder.begin(), CurrentOrder.end(), + E->ReorderIndices.begin())) && + "The sequence of extract elements must be reused or shuffled " + "with the same mask."); +#endif + int DeadCost = ReuseShuffleCost; + if (!E->ReorderIndices.empty()) { + // TODO: Merge this shuffle with the ReuseShuffleCost. + DeadCost += TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } + for (unsigned I = 0, E = VL.size(); I < E; ++I) { + if (isa(VL[I])) + continue; + auto *EI = cast(VL[I]); + // If all users are going to be vectorized, instruction can be + // considered as dead. + // The same, if have only one user, it will be vectorized for sure. + if (areAllUsersVectorized(EI)) { + // Take credit for instruction that will become dead. + if (EI->hasOneUse()) { + Instruction *Ext = EI->user_back(); + if ((isa(Ext) || isa(Ext)) && + all_of(Ext->users(), + [](User *U) { return isa(U); })) { + // Use getExtractWithExtendCost() to calculate the cost of + // extractelement/ext pair. + DeadCost -= TTI->getExtractWithExtendCost( + Ext->getOpcode(), Ext->getType(), VecTy, I); + // Add back the cost of s|zext which is subtracted separately. + DeadCost += TTI->getCastInstrCost( + Ext->getOpcode(), Ext->getType(), EI->getType(), Ext); + continue; } + } + if (ShuffleOrOp == Instruction::ExtractElement) { + auto *IO = cast( + cast(EI)->getIndexOperand()); + unsigned Idx = IO->getZExtValue(); + DeadCost -= TTI->getVectorInstrCost( + Instruction::ExtractElement, + cast(EI)->getVectorOperandType(), Idx); + } else { DeadCost -= - TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); + TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I); } } - return DeadCost; } - return ReuseShuffleCost + getGatherCost(VL); + return DeadCost; + } case Instruction::ZExt: case Instruction::SExt: @@ -2945,11 +3119,12 @@ int ScalarEltCost = TTI->getCastInstrCost(S.getOpcode(), ScalarTy, SrcTy, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } // Calculate the cost of this instruction. - int ScalarCost = VL.size() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); int VecCost = 0; @@ -2967,10 +3142,11 @@ int ScalarEltCost = TTI->getCmpSelInstrCost(S.getOpcode(), ScalarTy, Builder.getInt1Ty(), VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size()); - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; int VecCost = TTI->getCmpSelInstrCost(S.getOpcode(), VecTy, MaskTy, VL0); return ReuseShuffleCost + VecCost - ScalarCost; } @@ -3009,24 +3185,27 @@ // If instead not all operands are constants, then set the operand kind // to OK_AnyValue. If all operands are constants but not the same, // then set the operand kind to OK_NonUniformConstantValue. - ConstantInt *CInt0 = nullptr; + Constant *C0 = nullptr; for (unsigned i = 0, e = VL.size(); i < e; ++i) { + if (isa(VL[i])) + continue; const Instruction *I = cast(VL[i]); unsigned OpIdx = isa(I) ? 1 : 0; ConstantInt *CInt = dyn_cast(I->getOperand(OpIdx)); - if (!CInt) { + Constant *UV = dyn_cast(I->getOperand(OpIdx)); + if (!CInt && !UV) { Op2VK = TargetTransformInfo::OK_AnyValue; Op2VP = TargetTransformInfo::OP_None; break; } if (Op2VP == TargetTransformInfo::OP_PowerOf2 && - !CInt->getValue().isPowerOf2()) + (UV || !cast(CInt)->getValue().isPowerOf2())) Op2VP = TargetTransformInfo::OP_None; if (i == 0) { - CInt0 = CInt; + C0 = CInt ? CInt : UV; continue; } - if (CInt0 != CInt) + if (C0 != (CInt ? CInt : UV)) Op2VK = TargetTransformInfo::OK_NonUniformConstantValue; } @@ -3034,9 +3213,10 @@ int ScalarEltCost = TTI->getArithmeticInstrCost( S.getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; int VecCost = TTI->getArithmeticInstrCost(S.getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands); return ReuseShuffleCost + VecCost - ScalarCost; @@ -3050,9 +3230,10 @@ int ScalarEltCost = TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } - int ScalarCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCost = NumOfInstructions * ScalarEltCost; int VecCost = TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK); return ReuseShuffleCost + VecCost - ScalarCost; @@ -3063,12 +3244,33 @@ int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; + } + int ScalarLdCost = NumOfInstructions * ScalarEltCost; + int VecLdCost; + auto FirstInstr = llvm::find_if(VL, Instruction::classof); + auto LastInstr = llvm::find_if(llvm::reverse(VL), Instruction::classof); + unsigned InstrDist = std::distance(FirstInstr, LastInstr.base()); + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(InstrDist); + if (!IsPowOf2NumOfInstructions || InstrDist == 1) { + VecLdCost = TTI->getIntrinsicInstrCost( + Intrinsic::masked_load, VecTy, + {VecTy->getPointerTo(), Builder.getInt32Ty(), + VectorType::get(Builder.getInt1Ty(), + VecTy->getVectorNumElements()), + VecTy}, FastMathFlags()); + } else if (InstrDist == VL.size()) { + VecLdCost = + TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0); + } else { + VectorType *LoadVecTy = VectorType::get(ScalarTy, InstrDist); + VecLdCost = TTI->getMemoryOpCost(Instruction::Load, LoadVecTy, + alignment, 0, VL0) ; } - int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost; - int VecLdCost = - TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0); - if (!E->ReorderIndices.empty()) { + if (!NeedToShuffleReuses && + (!E->ReorderIndices.empty() || + (IsPowOf2NumOfInstructions && InstrDist != VL.size()))) { // TODO: Merge this shuffle with the ReuseShuffleCost. VecLdCost += TTI->getShuffleCost( TargetTransformInfo::SK_PermuteSingleSrc, VecTy); @@ -3081,11 +3283,22 @@ int ScalarEltCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; + } + int ScalarStCost = NumOfInstructions * ScalarEltCost; + int VecStCost; + if (NumOfInstructions != VL.size()) { + VecStCost = TTI->getIntrinsicInstrCost( + Intrinsic::masked_store, Builder.getVoidTy(), + {VecTy, VecTy->getPointerTo(), Builder.getInt32Ty(), + VectorType::get(Builder.getInt1Ty(), + VecTy->getVectorNumElements())}, + FastMathFlags()); + } else { + VecStCost = + TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0); } - int ScalarStCost = VecTy->getNumElements() * ScalarEltCost; - int VecStCost = - TTI->getMemoryOpCost(Instruction::Store, VecTy, alignment, 0, VL0); return ReuseShuffleCost + VecStCost - ScalarStCost; } case Instruction::Call: { @@ -3104,9 +3317,10 @@ int ScalarEltCost = TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF); if (NeedToShuffleReuses) { - ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost; + ReuseShuffleCost -= + (ReuseShuffleNumbers - NumOfInstructions) * ScalarEltCost; } - int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost; + int ScalarCallCost = NumOfInstructions * ScalarEltCost; SmallVector Args(CI->arg_operands()); int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF, @@ -3128,18 +3342,22 @@ int ScalarCost = 0; if (NeedToShuffleReuses) { for (unsigned Idx : E->ReuseShuffleIndices) { - Instruction *I = cast(VL[Idx]); + if (isa(VL[Idx])) + continue; + auto *I = cast(VL[Idx]); ReuseShuffleCost -= TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { Instruction *I = cast(V); ReuseShuffleCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); } } - for (Value *i : VL) { - Instruction *I = cast(i); + for (Value *V : VL) { + if (isa(V)) + continue; + auto *I = cast(V); assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode"); ScalarCost += TTI->getInstructionCost( I, TargetTransformInfo::TCK_RecipThroughput); @@ -3365,12 +3583,15 @@ return Cost; } -int BoUpSLP::getGatherCost(Type *Ty, - const DenseSet &ShuffledIndices) const { +int BoUpSLP::getGatherCost(Type *Ty, const DenseSet &ShuffledIndices, + const DenseSet &IgnoredIndices) const { int Cost = 0; - for (unsigned i = 0, e = cast(Ty)->getNumElements(); i < e; ++i) - if (!ShuffledIndices.count(i)) - Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + for (unsigned I = 0, E = cast(Ty)->getNumElements(); I < E; ++I) { + if (IgnoredIndices.count(I)) + continue; + if (!ShuffledIndices.count(I)) + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, I); + } if (!ShuffledIndices.empty()) Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty); return Cost; @@ -3386,25 +3607,30 @@ // Check if the same elements are inserted several times and count them as // shuffle candidates. DenseSet ShuffledElements; + DenseSet IgnoredElements; DenseSet UniqueElements; // Iterate in reverse order to consider insert elements with the high cost. for (unsigned I = VL.size(); I > 0; --I) { unsigned Idx = I - 1; + if (isa(VL[Idx])) { + IgnoredElements.insert(Idx); + continue; + } if (!UniqueElements.insert(VL[Idx]).second) ShuffledElements.insert(Idx); } - return getGatherCost(VecTy, ShuffledElements); + return getGatherCost(VecTy, ShuffledElements, IgnoredElements); } // Perform operand reordering on the instructions in VL and return the reordered // operands in Left and Right. void BoUpSLP::reorderInputsAccordingToOpcode( - ArrayRef VL, SmallVectorImpl &Left, + Instruction &VL0, ArrayRef VL, SmallVectorImpl &Left, SmallVectorImpl &Right, const DataLayout &DL, ScalarEvolution &SE) { if (VL.empty()) return; - VLOperands Ops(VL, DL, SE); + VLOperands Ops(VL0, VL, DL, SE); // Reorder the operands in place. Ops.reorder(); Left = Ops.getVL(0); @@ -3413,11 +3639,14 @@ void BoUpSLP::setInsertPointAfterBundle(ArrayRef VL, const InstructionsState &S) { + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); + if (llvm::empty(InstructionsOnly)) + return; // Get the basic block this bundle is in. All instructions in the bundle // should be in this block. auto *Front = cast(S.OpValue); auto *BB = Front->getParent(); - assert(llvm::all_of(make_range(VL.begin(), VL.end()), [=](Value *V) -> bool { + assert(llvm::all_of(InstructionsOnly, [=](Value *V) -> bool { auto *I = cast(V); return !S.isOpcodeOrAlt(I) || I->getParent() == BB; })); @@ -3430,8 +3659,8 @@ // VL.back() and iterate over schedule data until we reach the end of the // bundle. The end of the bundle is marked by null ScheduleData. if (BlocksSchedules.count(BB)) { - auto *Bundle = - BlocksSchedules[BB]->getScheduleData(isOneOf(S, VL.back())); + auto *Bundle = BlocksSchedules[BB]->getScheduleData( + isOneOf(S, *llvm::reverse(InstructionsOnly).begin())); if (Bundle && Bundle->isPartOfBundle()) for (; Bundle; Bundle = Bundle->NextInBundle) if (Bundle->OpValue == Bundle->Inst) @@ -3457,7 +3686,8 @@ // we both exit early from buildTree_rec and that the bundle be out-of-order // (causing us to iterate all the way to the end of the block). if (!LastInst) { - SmallPtrSet Bundle(VL.begin(), VL.end()); + SmallPtrSet Bundle(InstructionsOnly.begin(), + InstructionsOnly.end()); for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) { if (Bundle.erase(&I) && S.isOpcodeOrAlt(&I)) LastInst = &I; @@ -3476,6 +3706,8 @@ Value *Vec = UndefValue::get(Ty); // Generate the 'InsertElement' instruction. for (unsigned i = 0; i < Ty->getNumElements(); ++i) { + if (isa(VL[i])) + continue; Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); if (Instruction *Insrt = dyn_cast(Vec)) { GatherSeq.insert(Insrt); @@ -3509,27 +3741,27 @@ Value *BoUpSLP::vectorizeTree(ArrayRef VL) { InstructionsState S = getSameOpcode(VL); if (S.getOpcode()) { - if (TreeEntry *E = getTreeEntry(S.OpValue)) { - if (E->isSame(VL)) { - Value *V = vectorizeTree(E); - if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) { - // We need to get the vectorized value but without shuffle. - if (auto *SV = dyn_cast(V)) { - V = SV->getOperand(0); - } else { - // Reshuffle to get only unique values. - SmallVector UniqueIdxs; - SmallSet UsedIdxs; - for(unsigned Idx : E->ReuseShuffleIndices) - if (UsedIdxs.insert(Idx).second) - UniqueIdxs.emplace_back(Idx); - V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), - UniqueIdxs); - } - } - return V; + // Check that every instruction appears once in this bundle. + SmallVector UniqueValues; + DenseMap UniquePositions; + UniqueValues.reserve(VL.size()); + for (Value *V : VL) { + if (isa(V)) { + UniqueValues.emplace_back(V); + continue; } + auto Res = UniquePositions.try_emplace(V, UniqueValues.size()); + if (Res.second) + UniqueValues.emplace_back(V); + } + if (UniqueValues.size() != VL.size()) { + LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); + UniqueValues.append(VL.size() - UniqueValues.size(), + UndefValue::get(VL[0]->getType())); } + if (TreeEntry *E = getTreeEntry(S.OpValue)) + if (E->isSame(UniqueValues)) + return vectorizeTree(E); } Type *ScalarTy = S.OpValue->getType(); @@ -3573,9 +3805,10 @@ SmallVectorImpl &Mask) { Mask.clear(); const unsigned E = Indices.size(); - Mask.resize(E); + Mask.resize(E, E - 1); for (unsigned I = 0; I < E; ++I) - Mask[Indices[I]] = I; + if (Indices[I] < E) + Mask[Indices[I]] = I; } Value *BoUpSLP::vectorizeTree(TreeEntry *E) { @@ -3592,6 +3825,8 @@ if (StoreInst *SI = dyn_cast(VL0)) ScalarTy = SI->getValueOperand()->getType(); VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size()); + if (isa(VL0)) + return UndefValue::get(VecTy); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); @@ -3609,9 +3844,9 @@ E->VectorizedValue = V; return V; } - - unsigned ShuffleOrOp = S.isAltShuffle() ? - (unsigned) Instruction::ShuffleVector : S.getOpcode(); + auto InstructionsOnly = make_filter_range(E->Scalars, Instruction::classof); + unsigned ShuffleOrOp = + S.isAltShuffle() ? (unsigned)Instruction::ShuffleVector : S.getOpcode(); switch (ShuffleOrOp) { case Instruction::PHI: { PHINode *PH = dyn_cast(VL0); @@ -3650,72 +3885,44 @@ } case Instruction::ExtractElement: { - if (!E->NeedToGather) { - Value *V = E->getSingleOperand(0); - if (!E->ReorderIndices.empty()) { - OrdersType Mask; - inversePermutation(E->ReorderIndices, Mask); - Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask, - "reorder_shuffle"); - } - if (NeedToShuffleReuses) { - // TODO: Merge this shuffle with the ReorderShuffleMask. - if (E->ReorderIndices.empty()) - Builder.SetInsertPoint(VL0); - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - } - E->VectorizedValue = V; - return V; + Value *V = E->getSingleOperand(0); + if (!E->ReorderIndices.empty()) { + OrdersType Mask; + inversePermutation(E->ReorderIndices, Mask); + Builder.SetInsertPoint(VL0); + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask, + "reorder_shuffle"); } - setInsertPointAfterBundle(E->Scalars, S); - auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), + // TODO: Merge this shuffle with the ReorderShuffleMask. + if (E->ReorderIndices.empty()) + Builder.SetInsertPoint(VL0); + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } } E->VectorizedValue = V; return V; } case Instruction::ExtractValue: { - if (!E->NeedToGather) { - LoadInst *LI = cast(E->getSingleOperand(0)); - Builder.SetInsertPoint(LI); - PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); - Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); - LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment()); - Value *NewV = propagateMetadata(V, E->Scalars); - if (!E->ReorderIndices.empty()) { - OrdersType Mask; - inversePermutation(E->ReorderIndices, Mask); - NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask, - "reorder_shuffle"); - } - if (NeedToShuffleReuses) { - // TODO: Merge this shuffle with the ReorderShuffleMask. - NewV = Builder.CreateShuffleVector( - NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); - } - E->VectorizedValue = NewV; - return NewV; + auto *LI = cast(VL0->getOperand(0)); + Builder.SetInsertPoint(LI); + auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace()); + Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy); + LoadInst *V = Builder.CreateAlignedLoad(Ptr, LI->getAlignment()); + Value *NewV = propagateMetadata(V, llvm::to_vector<4>(InstructionsOnly)); + if (!E->ReorderIndices.empty()) { + OrdersType Mask; + inversePermutation(E->ReorderIndices, Mask); + NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask, + "reorder_shuffle"); } - setInsertPointAfterBundle(E->Scalars, S); - auto *V = Gather(E->Scalars, VecTy); if (NeedToShuffleReuses) { - V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), - E->ReuseShuffleIndices, "shuffle"); - if (auto *I = dyn_cast(V)) { - GatherSeq.insert(I); - CSEBlocks.insert(I->getParent()); - } + // TODO: Merge this shuffle with the ReorderShuffleMask. + NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), + E->ReuseShuffleIndices, "shuffle"); } - E->VectorizedValue = V; - return V; + E->VectorizedValue = NewV; + return NewV; } case Instruction::ZExt: case Instruction::SExt: @@ -3854,7 +4061,7 @@ static_cast(S.getOpcode()), LHS, RHS); propagateIRFlags(V, E->Scalars, VL0); if (auto *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); + V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly)); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -3879,8 +4086,21 @@ Type *ScalarLoadTy = LI->getType(); unsigned AS = LI->getPointerAddressSpace(); - Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), - VecTy->getPointerTo(AS)); + auto FirstInstr = llvm::find_if(E->Scalars, Instruction::classof); + auto LastInstr = + llvm::find_if(llvm::reverse(E->Scalars), Instruction::classof).base(); + unsigned NumOfInstructions = std::distance(FirstInstr, LastInstr); + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(NumOfInstructions); + Value *VecPtr; + if (!IsPowOf2NumOfInstructions || + NumOfInstructions == E->Scalars.size() || NumOfInstructions == 1) { + VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), + VecTy->getPointerTo(AS)); + } else { + VecPtr = Builder.CreateBitCast( + LI->getPointerOperand(), + VectorType::get(ScalarTy, NumOfInstructions)->getPointerTo(AS)); + } // The pointer operand uses an in-tree scalar so we add the new BitCast to // ExternalUses list to make sure that an extract will be generated in the @@ -3890,12 +4110,42 @@ ExternalUses.push_back(ExternalUser(PO, cast(VecPtr), 0)); unsigned Alignment = LI->getAlignment(); - LI = Builder.CreateLoad(VecTy, VecPtr); if (!Alignment) { Alignment = DL->getABITypeAlignment(ScalarLoadTy); } - LI->setAlignment(Alignment); - Value *V = propagateMetadata(LI, E->Scalars); + Instruction *VecLI; + if (IsPowOf2NumOfInstructions) { + LI = Builder.CreateLoad(VecPtr); + LI->setAlignment(Alignment); + VecLI = LI; + } else { + SmallVector Mask; + SmallVector Passthrough; + Mask.reserve(E->Scalars.size()); + Passthrough.reserve(E->Scalars.size()); + for (auto *V : E->Scalars) { + Mask.emplace_back(Builder.getInt1(!isa(V))); + Passthrough.emplace_back(isa(V) + ? cast(V) + : UndefValue::get(LI->getType())); + } + VecLI = Builder.CreateMaskedLoad(VecPtr, Alignment, + ConstantVector::get(Mask), + ConstantVector::get(Passthrough)); + } + Value *V = propagateMetadata(VecLI, llvm::to_vector<4>(InstructionsOnly)); + if (PowerOf2Ceil(NumOfInstructions) != E->Scalars.size()) { + SmallVector ExtendedIndices(E->Scalars.size(), + NumOfInstructions); + const unsigned Dist = std::distance(E->Scalars.begin(), FirstInstr); + auto LI = FirstInstr; + for (unsigned I = 0; I < NumOfInstructions; ++I, ++LI) { + if (!isa(*LI)) + ExtendedIndices[I] = Dist + I; + } + V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()), + ExtendedIndices, "load.extend"); + } if (IsReorder) { OrdersType Mask; inversePermutation(E->ReorderIndices, Mask); @@ -3915,13 +4165,55 @@ StoreInst *SI = cast(VL0); unsigned Alignment = SI->getAlignment(); unsigned AS = SI->getPointerAddressSpace(); + if (!Alignment) + Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); + setInsertPointAfterBundle(E->Scalars, S); Value *VecValue = vectorizeTree(E->getOperand(0)); Value *ScalarPtr = SI->getPointerOperand(); - Value *VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); - StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); + + auto FirstInstr = llvm::find_if(E->Scalars, Instruction::classof); + auto LastInstr = + llvm::find_if(llvm::reverse(E->Scalars), Instruction::classof).base(); + unsigned NumOfInstructions = std::distance(FirstInstr, LastInstr); + bool IsPowOf2NumOfInstructions = llvm::isPowerOf2_32(NumOfInstructions); + if (PowerOf2Ceil(NumOfInstructions) != E->Scalars.size()) { + SmallVector ExtendedIndices(NumOfInstructions, + NumOfInstructions); + const unsigned Dist = std::distance(E->Scalars.begin(), FirstInstr); + auto VI = FirstInstr; + for (unsigned I = 0; I < NumOfInstructions; ++I, ++VI) { + if (!isa(*VI)) + ExtendedIndices[I] = Dist + I; + } + VecValue = Builder.CreateShuffleVector( + VecValue, UndefValue::get(VecValue->getType()), ExtendedIndices, + "values.extend"); + } + Value *VecPtr; + if (!IsPowOf2NumOfInstructions || + NumOfInstructions == E->Scalars.size() || NumOfInstructions == 1) { + VecPtr = Builder.CreateBitCast(ScalarPtr, VecTy->getPointerTo(AS)); + } else { + VecPtr = Builder.CreateBitCast( + ScalarPtr, + VectorType::get(ScalarTy, NumOfInstructions)->getPointerTo(AS)); + } + Instruction *VecSI; + if (IsPowOf2NumOfInstructions) { + StoreInst *ST = Builder.CreateStore(VecValue, VecPtr); + ST->setAlignment(Alignment); + VecSI = ST; + } else { + SmallVector Mask; + Mask.reserve(E->Scalars.size()); + for (auto *V : E->Scalars) + Mask.emplace_back(Builder.getInt1(!isa(V))); + VecSI = Builder.CreateMaskedStore(VecValue, VecPtr, Alignment, + ConstantVector::get(Mask)); + } // The pointer operand uses an in-tree scalar, so add the new BitCast to // ExternalUses to make sure that an extract will be generated in the @@ -3929,11 +4221,7 @@ if (getTreeEntry(ScalarPtr)) ExternalUses.push_back(ExternalUser(ScalarPtr, cast(VecPtr), 0)); - if (!Alignment) - Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - - ST->setAlignment(Alignment); - Value *V = propagateMetadata(ST, E->Scalars); + Value *V = propagateMetadata(VecSI, llvm::to_vector<4>(InstructionsOnly)); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); @@ -3957,7 +4245,7 @@ Value *V = Builder.CreateGEP( cast(VL0)->getSourceElementType(), Op0, OpVecs); if (Instruction *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); + V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly)); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), @@ -4060,6 +4348,11 @@ unsigned e = E->Scalars.size(); SmallVector Mask(e); for (unsigned i = 0; i < e; ++i) { + if (isa(E->Scalars[i])) { + Mask[i] = Builder.getInt32(i); + OpScalars.push_back(E->Scalars[i]); + continue; + } auto *OpInst = cast(E->Scalars[i]); assert(S.isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode"); if (OpInst->getOpcode() == S.getAltOpcode()) { @@ -4077,7 +4370,7 @@ Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask); if (Instruction *I = dyn_cast(V)) - V = propagateMetadata(I, E->Scalars); + V = propagateMetadata(I, llvm::to_vector<4>(InstructionsOnly)); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle"); @@ -4227,6 +4520,8 @@ // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; + if (isa(Scalar)) + continue; Type *Ty = Scalar->getType(); if (!Ty->isVoidTy()) { @@ -4350,14 +4645,15 @@ bool ReSchedule = false; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n"); + auto InstructionsOnly = make_filter_range(VL, Instruction::classof); // Make sure that the scheduling region contains all // instructions of the bundle. - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { if (!extendSchedulingRegion(V, S)) return None; } - for (Value *V : VL) { + for (Value *V : InstructionsOnly) { ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member (maybe not in same basic block)"); @@ -4436,7 +4732,9 @@ LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n"); assert(!Bundle->IsScheduled && "Can't cancel bundle which is already scheduled"); - assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() && + assert(Bundle->isSchedulingEntity() && + (Bundle->isPartOfBundle() || + llvm::count_if(VL, Instruction::classof) == 1) && "tried to unbundle something which is not a bundle"); // Un-bundle: make single instructions out of the bundle. @@ -4741,7 +5039,9 @@ I = I->getNextNode()) { BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) { assert(SD->isPartOfBundle() == - (getTreeEntry(SD->Inst) != nullptr) && + (getTreeEntry(SD->Inst) != nullptr && + llvm::count_if(getTreeEntry(SD->Inst)->Scalars, + Instruction::classof) > 1) && "scheduler and vectorizer bundle mismatch"); SD->FirstInBundle->SchedulingPriority = Idx++; if (SD->isSchedulingEntity()) { @@ -5208,15 +5508,25 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, unsigned VecRegSize) { - const unsigned ChainLen = Chain.size(); + unsigned ChainLen = Chain.size(); LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen << "\n"); const unsigned Sz = R.getVectorElementSize(Chain[0]); const unsigned VF = VecRegSize / Sz; - if (!isPowerOf2_32(Sz) || VF < 2) + if (VF < 2 || ChainLen < 2) return false; + SmallVector FixedChain; + if (!isPowerOf2_32(ChainLen)) { + unsigned NewSize = PowerOf2Ceil(ChainLen); + FixedChain.reserve(NewSize); + FixedChain.append(Chain.begin(), Chain.end()); + FixedChain.append(NewSize - Chain.size(), + UndefValue::get(Chain[0]->getType())); + Chain = FixedChain; + ChainLen = NewSize; + } // Keep track of values that were deleted by vectorizing in the loop below. const SmallVector TrackValues(Chain.begin(), Chain.end()); @@ -5389,10 +5699,10 @@ return false; Instruction *I0 = cast(S.OpValue); - unsigned Sz = R.getVectorElementSize(I0); - unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz); - unsigned MaxVF = std::max(PowerOf2Floor(VL.size()), MinVF); - if (MaxVF < 2) { + unsigned Pow2VL = PowerOf2Ceil(VL.size()); + unsigned MinVF = 2; + unsigned MaxVF = Pow2VL; + if (MaxVF < 2 || VL.size() < 2) { R.getORE()->emit([&]() { return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0) << "Cannot SLP vectorize list: vectorization factor " @@ -5426,8 +5736,7 @@ SmallVector TrackValues(VL.begin(), VL.end()); unsigned NextInst = 0, MaxInst = VL.size(); - for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; - VF /= 2) { + for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) { // No actual vectorization should happen, if number of parts is the same as // provided vectorization factor (i.e. the scalar type is used for vector // code during codegen). @@ -5442,7 +5751,7 @@ else OpsWidth = VF; - if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2) + if (OpsWidth < 2 || (VF > MinVF && OpsWidth <= VF / 2)) break; // Check that a previous iteration of this loop did not delete the Value. @@ -5452,6 +5761,22 @@ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations " << "\n"); ArrayRef Ops = VL.slice(I, OpsWidth); + SmallVector FixedChain; + if (UserCost && OpsWidth != VF) { + unsigned NewSize = VF; + FixedChain.reserve(NewSize); + FixedChain.append(Ops.begin(), Ops.end()); + FixedChain.append(NewSize - Ops.size(), + UndefValue::get(Ops[0]->getType())); + Ops = FixedChain; + } else if (OpsWidth != VF && !AllowReorder) { + unsigned NewSize = VF; + FixedChain.reserve(NewSize); + FixedChain.append(Ops.begin(), Ops.end()); + FixedChain.append(NewSize - Ops.size(), + UndefValue::get(Ops[0]->getType())); + Ops = FixedChain; + } R.buildTree(Ops); Optional> Order = R.bestOrder(); @@ -5471,6 +5796,8 @@ R.computeMinimumValueSizes(); int Cost = R.getTreeCost() - UserCost; + LLVM_DEBUG(dbgs() << "SLP: User vectorization cost: " << -UserCost + << ".\n"); CandidateFound = true; MinCost = std::min(MinCost, Cost); @@ -5488,6 +5815,10 @@ NextInst = I + 1; Changed = true; } + if (UserCost && VF == MaxVF && I == 0) { + UserCost -= TTI->getShuffleCost( + TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + } } } @@ -6288,9 +6619,26 @@ Optional> Order = V.bestOrder(); // TODO: Handle orders of size less than number of elements in the vector. if (Order && Order->size() == VL.size()) { + SmallVector NewOrder(Order->begin(), Order->end()); + SmallVector UsedIndices(Order->size(), false); + unsigned BoundVal = NewOrder.size() + 1; + for (unsigned I : *Order) + if (I != BoundVal) + UsedIndices[I] = true; + unsigned Idx = 0, E = UsedIndices.size(); + for (unsigned &I : NewOrder) { + if (I == BoundVal) { + // Find first non-used index. + for (; Idx != E; ++Idx) + if (!UsedIndices[Idx]) + break; + // Set correct index. + I = Idx; ++Idx; + } + } // TODO: reorder tree nodes without tree rebuilding. SmallVector ReorderedOps(VL.size()); - llvm::transform(*Order, ReorderedOps.begin(), + llvm::transform(NewOrder, ReorderedOps.begin(), [VL](const unsigned Idx) { return VL[Idx]; }); V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList); } @@ -6372,6 +6720,15 @@ return VectorizedTree != nullptr; } + SmallVector getExtraArgs() const { + SmallVector Args(ExtraArgs.size()); + auto It = ExtraArgs.begin(); + for (unsigned I = 0, E = ExtraArgs.size(); I < E; ++I, ++It) { + Args[I] = It->second; + } + return Args; + } + unsigned numReductionValues() const { return ReducedVals.size(); } @@ -6510,7 +6867,9 @@ if (isa(V)) break; LastInsertElem = dyn_cast(V); - if (!LastInsertElem || !LastInsertElem->hasOneUse()) + if (!LastInsertElem) + break; + if (!LastInsertElem->hasOneUse()) return false; } while (true); std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end()); @@ -6645,6 +7004,15 @@ // Set P to nullptr to avoid re-analysis of phi node in // matchAssociativeReduction function unless this is the root node. P = nullptr; + // Try to vectorize ExtraArgs. + // Continue analysis for the instruction from the same basic block + // only to save compile time. + if (++Level < RecursionMaxDepth) + for (auto *Op : HorRdx.getExtraArgs()) + if (VisitedInstrs.insert(Op).second) + if (auto *I = dyn_cast(Op)) + if (!isa(I) && I->getParent() == BB) + Stack.emplace_back(Op, Level); continue; } } Index: test/Transforms/SLPVectorizer/AArch64/PR38339.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/PR38339.ll +++ test/Transforms/SLPVectorizer/AArch64/PR38339.ll @@ -3,15 +3,17 @@ define void @f1(<2 x i16> %x, i16* %a) { ; CHECK-LABEL: @f1( -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[X:%.*]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[X]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP1]], i16* [[A:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP2]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[A:%.*]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR3]] ; CHECK-NEXT: ret void ; %t2 = extractelement <2 x i16> %x, i32 0 @@ -35,15 +37,17 @@ ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: store i16 [[T2]], i16* [[A]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR3]] ; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] @@ -82,16 +86,17 @@ ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi i16* [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[REORDER_SHUFFLE]], <2 x i16> undef, <4 x i32> +; CHECK-NEXT: [[T2:%.*]] = extractelement <2 x i16> [[XX]], i32 0 +; CHECK-NEXT: [[T3:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: [[PTR0:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 0 ; CHECK-NEXT: [[PTR1:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 1 ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 2 ; CHECK-NEXT: [[PTR3:%.*]] = getelementptr inbounds [4 x i16], [4 x i16]* undef, i16 0, i16 3 -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i16> [[SHUFFLE]], i32 0 -; CHECK-NEXT: store i16 [[TMP0]], i16* [[A]] -; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[PTR0]] to <4 x i16>* -; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], <4 x i16>* [[TMP1]], align 2 +; CHECK-NEXT: store i16 [[T3]], i16* [[A]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR0]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR1]] +; CHECK-NEXT: store i16 [[T2]], i16* [[PTR2]] +; CHECK-NEXT: store i16 [[T3]], i16* [[PTR3]] ; CHECK-NEXT: [[A_VAL:%.*]] = load i16, i16* [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 ; CHECK-NEXT: br i1 [[CMP]], label [[CONT]], label [[EXIT:%.*]] Index: test/Transforms/SLPVectorizer/AArch64/gather-root.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/gather-root.ll +++ test/Transforms/SLPVectorizer/AArch64/gather-root.ll @@ -235,12 +235,10 @@ ; ; MAX-COST-LABEL: @PR32038( ; MAX-COST-NEXT: entry: -; MAX-COST-NEXT: [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1 -; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer -; MAX-COST-NEXT: [[P4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1 -; MAX-COST-NEXT: [[P5:%.*]] = icmp eq i8 [[P4]], 0 -; MAX-COST-NEXT: [[P6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4 -; MAX-COST-NEXT: [[P7:%.*]] = icmp eq i8 [[P6]], 0 +; MAX-COST-NEXT: [[P0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1 +; MAX-COST-NEXT: [[P1:%.*]] = icmp eq i8 [[P0]], 0 +; MAX-COST-NEXT: [[TMP0:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2) to <4 x i8>*), i32 2, <4 x i1> , <4 x i8> undef) +; MAX-COST-NEXT: [[TMP1:%.*]] = icmp eq <4 x i8> , [[TMP0]] ; MAX-COST-NEXT: [[P8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1 ; MAX-COST-NEXT: [[P9:%.*]] = icmp eq i8 [[P8]], 0 ; MAX-COST-NEXT: [[P10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2 @@ -249,16 +247,17 @@ ; MAX-COST-NEXT: [[P13:%.*]] = icmp eq i8 [[P12]], 0 ; MAX-COST-NEXT: [[P14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8 ; MAX-COST-NEXT: [[P15:%.*]] = icmp eq i8 [[P14]], 0 +; MAX-COST-NEXT: [[TMP2:%.*]] = insertelement <4 x i1> undef, i1 [[P1]], i32 0 ; MAX-COST-NEXT: br label [[FOR_BODY:%.*]] ; MAX-COST: for.body: ; MAX-COST-NEXT: [[P17:%.*]] = phi i32 [ [[P34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; MAX-COST-NEXT: [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0 -; MAX-COST-NEXT: [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0 -; MAX-COST-NEXT: [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1 -; MAX-COST-NEXT: [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1 -; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[P5]], i32 2 -; MAX-COST-NEXT: [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[P7]], i32 3 -; MAX-COST-NEXT: [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> , <4 x i32> +; MAX-COST-NEXT: [[TMP3:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0 +; MAX-COST-NEXT: [[TMP4:%.*]] = insertelement <4 x i1> [[TMP2]], i1 [[TMP3]], i32 1 +; MAX-COST-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1 +; MAX-COST-NEXT: [[TMP6:%.*]] = insertelement <4 x i1> [[TMP4]], i1 [[TMP5]], i32 2 +; MAX-COST-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 +; MAX-COST-NEXT: [[TMP8:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[TMP7]], i32 3 +; MAX-COST-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> , <4 x i32> ; MAX-COST-NEXT: [[P20:%.*]] = add i32 -5, undef ; MAX-COST-NEXT: [[P22:%.*]] = add i32 [[P20]], undef ; MAX-COST-NEXT: [[P24:%.*]] = add i32 [[P22]], undef @@ -266,10 +265,10 @@ ; MAX-COST-NEXT: [[P27:%.*]] = select i1 [[P9]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P28:%.*]] = add i32 [[P26]], [[P27]] ; MAX-COST-NEXT: [[P29:%.*]] = select i1 [[P11]], i32 -720, i32 -80 -; MAX-COST-NEXT: [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP8]]) -; MAX-COST-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], [[P27]] -; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P29]] -; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP11]], -5 +; MAX-COST-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; MAX-COST-NEXT: [[TMP11:%.*]] = add i32 [[TMP10]], [[P27]] +; MAX-COST-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], [[P29]] +; MAX-COST-NEXT: [[OP_EXTRA:%.*]] = add i32 [[TMP12]], -5 ; MAX-COST-NEXT: [[P30:%.*]] = add i32 [[P28]], [[P29]] ; MAX-COST-NEXT: [[P31:%.*]] = select i1 [[P13]], i32 -720, i32 -80 ; MAX-COST-NEXT: [[P32:%.*]] = add i32 [[OP_EXTRA]], [[P31]] Index: test/Transforms/SLPVectorizer/AArch64/transpose.ll =================================================================== --- test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -77,22 +77,22 @@ define <4 x i32> @build_vec_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] -; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP5]], [[TMP10]] -; CHECK-NEXT: ret <4 x i32> [[TMP11]] +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE3:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE4:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE3]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE1]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE5:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE6:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE5]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP2:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE4]], [[SHUFFLE6]] +; CHECK-NEXT: [[TMP5:%.*]] = sub <4 x i32> [[SHUFFLE4]], [[SHUFFLE6]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP3]], [[TMP6]] +; CHECK-NEXT: ret <4 x i32> [[TMP7]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 @@ -123,18 +123,20 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_0( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = sub <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 +; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1_0:%.*]] = sub i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1_1:%.*]] = sub i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] +; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] +; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0 +; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 +; CHECK-NEXT: [[TMP3_2:%.*]] = insertelement <4 x i32> [[TMP3_1]], i32 [[TMP2_0]], i32 2 +; CHECK-NEXT: [[TMP3_3:%.*]] = insertelement <4 x i32> [[TMP3_2]], i32 [[TMP2_1]], i32 3 ; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -198,21 +200,25 @@ define <4 x i32> @build_vec_v4i32_3_binops(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_3_binops( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> undef, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = mul <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP4]], <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = mul <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> [[TMP9]], <2 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = xor <2 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP12:%.*]] = xor <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP13:%.*]] = add <2 x i32> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i32> [[TMP11]], [[TMP12]] -; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <2 x i32> [[TMP13]], <2 x i32> [[TMP14]], <4 x i32> +; CHECK-NEXT: [[V0_0:%.*]] = extractelement <2 x i32> [[V0:%.*]], i32 0 +; CHECK-NEXT: [[V0_1:%.*]] = extractelement <2 x i32> [[V0]], i32 1 +; CHECK-NEXT: [[V1_0:%.*]] = extractelement <2 x i32> [[V1:%.*]], i32 0 +; CHECK-NEXT: [[V1_1:%.*]] = extractelement <2 x i32> [[V1]], i32 1 +; CHECK-NEXT: [[TMP0_0:%.*]] = add i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP0_1:%.*]] = add i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1_0:%.*]] = mul i32 [[V0_0]], [[V1_0]] +; CHECK-NEXT: [[TMP1_1:%.*]] = mul i32 [[V0_1]], [[V1_1]] +; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = xor <2 x i32> [[V0]], [[V1]] +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> undef, <2 x i32> +; CHECK-NEXT: [[TMP2_0:%.*]] = add i32 [[TMP0_0]], [[TMP0_1]] +; CHECK-NEXT: [[TMP2_1:%.*]] = add i32 [[TMP1_0]], [[TMP1_1]] +; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP3_0:%.*]] = insertelement <4 x i32> undef, i32 [[TMP2_0]], i32 0 +; CHECK-NEXT: [[TMP3_1:%.*]] = insertelement <4 x i32> [[TMP3_0]], i32 [[TMP2_1]], i32 1 +; CHECK-NEXT: [[TMP3_3:%.*]] = shufflevector <4 x i32> [[TMP3_1]], <4 x i32> [[TMP6]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP3_3]] ; %v0.0 = extractelement <2 x i32> %v0, i32 0 @@ -240,28 +246,28 @@ define i32 @reduction_v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: @reduction_v4i32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE1]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE3:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = sub <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i32> [[SHUFFLE2]], [[SHUFFLE3]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP5]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = lshr <4 x i32> [[TMP11]], -; CHECK-NEXT: [[TMP13:%.*]] = and <4 x i32> [[TMP12]], -; CHECK-NEXT: [[TMP14:%.*]] = mul nuw <4 x i32> [[TMP13]], -; CHECK-NEXT: [[TMP15:%.*]] = add <4 x i32> [[TMP14]], [[TMP11]] -; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i32> [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP17:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP16]]) -; CHECK-NEXT: ret i32 [[TMP17]] +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE3:%.*]] = shufflevector <4 x i32> [[V0]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE4:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE3]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x i32> [[V1:%.*]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE1]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE5:%.*]] = shufflevector <4 x i32> [[V1]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE6:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE5]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = sub <4 x i32> [[SHUFFLE]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[SHUFFLE]], [[SHUFFLE2]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[SHUFFLE4]], [[SHUFFLE6]] +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE4]], [[SHUFFLE6]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i32> [[TMP3]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]], +; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i32> [[TMP8]], +; CHECK-NEXT: [[TMP10:%.*]] = mul nuw <4 x i32> [[TMP9]], +; CHECK-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = xor <4 x i32> [[TMP11]], [[TMP10]] +; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP12]]) +; CHECK-NEXT: ret i32 [[TMP13]] ; %v0.0 = extractelement <4 x i32> %v0, i32 0 %v0.1 = extractelement <4 x i32> %v0, i32 1 Index: test/Transforms/SLPVectorizer/X86/PR32086.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR32086.ll +++ test/Transforms/SLPVectorizer/X86/PR32086.ll @@ -6,7 +6,8 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[LOAD_EXTEND]], <4 x i64> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 @@ -35,13 +36,14 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i64> [[LOAD_EXTEND]], <4 x i64> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[REORDER_SHUFFLE]], <4 x i64> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast i64* [[ST]] to <4 x i64>* -; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i64* [[ST]] to <4 x i64>* +; CHECK-NEXT: store <4 x i64> [[SHUFFLE]], <4 x i64>* [[TMP3]], align 8 ; CHECK-NEXT: ret void ; %arrayidx1 = getelementptr inbounds i64, i64* %ld, i64 1 @@ -65,7 +67,8 @@ ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i64, i64* [[LD:%.*]], i64 1 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64* [[LD]] to <2 x i64>* ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 8 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[LOAD_EXTEND]], <4 x i64> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i64, i64* [[ST:%.*]], i64 1 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 2 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i64, i64* [[ST]], i64 3 Index: test/Transforms/SLPVectorizer/X86/PR39774.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR39774.ll +++ test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -7,8 +7,8 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = phi <8 x i32> [ [[TMP15:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> undef, <8 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef @@ -88,17 +88,17 @@ ; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> [[TMP10]], <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> undef, i32 [[TMP12]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> undef, i32 [[TMP12]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP15]] = insertelement <8 x i32> [[TMP13]], i32 [[TMP14]], i32 1 ; CHECK-NEXT: br label [[LOOP]] ; ; FORCE_REDUCTION-LABEL: @Test( ; FORCE_REDUCTION-NEXT: entry: ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> +; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <4 x i32> [ [[TMP13:%.*]], [[LOOP]] ], [ , [[ENTRY:%.*]] ] +; FORCE_REDUCTION-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> ; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[SHUFFLE]], i32 1 ; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <4 x i32> [[SHUFFLE]], ; FORCE_REDUCTION-NEXT: [[VAL_1:%.*]] = and i32 [[TMP2]], undef @@ -174,13 +174,13 @@ ; FORCE_REDUCTION-NEXT: [[VAL_39:%.*]] = add i32 [[TMP2]], 12529 ; FORCE_REDUCTION-NEXT: [[VAL_40:%.*]] = and i32 [[OP_EXTRA29]], [[VAL_39]] ; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[TMP2]], 13685 -; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_40]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP2]], i32 1 -; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> undef, i32 [[VAL_41]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 14910, i32 1 -; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <2 x i32> [[TMP8]], [[TMP10]] -; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <2 x i32> [[TMP11]], <2 x i32> [[TMP12]], <2 x i32> +; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[VAL_40]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 14910, i32 1 +; FORCE_REDUCTION-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> undef, i32 [[VAL_41]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP2]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP11:%.*]] = and <4 x i32> [[TMP8]], [[TMP10]] +; FORCE_REDUCTION-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP8]], [[TMP10]] +; FORCE_REDUCTION-NEXT: [[TMP13]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], <4 x i32> ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: Index: test/Transforms/SLPVectorizer/X86/PR40310.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/PR40310.ll +++ test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -4,11 +4,11 @@ define void @mainTest(i32 %param, i32 * %vals, i32 %len) { ; CHECK-LABEL: @mainTest( ; CHECK-NEXT: bci_15.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[PARAM:%.*]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i32> , i32 [[PARAM:%.*]], i32 1 ; CHECK-NEXT: br label [[BCI_15:%.*]] ; CHECK: bci_15: -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = phi <16 x i32> [ [[TMP7:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER:%.*]] ] +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> undef, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <16 x i32> [[SHUFFLE]], i32 15 ; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[VALS:%.*]] unordered, align 4 @@ -40,8 +40,8 @@ ; CHECK-NEXT: [[OP_EXTRA:%.*]] = and i32 [[TMP5]], [[TMP2]] ; CHECK-NEXT: [[V43:%.*]] = and i32 undef, [[V42]] ; CHECK-NEXT: [[V44:%.*]] = add i32 [[TMP2]], 16 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i32> undef, i32 [[V44]], i32 0 -; CHECK-NEXT: [[TMP7]] = insertelement <2 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i32> undef, i32 [[V44]], i32 0 +; CHECK-NEXT: [[TMP7]] = insertelement <16 x i32> [[TMP6]], i32 [[OP_EXTRA]], i32 1 ; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]] ; CHECK: loopexit: ; CHECK-NEXT: ret void Index: test/Transforms/SLPVectorizer/X86/alternate-cast.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/alternate-cast.ll +++ test/Transforms/SLPVectorizer/X86/alternate-cast.ll @@ -8,26 +8,24 @@ define <8 x float> @sitofp_uitofp(<8 x i32> %a) { ; SSE-LABEL: @sitofp_uitofp( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> ; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SSE-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float -; SSE-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float -; SSE-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float +; SSE-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[REORDER_SHUFFLE]] to <4 x float> ; SSE-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float ; SSE-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float ; SSE-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float ; SSE-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 @@ -35,26 +33,24 @@ ; SSE-NEXT: ret <8 x float> [[R7]] ; ; SLM-LABEL: @sitofp_uitofp( -; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SLM-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> ; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 ; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; SLM-NEXT: [[AB0:%.*]] = sitofp i32 [[A0]] to float -; SLM-NEXT: [[AB1:%.*]] = sitofp i32 [[A1]] to float -; SLM-NEXT: [[AB2:%.*]] = sitofp i32 [[A2]] to float -; SLM-NEXT: [[AB3:%.*]] = sitofp i32 [[A3]] to float +; SLM-NEXT: [[TMP1:%.*]] = sitofp <4 x i32> [[REORDER_SHUFFLE]] to <4 x float> ; SLM-NEXT: [[AB4:%.*]] = uitofp i32 [[A4]] to float ; SLM-NEXT: [[AB5:%.*]] = uitofp i32 [[A5]] to float ; SLM-NEXT: [[AB6:%.*]] = uitofp i32 [[A6]] to float ; SLM-NEXT: [[AB7:%.*]] = uitofp i32 [[A7]] to float -; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[AB3]], i32 3 +; SLM-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[TMP2]], i32 0 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[TMP3]], i32 1 +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[TMP4]], i32 2 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP5]], i32 3 ; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[AB4]], i32 4 ; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 ; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 @@ -102,26 +98,24 @@ define <8 x i32> @fptosi_fptoui(<8 x float> %a) { ; SSE-LABEL: @fptosi_fptoui( -; SSE-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SSE-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SSE-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SSE-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> ; SSE-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 ; SSE-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SSE-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; SSE-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; SSE-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; SSE-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; SSE-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[REORDER_SHUFFLE]] to <4 x i32> ; SSE-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; SSE-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; SSE-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; SSE-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 +; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 +; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 ; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 @@ -129,26 +123,24 @@ ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @fptosi_fptoui( -; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; SLM-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; SLM-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> ; SLM-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 ; SLM-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; SLM-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; SLM-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; SLM-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; SLM-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; SLM-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; SLM-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; SLM-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[REORDER_SHUFFLE]] to <4 x i32> ; SLM-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; SLM-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; SLM-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; SLM-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SLM-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 +; SLM-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 +; SLM-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 ; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 @@ -156,26 +148,24 @@ ; SLM-NEXT: ret <8 x i32> [[R7]] ; ; AVX-LABEL: @fptosi_fptoui( -; AVX-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 -; AVX-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 -; AVX-NEXT: [[A3:%.*]] = extractelement <8 x float> [[A]], i32 3 +; AVX-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> ; AVX-NEXT: [[A4:%.*]] = extractelement <8 x float> [[A]], i32 4 ; AVX-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 ; AVX-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 ; AVX-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 -; AVX-NEXT: [[AB0:%.*]] = fptosi float [[A0]] to i32 -; AVX-NEXT: [[AB1:%.*]] = fptosi float [[A1]] to i32 -; AVX-NEXT: [[AB2:%.*]] = fptosi float [[A2]] to i32 -; AVX-NEXT: [[AB3:%.*]] = fptosi float [[A3]] to i32 +; AVX-NEXT: [[TMP1:%.*]] = fptosi <4 x float> [[REORDER_SHUFFLE]] to <4 x i32> ; AVX-NEXT: [[AB4:%.*]] = fptoui float [[A4]] to i32 ; AVX-NEXT: [[AB5:%.*]] = fptoui float [[A5]] to i32 ; AVX-NEXT: [[AB6:%.*]] = fptoui float [[A6]] to i32 ; AVX-NEXT: [[AB7:%.*]] = fptoui float [[A7]] to i32 -; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 -; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; AVX-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP2]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; AVX-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP3]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; AVX-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP4]], i32 2 +; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; AVX-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP5]], i32 3 ; AVX-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 ; AVX-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 Index: test/Transforms/SLPVectorizer/X86/alternate-fp.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/alternate-fp.ll +++ test/Transforms/SLPVectorizer/X86/alternate-fp.ll @@ -56,20 +56,35 @@ ; SSE-NEXT: ret <8 x float> [[R7]] ; ; SLM-LABEL: @fmul_fdiv_v8f32( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fmul <4 x float> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = fdiv <4 x float> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> undef, <4 x i32> -; SLM-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[TMP5]], [[TMP6]] -; SLM-NEXT: [[TMP8:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <8 x i32> -; SLM-NEXT: [[TMP9:%.*]] = fdiv <4 x float> [[TMP5]], [[TMP6]] -; SLM-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> undef, <8 x i32> -; SLM-NEXT: [[R3:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP3]], <8 x i32> -; SLM-NEXT: [[R4:%.*]] = shufflevector <8 x float> [[R3]], <8 x float> [[TMP8]], <8 x i32> -; SLM-NEXT: [[R6:%.*]] = shufflevector <8 x float> [[R4]], <8 x float> [[TMP10]], <8 x i32> -; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x float> [[R6]], <8 x float> [[TMP8]], <8 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x float> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x float> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x float> [[A]], i32 2 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x float> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i32 7 +; SLM-NEXT: [[B0:%.*]] = extractelement <8 x float> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <8 x float> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <8 x float> [[B]], i32 2 +; SLM-NEXT: [[B5:%.*]] = extractelement <8 x float> [[B]], i32 5 +; SLM-NEXT: [[B6:%.*]] = extractelement <8 x float> [[B]], i32 6 +; SLM-NEXT: [[B7:%.*]] = extractelement <8 x float> [[B]], i32 7 +; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], [[B0]] +; SLM-NEXT: [[AB1:%.*]] = fdiv float [[A1]], [[B1]] +; SLM-NEXT: [[AB2:%.*]] = fdiv float [[A2]], [[B2]] +; SLM-NEXT: [[TMP1:%.*]] = fmul <8 x float> [[A]], [[B]] +; SLM-NEXT: [[AB5:%.*]] = fdiv float [[A5]], [[B5]] +; SLM-NEXT: [[AB6:%.*]] = fdiv float [[A6]], [[B6]] +; SLM-NEXT: [[AB7:%.*]] = fmul float [[A7]], [[B7]] +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x float> undef, float [[AB0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x float> [[R0]], float [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x float> [[R1]], float [[AB2]], i32 2 +; SLM-NEXT: [[TMP2:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x float> [[R2]], float [[TMP2]], i32 3 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x float> [[R3]], float [[TMP3]], i32 4 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x float> [[R4]], float [[AB5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x float> [[R5]], float [[AB6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x float> [[R6]], float [[AB7]], i32 7 ; SLM-NEXT: ret <8 x float> [[R7]] ; ; AVX-LABEL: @fmul_fdiv_v8f32( @@ -125,14 +140,15 @@ ; SSE-NEXT: ret <4 x float> [[TMP1]] ; ; SLM-LABEL: @fmul_fdiv_v4f32_const( -; SLM-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; SLM-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> ; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 ; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00 +; SLM-NEXT: [[TMP1:%.*]] = fmul <2 x float> [[REORDER_SHUFFLE]], ; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00 -; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1 +; SLM-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 +; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 +; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[TMP3]], i32 1 ; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2 ; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3 ; SLM-NEXT: ret <4 x float> [[R3]] Index: test/Transforms/SLPVectorizer/X86/alternate-int.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/alternate-int.ll +++ test/Transforms/SLPVectorizer/X86/alternate-int.ll @@ -83,20 +83,17 @@ ; ; SLM-LABEL: @add_mul_v4i32( ; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i32> [[A:%.*]], i32 0 -; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i32> [[A]], i32 1 -; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i32> [[A]], i32 2 ; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i32> [[A]], i32 3 ; SLM-NEXT: [[B0:%.*]] = extractelement <4 x i32> [[B:%.*]], i32 0 -; SLM-NEXT: [[B1:%.*]] = extractelement <4 x i32> [[B]], i32 1 -; SLM-NEXT: [[B2:%.*]] = extractelement <4 x i32> [[B]], i32 2 ; SLM-NEXT: [[B3:%.*]] = extractelement <4 x i32> [[B]], i32 3 ; SLM-NEXT: [[AB0:%.*]] = mul i32 [[A0]], [[B0]] -; SLM-NEXT: [[AB1:%.*]] = add i32 [[A1]], [[B1]] -; SLM-NEXT: [[AB2:%.*]] = add i32 [[A2]], [[B2]] +; SLM-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[B]] ; SLM-NEXT: [[AB3:%.*]] = mul i32 [[A3]], [[B3]] ; SLM-NEXT: [[R0:%.*]] = insertelement <4 x i32> undef, i32 [[AB0]], i32 0 -; SLM-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[AB1]], i32 1 -; SLM-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SLM-NEXT: [[R1:%.*]] = insertelement <4 x i32> [[R0]], i32 [[TMP2]], i32 1 +; SLM-NEXT: [[TMP3:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SLM-NEXT: [[R2:%.*]] = insertelement <4 x i32> [[R1]], i32 [[TMP3]], i32 2 ; SLM-NEXT: [[R3:%.*]] = insertelement <4 x i32> [[R2]], i32 [[AB3]], i32 3 ; SLM-NEXT: ret <4 x i32> [[R3]] ; @@ -137,33 +134,83 @@ ; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 ; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 ; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; SSE-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; SSE-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 ; SSE-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 ; SSE-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; SSE-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 +; SSE-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 +; SSE-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 +; SSE-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SSE-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; SSE-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] ; SSE-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] ; SSE-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] -; SSE-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] +; SSE-NEXT: [[AB4:%.*]] = shl i32 [[A4]], [[B4]] +; SSE-NEXT: [[AB5:%.*]] = shl i32 [[A5]], [[B5]] +; SSE-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] +; SSE-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; SSE-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 ; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 ; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; SSE-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> +; SSE-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @ashr_shl_v8i32( -; SLM-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; SLM-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; SLM-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] +; SLM-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] +; SLM-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] +; SLM-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] +; SLM-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SLM-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[R7]] ; -; AVX-LABEL: @ashr_shl_v8i32( -; AVX-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] -; AVX-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] -; AVX-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> -; AVX-NEXT: ret <8 x i32> [[R7]] +; AVX1-LABEL: @ashr_shl_v8i32( +; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 +; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] +; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] +; AVX1-NEXT: [[AB2:%.*]] = ashr i32 [[A2]], [[B2]] +; AVX1-NEXT: [[AB3:%.*]] = ashr i32 [[A3]], [[B3]] +; AVX1-NEXT: [[TMP1:%.*]] = shl <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[R3]], <8 x i32> [[TMP1]], <8 x i32> +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @ashr_shl_v8i32( +; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] +; AVX2-NEXT: [[TMP2:%.*]] = shl <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[R7:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> +; AVX2-NEXT: ret <8 x i32> [[R7]] ; ; AVX512-LABEL: @ashr_shl_v8i32( ; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A:%.*]], [[B:%.*]] @@ -208,27 +255,27 @@ define <8 x i32> @ashr_shl_v8i32_const(<8 x i32> %a) { ; SSE-LABEL: @ashr_shl_v8i32_const( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SSE-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], -; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SSE-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[REORDER_SHUFFLE]], +; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], +; SSE-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> ; SSE-NEXT: ret <8 x i32> [[R7]] ; ; SLM-LABEL: @ashr_shl_v8i32_const( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; SLM-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; SLM-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; SLM-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], -; SLM-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; SLM-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[REORDER_SHUFFLE]], +; SLM-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; SLM-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], +; SLM-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> ; SLM-NEXT: ret <8 x i32> [[R7]] ; ; AVX1-LABEL: @ashr_shl_v8i32_const( -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = ashr <4 x i32> [[TMP1]], -; AVX1-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX1-NEXT: [[TMP4:%.*]] = shl <4 x i32> [[TMP3]], -; AVX1-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> +; AVX1-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP1:%.*]] = ashr <4 x i32> [[REORDER_SHUFFLE]], +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = shl <4 x i32> [[TMP2]], +; AVX1-NEXT: [[R7:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <8 x i32> ; AVX1-NEXT: ret <8 x i32> [[R7]] ; ; AVX2-LABEL: @ashr_shl_v8i32_const( @@ -309,27 +356,34 @@ ; SLM-LABEL: @ashr_lshr_shl_v8i32( ; SLM-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SLM-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; SLM-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; SLM-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; SLM-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 +; SLM-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; SLM-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; SLM-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; SLM-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; SLM-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; SLM-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] +; SLM-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] +; SLM-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] +; SLM-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; SLM-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; SLM-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; SLM-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; SLM-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; SLM-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 -; SLM-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 -; SLM-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 -; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 -; SLM-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 -; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SLM-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; SLM-NEXT: ret <8 x i32> [[R7]] @@ -337,27 +391,34 @@ ; AVX1-LABEL: @ashr_lshr_shl_v8i32( ; AVX1-NEXT: [[A0:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 0 ; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A]], i32 4 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 ; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 ; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX1-NEXT: [[B0:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 0 ; AVX1-NEXT: [[B1:%.*]] = extractelement <8 x i32> [[B]], i32 1 +; AVX1-NEXT: [[B2:%.*]] = extractelement <8 x i32> [[B]], i32 2 +; AVX1-NEXT: [[B3:%.*]] = extractelement <8 x i32> [[B]], i32 3 +; AVX1-NEXT: [[B4:%.*]] = extractelement <8 x i32> [[B]], i32 4 +; AVX1-NEXT: [[B5:%.*]] = extractelement <8 x i32> [[B]], i32 5 ; AVX1-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B]], i32 6 ; AVX1-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 ; AVX1-NEXT: [[AB0:%.*]] = ashr i32 [[A0]], [[B0]] ; AVX1-NEXT: [[AB1:%.*]] = ashr i32 [[A1]], [[B1]] -; AVX1-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX1-NEXT: [[AB2:%.*]] = lshr i32 [[A2]], [[B2]] +; AVX1-NEXT: [[AB3:%.*]] = lshr i32 [[A3]], [[B3]] +; AVX1-NEXT: [[AB4:%.*]] = lshr i32 [[A4]], [[B4]] +; AVX1-NEXT: [[AB5:%.*]] = lshr i32 [[A5]], [[B5]] ; AVX1-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX1-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] ; AVX1-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[AB0]], i32 0 ; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[AB1]], i32 1 -; AVX1-NEXT: [[TMP2:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 -; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP2]], i32 2 -; AVX1-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 -; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP3]], i32 3 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 -; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP4]], i32 4 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 -; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP5]], i32 5 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB4]], i32 4 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[AB5]], i32 5 ; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX1-NEXT: ret <8 x i32> [[R7]] @@ -367,25 +428,22 @@ ; AVX2-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX2-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 ; AVX2-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]] +; AVX2-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]] ; AVX2-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX2-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX2-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; AVX2-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX2-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX2-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX2-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 -; AVX2-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 +; AVX2-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; AVX2-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; AVX2-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; AVX2-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP4]], i32 1 +; AVX2-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 +; AVX2-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP5]], i32 2 +; AVX2-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 +; AVX2-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 +; AVX2-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 +; AVX2-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 4 +; AVX2-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 +; AVX2-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP8]], i32 5 ; AVX2-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX2-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX2-NEXT: ret <8 x i32> [[R7]] @@ -395,25 +453,22 @@ ; AVX512-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 ; AVX512-NEXT: [[B6:%.*]] = extractelement <8 x i32> [[B:%.*]], i32 6 ; AVX512-NEXT: [[B7:%.*]] = extractelement <8 x i32> [[B]], i32 7 -; AVX512-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> undef, <4 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = ashr <4 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[TMP4:%.*]] = lshr <4 x i32> [[TMP1]], [[TMP2]] -; AVX512-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP1:%.*]] = ashr <8 x i32> [[A]], [[B]] +; AVX512-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[A]], [[B]] ; AVX512-NEXT: [[AB6:%.*]] = shl i32 [[A6]], [[B6]] ; AVX512-NEXT: [[AB7:%.*]] = shl i32 [[A7]], [[B7]] -; AVX512-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[TMP3]], i32 0 -; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP6]], i32 0 -; AVX512-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[TMP3]], i32 1 -; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP7]], i32 1 -; AVX512-NEXT: [[TMP8:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2 -; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP8]], i32 2 -; AVX512-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3 -; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP9]], i32 3 -; AVX512-NEXT: [[TMP10:%.*]] = extractelement <8 x i32> [[TMP5]], i32 4 -; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP10]], i32 4 -; AVX512-NEXT: [[TMP11:%.*]] = extractelement <8 x i32> [[TMP5]], i32 5 -; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP11]], i32 5 +; AVX512-NEXT: [[TMP3:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; AVX512-NEXT: [[R0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0 +; AVX512-NEXT: [[TMP4:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; AVX512-NEXT: [[R1:%.*]] = insertelement <8 x i32> [[R0]], i32 [[TMP4]], i32 1 +; AVX512-NEXT: [[TMP5:%.*]] = extractelement <8 x i32> [[TMP2]], i32 2 +; AVX512-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[TMP5]], i32 2 +; AVX512-NEXT: [[TMP6:%.*]] = extractelement <8 x i32> [[TMP2]], i32 3 +; AVX512-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[TMP6]], i32 3 +; AVX512-NEXT: [[TMP7:%.*]] = extractelement <8 x i32> [[TMP2]], i32 4 +; AVX512-NEXT: [[R4:%.*]] = insertelement <8 x i32> [[R3]], i32 [[TMP7]], i32 4 +; AVX512-NEXT: [[TMP8:%.*]] = extractelement <8 x i32> [[TMP2]], i32 5 +; AVX512-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R4]], i32 [[TMP8]], i32 5 ; AVX512-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 ; AVX512-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 ; AVX512-NEXT: ret <8 x i32> [[R7]] @@ -486,26 +541,74 @@ } define <8 x i32> @sdiv_v8i32_undefs(<8 x i32> %a) { -; CHECK-LABEL: @sdiv_v8i32_undefs( -; CHECK-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 -; CHECK-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 -; CHECK-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 -; CHECK-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 -; CHECK-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 -; CHECK-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 -; CHECK-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 -; CHECK-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 -; CHECK-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 -; CHECK-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 -; CHECK-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 -; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 -; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 -; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 -; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 -; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 -; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 -; CHECK-NEXT: ret <8 x i32> [[R7]] +; SSE-LABEL: @sdiv_v8i32_undefs( +; SSE-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; SSE-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SSE-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SSE-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SSE-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SSE-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SSE-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; SSE-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; SSE-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SSE-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; SSE-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; SSE-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; SSE-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 +; SSE-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SSE-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SSE-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; SSE-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SSE-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SSE-NEXT: ret <8 x i32> [[R7]] +; +; SLM-LABEL: @sdiv_v8i32_undefs( +; SLM-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; SLM-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; SLM-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; SLM-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; SLM-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; SLM-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; SLM-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; SLM-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; SLM-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; SLM-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; SLM-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 +; SLM-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; SLM-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; SLM-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; SLM-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; SLM-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; SLM-NEXT: ret <8 x i32> [[R7]] +; +; AVX1-LABEL: @sdiv_v8i32_undefs( +; AVX1-NEXT: [[A1:%.*]] = extractelement <8 x i32> [[A:%.*]], i32 1 +; AVX1-NEXT: [[A2:%.*]] = extractelement <8 x i32> [[A]], i32 2 +; AVX1-NEXT: [[A3:%.*]] = extractelement <8 x i32> [[A]], i32 3 +; AVX1-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i32 5 +; AVX1-NEXT: [[A6:%.*]] = extractelement <8 x i32> [[A]], i32 6 +; AVX1-NEXT: [[A7:%.*]] = extractelement <8 x i32> [[A]], i32 7 +; AVX1-NEXT: [[AB1:%.*]] = sdiv i32 [[A1]], 4 +; AVX1-NEXT: [[AB2:%.*]] = sdiv i32 [[A2]], 8 +; AVX1-NEXT: [[AB3:%.*]] = sdiv i32 [[A3]], 16 +; AVX1-NEXT: [[AB5:%.*]] = sdiv i32 [[A5]], 4 +; AVX1-NEXT: [[AB6:%.*]] = sdiv i32 [[A6]], 8 +; AVX1-NEXT: [[AB7:%.*]] = sdiv i32 [[A7]], 16 +; AVX1-NEXT: [[R1:%.*]] = insertelement <8 x i32> undef, i32 [[AB1]], i32 1 +; AVX1-NEXT: [[R2:%.*]] = insertelement <8 x i32> [[R1]], i32 [[AB2]], i32 2 +; AVX1-NEXT: [[R3:%.*]] = insertelement <8 x i32> [[R2]], i32 [[AB3]], i32 3 +; AVX1-NEXT: [[R5:%.*]] = insertelement <8 x i32> [[R3]], i32 [[AB5]], i32 5 +; AVX1-NEXT: [[R6:%.*]] = insertelement <8 x i32> [[R5]], i32 [[AB6]], i32 6 +; AVX1-NEXT: [[R7:%.*]] = insertelement <8 x i32> [[R6]], i32 [[AB7]], i32 7 +; AVX1-NEXT: ret <8 x i32> [[R7]] +; +; AVX2-LABEL: @sdiv_v8i32_undefs( +; AVX2-NEXT: ret <8 x i32> undef +; +; AVX512-LABEL: @sdiv_v8i32_undefs( +; AVX512-NEXT: ret <8 x i32> undef ; %a0 = extractelement <8 x i32> %a, i32 0 %a1 = extractelement <8 x i32> %a, i32 1 Index: test/Transforms/SLPVectorizer/X86/cmp_commute.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/cmp_commute.ll +++ test/Transforms/SLPVectorizer/X86/cmp_commute.ll @@ -237,23 +237,22 @@ define <4 x i32> @fcmp_ord_uno_v4i32(<4 x float> %a, float* %b) { ; CHECK-LABEL: @fcmp_ord_uno_v4i32( ; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 ; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, float* [[B:%.*]], i64 1 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, float* [[B]], i64 2 ; CHECK-NEXT: [[P3:%.*]] = getelementptr inbounds float, float* [[B]], i64 3 ; CHECK-NEXT: [[B0:%.*]] = load float, float* [[B]], align 4 -; CHECK-NEXT: [[B1:%.*]] = load float, float* [[P1]], align 4 -; CHECK-NEXT: [[B2:%.*]] = load float, float* [[P2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[P1]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 ; CHECK-NEXT: [[B3:%.*]] = load float, float* [[P3]], align 4 ; CHECK-NEXT: [[C0:%.*]] = fcmp ord float [[A0]], [[B0]] -; CHECK-NEXT: [[C1:%.*]] = fcmp uno float [[B1]], [[A1]] -; CHECK-NEXT: [[C2:%.*]] = fcmp uno float [[B2]], [[A2]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <2 x float> [[TMP2]], [[TMP3]] ; CHECK-NEXT: [[C3:%.*]] = fcmp ord float [[A3]], [[B3]] ; CHECK-NEXT: [[D0:%.*]] = insertelement <4 x i1> undef, i1 [[C0]], i32 0 -; CHECK-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[C1]], i32 1 -; CHECK-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[C2]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0 +; CHECK-NEXT: [[D1:%.*]] = insertelement <4 x i1> [[D0]], i1 [[TMP5]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1 +; CHECK-NEXT: [[D2:%.*]] = insertelement <4 x i1> [[D1]], i1 [[TMP6]], i32 2 ; CHECK-NEXT: [[D3:%.*]] = insertelement <4 x i1> [[D2]], i1 [[C3]], i32 3 ; CHECK-NEXT: [[R:%.*]] = sext <4 x i1> [[D3]] to <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[R]] Index: test/Transforms/SLPVectorizer/X86/crash_cmpop.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_cmpop.ll +++ test/Transforms/SLPVectorizer/X86/crash_cmpop.ll @@ -55,35 +55,32 @@ ; AVX: for.body: ; AVX-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ACC1_056:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD13:%.*]], [[FOR_BODY]] ] -; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP23:%.*]], [[FOR_BODY]] ] +; AVX-NEXT: [[TMP0:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[FOR_BODY]] ] ; AVX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX]], align 4 ; AVX-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[DEST:%.*]], i64 [[INDVARS_IV]] ; AVX-NEXT: store float [[ACC1_056]], float* [[ARRAYIDX2]], align 4 -; AVX-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP0]], i32 1 -; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> undef, float [[TMP2]], i32 0 -; AVX-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP0]], i32 0 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP3]], float [[TMP4]], i32 1 -; AVX-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 -; AVX-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP1]], i32 1 -; AVX-NEXT: [[TMP8:%.*]] = fadd <2 x float> [[TMP5]], [[TMP7]] -; AVX-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer -; AVX-NEXT: [[TMP10:%.*]] = fadd <2 x float> [[TMP9]], [[TMP8]] -; AVX-NEXT: [[TMP11:%.*]] = fcmp olt <2 x float> [[TMP10]], -; AVX-NEXT: [[TMP12:%.*]] = select <2 x i1> [[TMP11]], <2 x float> [[TMP10]], <2 x float> -; AVX-NEXT: [[TMP13:%.*]] = fcmp olt <2 x float> [[TMP12]], -; AVX-NEXT: [[TMP14:%.*]] = fmul <2 x float> [[TMP12]], zeroinitializer -; AVX-NEXT: [[TMP15:%.*]] = select <2 x i1> [[TMP13]], <2 x float> , <2 x float> [[TMP14]] -; AVX-NEXT: [[TMP16:%.*]] = extractelement <2 x float> [[TMP15]], i32 0 -; AVX-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP15]], i32 1 -; AVX-NEXT: [[ADD13]] = fadd float [[TMP16]], [[TMP17]] -; AVX-NEXT: [[TMP18:%.*]] = insertelement <2 x float> undef, float [[TMP17]], i32 0 -; AVX-NEXT: [[TMP19:%.*]] = insertelement <2 x float> [[TMP18]], float [[ADD13]], i32 1 -; AVX-NEXT: [[TMP20:%.*]] = fcmp olt <2 x float> [[TMP19]], -; AVX-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP20]], <2 x float> [[TMP19]], <2 x float> -; AVX-NEXT: [[TMP22:%.*]] = fcmp olt <2 x float> [[TMP21]], -; AVX-NEXT: [[TMP23]] = select <2 x i1> [[TMP22]], <2 x float> , <2 x float> [[TMP21]] +; AVX-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> undef, <2 x i32> +; AVX-NEXT: [[TMP2:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0 +; AVX-NEXT: [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1 +; AVX-NEXT: [[TMP4:%.*]] = fadd <2 x float> [[REORDER_SHUFFLE]], [[TMP3]] +; AVX-NEXT: [[TMP5:%.*]] = fmul <2 x float> [[TMP0]], zeroinitializer +; AVX-NEXT: [[TMP6:%.*]] = fadd <2 x float> [[TMP5]], [[TMP4]] +; AVX-NEXT: [[TMP7:%.*]] = fcmp olt <2 x float> [[TMP6]], +; AVX-NEXT: [[TMP8:%.*]] = select <2 x i1> [[TMP7]], <2 x float> [[TMP6]], <2 x float> +; AVX-NEXT: [[TMP9:%.*]] = fcmp olt <2 x float> [[TMP8]], +; AVX-NEXT: [[TMP10:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer +; AVX-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP9]], <2 x float> , <2 x float> [[TMP10]] +; AVX-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; AVX-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; AVX-NEXT: [[ADD13]] = fadd float [[TMP12]], [[TMP13]] +; AVX-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[TMP13]], i32 0 +; AVX-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[ADD13]], i32 1 +; AVX-NEXT: [[TMP16:%.*]] = fcmp olt <2 x float> [[TMP15]], +; AVX-NEXT: [[TMP17:%.*]] = select <2 x i1> [[TMP16]], <2 x float> [[TMP15]], <2 x float> +; AVX-NEXT: [[TMP18:%.*]] = fcmp olt <2 x float> [[TMP17]], +; AVX-NEXT: [[TMP19]] = select <2 x i1> [[TMP18]], <2 x float> , <2 x float> [[TMP17]] ; AVX-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 32 ; AVX-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; AVX: for.end: Index: test/Transforms/SLPVectorizer/X86/crash_lencod.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_lencod.ll +++ test/Transforms/SLPVectorizer/X86/crash_lencod.ll @@ -131,10 +131,9 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double undef, i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP5]], <2 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast double* [[ARRAYIDX44]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP4]], <2 x double>* [[TMP5]], align 8 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll +++ test/Transforms/SLPVectorizer/X86/crash_reordering_undefs.ll @@ -4,25 +4,15 @@ define i32 @crash_reordering_undefs() { ; CHECK-LABEL: @crash_reordering_undefs( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[OR0:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i64 undef, [[OR0]] -; CHECK-NEXT: [[ADD0:%.*]] = select i1 [[CMP0]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD1:%.*]] = add i32 undef, [[ADD0]] -; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i64 undef, undef -; CHECK-NEXT: [[ADD2:%.*]] = select i1 [[CMP1]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD1]], [[ADD2]] -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i64 undef, undef -; CHECK-NEXT: [[ADD4:%.*]] = select i1 [[CMP2]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD5:%.*]] = add i32 [[ADD3]], [[ADD4]] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 undef, undef +; CHECK-NEXT: [[ADD3:%.*]] = add i32 [[ADD1]], undef +; CHECK-NEXT: [[ADD5:%.*]] = add i32 [[ADD3]], undef ; CHECK-NEXT: [[ADD6:%.*]] = add i32 [[ADD5]], undef ; CHECK-NEXT: [[ADD7:%.*]] = add i32 [[ADD6]], undef ; CHECK-NEXT: [[ADD8:%.*]] = add i32 [[ADD7]], undef -; CHECK-NEXT: [[OR1:%.*]] = or i64 undef, undef -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i64 undef, [[OR1]] -; CHECK-NEXT: [[ADD9:%.*]] = select i1 [[CMP3]], i32 65536, i32 65537 -; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[ADD8]], [[ADD9]] +; CHECK-NEXT: [[ADD10:%.*]] = add i32 [[ADD8]], undef ; CHECK-NEXT: [[ADD11:%.*]] = add i32 [[ADD10]], undef -; CHECK-NEXT: ret i32 [[ADD11]] +; CHECK-NEXT: ret i32 undef ; entry: %or0 = or i64 undef, undef Index: test/Transforms/SLPVectorizer/X86/crash_smallpt.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/crash_smallpt.ll +++ test/Transforms/SLPVectorizer/X86/crash_smallpt.ll @@ -31,8 +31,8 @@ ; CHECK: cond.false66.us: ; CHECK-NEXT: [[ADD_I276_US:%.*]] = fadd double 0.000000e+00, undef ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[ADD_I276_US]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double undef, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double 0xBFA5CC2D1960285F, i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> , [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = fmul <2 x double> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP3]], ; CHECK-NEXT: [[TMP5:%.*]] = fmul <2 x double> undef, [[TMP2]] Index: test/Transforms/SLPVectorizer/X86/cse.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/cse.ll +++ test/Transforms/SLPVectorizer/X86/cse.ll @@ -18,21 +18,16 @@ ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast double* [[ARRAYIDX]] to <2 x double>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, <2 x double>* [[TMP0]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP2]], +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> undef, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[LOAD_EXTEND]], <4 x double> undef, <4 x i32> ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, double* [[G]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[TMP4]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 ; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 -; CHECK-NEXT: [[MUL11:%.*]] = fmul double [[TMP6]], 4.000000e+00 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> undef, double [[TMP5]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[MUL11]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP8]], +; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> , [[SHUFFLE]] +; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[SHUFFLE1]], ; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds double, double* [[G]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[ARRAYIDX9]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP9]], <2 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = bitcast double* [[G]] to <4 x double>* +; CHECK-NEXT: store <4 x double> [[TMP3]], <4 x double>* [[TMP4]], align 8 ; CHECK-NEXT: ret i32 undef ; entry: @@ -131,7 +126,8 @@ ; CHECK-NEXT: [[TMP3:%.*]] = load double, double* [[TMP2]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fmul double [[TMP3]], 4.000000e+00 ; CHECK-NEXT: br i1 [[TMP1]], label [[TMP14:%.*]], label [[TMP5:%.*]] -; CHECK: [[TMP6:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 +; CHECK: 5: +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP7:%.*]] = load double, double* [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = fmul double [[TMP7]], 3.000000e+00 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x double> undef, double [[TMP4]], i32 0 @@ -141,7 +137,8 @@ ; CHECK-NEXT: [[TMP13:%.*]] = bitcast double* [[G]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP11]], <2 x double>* [[TMP13]], align 8 ; CHECK-NEXT: br label [[TMP24:%.*]] -; CHECK: [[TMP15:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 +; CHECK: 14: +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, double* [[G]], i64 2 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, double* [[G]], i64 6 ; CHECK-NEXT: [[TMP17:%.*]] = load double, double* [[TMP16]], align 8 ; CHECK-NEXT: [[TMP18:%.*]] = fmul double [[TMP17]], 3.000000e+00 @@ -152,7 +149,8 @@ ; CHECK-NEXT: [[TMP23:%.*]] = bitcast double* [[TMP15]] to <2 x double>* ; CHECK-NEXT: store <2 x double> [[TMP21]], <2 x double>* [[TMP23]], align 8 ; CHECK-NEXT: br label [[TMP24]] -; CHECK: ret i32 undef +; CHECK: 24: +; CHECK-NEXT: ret i32 undef ; %1 = icmp eq i32 %k, 0 %2 = getelementptr inbounds double, double* %G, i64 5 Index: test/Transforms/SLPVectorizer/X86/extract.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/extract.ll +++ test/Transforms/SLPVectorizer/X86/extract.ll @@ -54,14 +54,11 @@ ; CHECK-LABEL: @fextr2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[LD:%.*]] = load <4 x double>, <4 x double>* undef -; CHECK-NEXT: [[V0:%.*]] = extractelement <4 x double> [[LD]], i32 0 -; CHECK-NEXT: [[V1:%.*]] = extractelement <4 x double> [[LD]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x double> [[LD]], <4 x double> undef, <2 x i32> ; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds double, double* [[PTR:%.*]], i64 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> undef, double [[V0]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> [[TMP0]], double [[V1]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = bitcast double* [[P0]] to <2 x double>* -; CHECK-NEXT: store <2 x double> [[TMP2]], <2 x double>* [[TMP3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[REORDER_SHUFFLE]], +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[P0]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP0]], <2 x double>* [[TMP1]], align 4 ; CHECK-NEXT: ret void ; entry: Index: test/Transforms/SLPVectorizer/X86/hadd.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/hadd.ll +++ test/Transforms/SLPVectorizer/X86/hadd.ll @@ -202,13 +202,22 @@ ; SSE-NEXT: ret <4 x double> [[R03]] ; ; SLM-LABEL: @test_v4f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 +; SLM-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] +; SLM-NEXT: [[R1:%.*]] = fadd double [[B0]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = fadd double [[A2]], [[A3]] +; SLM-NEXT: [[R3:%.*]] = fadd double [[B2]], [[B3]] +; SLM-NEXT: [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0 +; SLM-NEXT: [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1 +; SLM-NEXT: [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2 +; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3 ; SLM-NEXT: ret <4 x double> [[R03]] ; ; AVX-LABEL: @test_v4f64( @@ -322,13 +331,22 @@ ; SSE-NEXT: ret <4 x i64> [[R03]] ; ; SLM-LABEL: @test_v4i64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = add <2 x i64> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i64> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i64> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i64> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i64> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <4 x i64> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <4 x i64> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <4 x i64> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <4 x i64> [[B]], i32 3 +; SLM-NEXT: [[R0:%.*]] = add i64 [[A0]], [[A1]] +; SLM-NEXT: [[R1:%.*]] = add i64 [[B0]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = add i64 [[A2]], [[A3]] +; SLM-NEXT: [[R3:%.*]] = add i64 [[B2]], [[B3]] +; SLM-NEXT: [[R00:%.*]] = insertelement <4 x i64> undef, i64 [[R0]], i32 0 +; SLM-NEXT: [[R01:%.*]] = insertelement <4 x i64> [[R00]], i64 [[R1]], i32 1 +; SLM-NEXT: [[R02:%.*]] = insertelement <4 x i64> [[R01]], i64 [[R2]], i32 2 +; SLM-NEXT: [[R03:%.*]] = insertelement <4 x i64> [[R02]], i64 [[R3]], i32 3 ; SLM-NEXT: ret <4 x i64> [[R03]] ; ; AVX-LABEL: @test_v4i64( @@ -432,13 +450,57 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i32 8 +; SSE-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i32 9 +; SSE-NEXT: [[A10:%.*]] = extractelement <16 x i16> [[A]], i32 10 +; SSE-NEXT: [[A11:%.*]] = extractelement <16 x i16> [[A]], i32 11 +; SSE-NEXT: [[A12:%.*]] = extractelement <16 x i16> [[A]], i32 12 +; SSE-NEXT: [[A13:%.*]] = extractelement <16 x i16> [[A]], i32 13 +; SSE-NEXT: [[A14:%.*]] = extractelement <16 x i16> [[A]], i32 14 +; SSE-NEXT: [[A15:%.*]] = extractelement <16 x i16> [[A]], i32 15 +; SSE-NEXT: [[B8:%.*]] = extractelement <16 x i16> [[B:%.*]], i32 8 +; SSE-NEXT: [[B9:%.*]] = extractelement <16 x i16> [[B]], i32 9 +; SSE-NEXT: [[B10:%.*]] = extractelement <16 x i16> [[B]], i32 10 +; SSE-NEXT: [[B11:%.*]] = extractelement <16 x i16> [[B]], i32 11 +; SSE-NEXT: [[B12:%.*]] = extractelement <16 x i16> [[B]], i32 12 +; SSE-NEXT: [[B13:%.*]] = extractelement <16 x i16> [[B]], i32 13 +; SSE-NEXT: [[B14:%.*]] = extractelement <16 x i16> [[B]], i32 14 +; SSE-NEXT: [[B15:%.*]] = extractelement <16 x i16> [[B]], i32 15 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = add <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[R8:%.*]] = add i16 [[A8]], [[A9]] +; SSE-NEXT: [[R9:%.*]] = add i16 [[A10]], [[A11]] +; SSE-NEXT: [[R10:%.*]] = add i16 [[A12]], [[A13]] +; SSE-NEXT: [[R11:%.*]] = add i16 [[A14]], [[A15]] +; SSE-NEXT: [[R12:%.*]] = add i16 [[B8]], [[B9]] +; SSE-NEXT: [[R13:%.*]] = add i16 [[B10]], [[B11]] +; SSE-NEXT: [[R14:%.*]] = add i16 [[B12]], [[B13]] +; SSE-NEXT: [[R15:%.*]] = add i16 [[B14]], [[B15]] +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[RV0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[RV1:%.*]] = insertelement <16 x i16> [[RV0]], i16 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[RV2:%.*]] = insertelement <16 x i16> [[RV1]], i16 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[RV3:%.*]] = insertelement <16 x i16> [[RV2]], i16 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[RV4:%.*]] = insertelement <16 x i16> [[RV3]], i16 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[TMP11]], i32 7 +; SSE-NEXT: [[RV8:%.*]] = insertelement <16 x i16> [[RV7]], i16 [[R8]], i32 8 +; SSE-NEXT: [[RV9:%.*]] = insertelement <16 x i16> [[RV8]], i16 [[R9]], i32 9 +; SSE-NEXT: [[RV10:%.*]] = insertelement <16 x i16> [[RV9]], i16 [[R10]], i32 10 +; SSE-NEXT: [[RV11:%.*]] = insertelement <16 x i16> [[RV10]], i16 [[R11]], i32 11 +; SSE-NEXT: [[RV12:%.*]] = insertelement <16 x i16> [[RV11]], i16 [[R12]], i32 12 +; SSE-NEXT: [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i32 13 +; SSE-NEXT: [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i32 14 +; SSE-NEXT: [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i32 15 ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( Index: test/Transforms/SLPVectorizer/X86/horizontal-list.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -23,44 +23,62 @@ ; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]] +; CHECK-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] +; CHECK-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] ; CHECK-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]] ; CHECK-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]] ; CHECK-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]] -; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]] -; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]] -; CHECK-NEXT: store float [[ADD19_3]], float* @res, align 4 -; CHECK-NEXT: ret float [[ADD19_3]] +; CHECK-NEXT: [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> undef, <2 x i32> +; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP8]], [[RDX_SHUF]] +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[BIN_RDX]], i32 0 +; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP9]], [[TMP5]] +; CHECK-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[TMP4]] +; CHECK-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], [[ADD_1]] +; CHECK-NEXT: [[OP_EXTRA3:%.*]] = fadd fast float [[OP_EXTRA2]], [[CONV]] +; CHECK-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]] +; CHECK-NEXT: store float [[OP_EXTRA3]], float* @res, align 4 +; CHECK-NEXT: ret float [[OP_EXTRA3]] ; ; THRESHOLD-LABEL: @baz( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[TMP0:%.*]] = load i32, i32* @n, align 4 ; THRESHOLD-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3 ; THRESHOLD-NEXT: [[CONV:%.*]] = sitofp i32 [[MUL]] to float -; THRESHOLD-NEXT: [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16 -; THRESHOLD-NEXT: [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]] -; THRESHOLD-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0 -; THRESHOLD-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]] -; THRESHOLD-NEXT: [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1 -; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]] -; THRESHOLD-NEXT: [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 -; THRESHOLD-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 -; THRESHOLD-NEXT: [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]] -; THRESHOLD-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0 -; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]] -; THRESHOLD-NEXT: [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1 -; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]] +; THRESHOLD-NEXT: [[TMP1:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16 +; THRESHOLD-NEXT: [[TMP2:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16 +; THRESHOLD-NEXT: [[MUL4:%.*]] = fmul fast float [[TMP2]], [[TMP1]] +; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 1), align 4 +; THRESHOLD-NEXT: [[TMP5:%.*]] = insertelement <2 x float> undef, float [[TMP4]], i32 0 +; THRESHOLD-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[MUL4]], i32 1 +; THRESHOLD-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[TMP3]], i32 0 +; THRESHOLD-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[CONV]], i32 1 +; THRESHOLD-NEXT: [[TMP9:%.*]] = fmul fast <2 x float> [[TMP6]], [[TMP8]] +; THRESHOLD-NEXT: [[TMP10:%.*]] = fadd fast <2 x float> [[TMP6]], [[TMP8]] +; THRESHOLD-NEXT: [[TMP11:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> [[TMP10]], <2 x i32> +; THRESHOLD-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 +; THRESHOLD-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 +; THRESHOLD-NEXT: [[ADD_1:%.*]] = fadd fast float [[TMP12]], [[TMP13]] +; THRESHOLD-NEXT: [[TMP14:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8 +; THRESHOLD-NEXT: [[TMP15:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8 +; THRESHOLD-NEXT: [[TMP16:%.*]] = fmul fast <2 x float> [[TMP15]], [[TMP14]] +; THRESHOLD-NEXT: [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]] +; THRESHOLD-NEXT: [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]] ; THRESHOLD-NEXT: [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]] -; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]] -; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]] -; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]] -; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]] -; THRESHOLD-NEXT: store float [[ADD19_3]], float* @res, align 4 -; THRESHOLD-NEXT: ret float [[ADD19_3]] +; THRESHOLD-NEXT: [[ADD19:%.*]] = fadd fast float [[MUL4]], [[ADD7]] +; THRESHOLD-NEXT: [[ADD19_1:%.*]] = fadd fast float [[TMP12]], [[ADD19]] +; THRESHOLD-NEXT: [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]] +; THRESHOLD-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> undef, <2 x i32> +; THRESHOLD-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x float> [[TMP16]], [[RDX_SHUF]] +; THRESHOLD-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[BIN_RDX]], i32 0 +; THRESHOLD-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[TMP17]], [[TMP12]] +; THRESHOLD-NEXT: [[OP_EXTRA1:%.*]] = fadd fast float [[OP_EXTRA]], [[MUL4]] +; THRESHOLD-NEXT: [[OP_EXTRA2:%.*]] = fadd fast float [[OP_EXTRA1]], [[ADD_1]] +; THRESHOLD-NEXT: [[OP_EXTRA3:%.*]] = fadd fast float [[OP_EXTRA2]], [[CONV]] +; THRESHOLD-NEXT: [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]] +; THRESHOLD-NEXT: store float [[OP_EXTRA3]], float* @res, align 4 +; THRESHOLD-NEXT: ret float [[OP_EXTRA3]] ; entry: %0 = load i32, i32* @n, align 4 Index: test/Transforms/SLPVectorizer/X86/hsub.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/hsub.ll +++ test/Transforms/SLPVectorizer/X86/hsub.ll @@ -202,13 +202,22 @@ ; SSE-NEXT: ret <4 x double> [[R03]] ; ; SLM-LABEL: @test_v4f64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = fsub <2 x double> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP6]], <4 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3 +; SLM-NEXT: [[R0:%.*]] = fsub double [[A0]], [[A1]] +; SLM-NEXT: [[R1:%.*]] = fsub double [[B0]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = fsub double [[A2]], [[A3]] +; SLM-NEXT: [[R3:%.*]] = fsub double [[B2]], [[B3]] +; SLM-NEXT: [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i32 0 +; SLM-NEXT: [[R01:%.*]] = insertelement <4 x double> [[R00]], double [[R1]], i32 1 +; SLM-NEXT: [[R02:%.*]] = insertelement <4 x double> [[R01]], double [[R2]], i32 2 +; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[R02]], double [[R3]], i32 3 ; SLM-NEXT: ret <4 x double> [[R03]] ; ; AVX-LABEL: @test_v4f64( @@ -322,13 +331,22 @@ ; SSE-NEXT: ret <4 x i64> [[R03]] ; ; SLM-LABEL: @test_v4i64( -; SLM-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], <2 x i32> -; SLM-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP3:%.*]] = sub <2 x i64> [[TMP1]], [[TMP2]] -; SLM-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[A]], <4 x i64> [[B]], <2 x i32> -; SLM-NEXT: [[TMP6:%.*]] = sub <2 x i64> [[TMP4]], [[TMP5]] -; SLM-NEXT: [[R03:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP6]], <4 x i32> +; SLM-NEXT: [[A0:%.*]] = extractelement <4 x i64> [[A:%.*]], i32 0 +; SLM-NEXT: [[A1:%.*]] = extractelement <4 x i64> [[A]], i32 1 +; SLM-NEXT: [[A2:%.*]] = extractelement <4 x i64> [[A]], i32 2 +; SLM-NEXT: [[A3:%.*]] = extractelement <4 x i64> [[A]], i32 3 +; SLM-NEXT: [[B0:%.*]] = extractelement <4 x i64> [[B:%.*]], i32 0 +; SLM-NEXT: [[B1:%.*]] = extractelement <4 x i64> [[B]], i32 1 +; SLM-NEXT: [[B2:%.*]] = extractelement <4 x i64> [[B]], i32 2 +; SLM-NEXT: [[B3:%.*]] = extractelement <4 x i64> [[B]], i32 3 +; SLM-NEXT: [[R0:%.*]] = sub i64 [[A0]], [[A1]] +; SLM-NEXT: [[R1:%.*]] = sub i64 [[B0]], [[B1]] +; SLM-NEXT: [[R2:%.*]] = sub i64 [[A2]], [[A3]] +; SLM-NEXT: [[R3:%.*]] = sub i64 [[B2]], [[B3]] +; SLM-NEXT: [[R00:%.*]] = insertelement <4 x i64> undef, i64 [[R0]], i32 0 +; SLM-NEXT: [[R01:%.*]] = insertelement <4 x i64> [[R00]], i64 [[R1]], i32 1 +; SLM-NEXT: [[R02:%.*]] = insertelement <4 x i64> [[R01]], i64 [[R2]], i32 2 +; SLM-NEXT: [[R03:%.*]] = insertelement <4 x i64> [[R02]], i64 [[R3]], i32 3 ; SLM-NEXT: ret <4 x i64> [[R03]] ; ; AVX-LABEL: @test_v4i64( @@ -432,13 +450,57 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { ; SSE-LABEL: @test_v16i16( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i32> +; SSE-NEXT: [[A8:%.*]] = extractelement <16 x i16> [[A:%.*]], i32 8 +; SSE-NEXT: [[A9:%.*]] = extractelement <16 x i16> [[A]], i32 9 +; SSE-NEXT: [[A10:%.*]] = extractelement <16 x i16> [[A]], i32 10 +; SSE-NEXT: [[A11:%.*]] = extractelement <16 x i16> [[A]], i32 11 +; SSE-NEXT: [[A12:%.*]] = extractelement <16 x i16> [[A]], i32 12 +; SSE-NEXT: [[A13:%.*]] = extractelement <16 x i16> [[A]], i32 13 +; SSE-NEXT: [[A14:%.*]] = extractelement <16 x i16> [[A]], i32 14 +; SSE-NEXT: [[A15:%.*]] = extractelement <16 x i16> [[A]], i32 15 +; SSE-NEXT: [[B8:%.*]] = extractelement <16 x i16> [[B:%.*]], i32 8 +; SSE-NEXT: [[B9:%.*]] = extractelement <16 x i16> [[B]], i32 9 +; SSE-NEXT: [[B10:%.*]] = extractelement <16 x i16> [[B]], i32 10 +; SSE-NEXT: [[B11:%.*]] = extractelement <16 x i16> [[B]], i32 11 +; SSE-NEXT: [[B12:%.*]] = extractelement <16 x i16> [[B]], i32 12 +; SSE-NEXT: [[B13:%.*]] = extractelement <16 x i16> [[B]], i32 13 +; SSE-NEXT: [[B14:%.*]] = extractelement <16 x i16> [[B]], i32 14 +; SSE-NEXT: [[B15:%.*]] = extractelement <16 x i16> [[B]], i32 15 +; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> ; SSE-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <8 x i32> -; SSE-NEXT: [[TMP6:%.*]] = sub <8 x i16> [[TMP4]], [[TMP5]] -; SSE-NEXT: [[RV15:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP6]], <16 x i32> +; SSE-NEXT: [[R8:%.*]] = sub i16 [[A8]], [[A9]] +; SSE-NEXT: [[R9:%.*]] = sub i16 [[A10]], [[A11]] +; SSE-NEXT: [[R10:%.*]] = sub i16 [[A12]], [[A13]] +; SSE-NEXT: [[R11:%.*]] = sub i16 [[A14]], [[A15]] +; SSE-NEXT: [[R12:%.*]] = sub i16 [[B8]], [[B9]] +; SSE-NEXT: [[R13:%.*]] = sub i16 [[B10]], [[B11]] +; SSE-NEXT: [[R14:%.*]] = sub i16 [[B12]], [[B13]] +; SSE-NEXT: [[R15:%.*]] = sub i16 [[B14]], [[B15]] +; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x i16> [[TMP3]], i32 0 +; SSE-NEXT: [[RV0:%.*]] = insertelement <16 x i16> undef, i16 [[TMP4]], i32 0 +; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x i16> [[TMP3]], i32 1 +; SSE-NEXT: [[RV1:%.*]] = insertelement <16 x i16> [[RV0]], i16 [[TMP5]], i32 1 +; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x i16> [[TMP3]], i32 2 +; SSE-NEXT: [[RV2:%.*]] = insertelement <16 x i16> [[RV1]], i16 [[TMP6]], i32 2 +; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3 +; SSE-NEXT: [[RV3:%.*]] = insertelement <16 x i16> [[RV2]], i16 [[TMP7]], i32 3 +; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP3]], i32 4 +; SSE-NEXT: [[RV4:%.*]] = insertelement <16 x i16> [[RV3]], i16 [[TMP8]], i32 4 +; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x i16> [[TMP3]], i32 5 +; SSE-NEXT: [[RV5:%.*]] = insertelement <16 x i16> [[RV4]], i16 [[TMP9]], i32 5 +; SSE-NEXT: [[TMP10:%.*]] = extractelement <8 x i16> [[TMP3]], i32 6 +; SSE-NEXT: [[RV6:%.*]] = insertelement <16 x i16> [[RV5]], i16 [[TMP10]], i32 6 +; SSE-NEXT: [[TMP11:%.*]] = extractelement <8 x i16> [[TMP3]], i32 7 +; SSE-NEXT: [[RV7:%.*]] = insertelement <16 x i16> [[RV6]], i16 [[TMP11]], i32 7 +; SSE-NEXT: [[RV8:%.*]] = insertelement <16 x i16> [[RV7]], i16 [[R8]], i32 8 +; SSE-NEXT: [[RV9:%.*]] = insertelement <16 x i16> [[RV8]], i16 [[R9]], i32 9 +; SSE-NEXT: [[RV10:%.*]] = insertelement <16 x i16> [[RV9]], i16 [[R10]], i32 10 +; SSE-NEXT: [[RV11:%.*]] = insertelement <16 x i16> [[RV10]], i16 [[R11]], i32 11 +; SSE-NEXT: [[RV12:%.*]] = insertelement <16 x i16> [[RV11]], i16 [[R12]], i32 12 +; SSE-NEXT: [[RV13:%.*]] = insertelement <16 x i16> [[RV12]], i16 [[R13]], i32 13 +; SSE-NEXT: [[RV14:%.*]] = insertelement <16 x i16> [[RV13]], i16 [[R14]], i32 14 +; SSE-NEXT: [[RV15:%.*]] = insertelement <16 x i16> [[RV14]], i16 [[R15]], i32 15 ; SSE-NEXT: ret <16 x i16> [[RV15]] ; ; SLM-LABEL: @test_v16i16( Index: test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +++ test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll @@ -291,42 +291,33 @@ ; Unused insertelement define <4 x float> @simple_select_no_users(<4 x float> %a, <4 x float> %b, <4 x i32> %c) #0 { ; CHECK-LABEL: @simple_select_no_users( -; CHECK-NEXT: [[C0:%.*]] = extractelement <4 x i32> [[C:%.*]], i32 0 -; CHECK-NEXT: [[C1:%.*]] = extractelement <4 x i32> [[C]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[C:%.*]], <4 x i32> undef, <2 x i32> ; CHECK-NEXT: [[C2:%.*]] = extractelement <4 x i32> [[C]], i32 2 ; CHECK-NEXT: [[C3:%.*]] = extractelement <4 x i32> [[C]], i32 3 -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> ; CHECK-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2 ; CHECK-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3 -; CHECK-NEXT: [[B0:%.*]] = extractelement <4 x float> [[B:%.*]], i32 0 -; CHECK-NEXT: [[B1:%.*]] = extractelement <4 x float> [[B]], i32 1 +; CHECK-NEXT: [[REORDER_SHUFFLE2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <2 x i32> ; CHECK-NEXT: [[B2:%.*]] = extractelement <4 x float> [[B]], i32 2 ; CHECK-NEXT: [[B3:%.*]] = extractelement <4 x float> [[B]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> undef, i32 [[C0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[C1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <2 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> [[TMP4]], i32 [[C3]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> undef, float [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> [[TMP7]], float [[A1]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> undef, float [[B0]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x float> [[TMP9]], float [[B1]], i32 1 -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP3]], <2 x float> [[TMP8]], <2 x float> [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <2 x float> [[TMP12]], float [[A3]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <2 x float> [[TMP14]], float [[B3]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP6]], <2 x float> [[TMP13]], <2 x float> [[TMP15]] -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x float> [[TMP11]], i32 0 -; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP17]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x float> [[TMP11]], i32 1 -; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP18]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x float> [[TMP16]], i32 0 -; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP19]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x float> [[TMP16]], i32 1 -; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP20]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = icmp ne <2 x i32> [[REORDER_SHUFFLE]], zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> undef, i32 [[C2]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[C3]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <2 x i32> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = select <2 x i1> [[TMP1]], <2 x float> [[REORDER_SHUFFLE1]], <2 x float> [[REORDER_SHUFFLE2]] +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> undef, float [[A2]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[A3]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x float> undef, float [[B2]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> [[TMP8]], float [[B3]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x float> [[TMP7]], <2 x float> [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP5]], i32 0 +; CHECK-NEXT: [[RA:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP5]], i32 1 +; CHECK-NEXT: [[RB:%.*]] = insertelement <4 x float> [[RA]], float [[TMP12]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x float> [[TMP10]], i32 0 +; CHECK-NEXT: [[RC:%.*]] = insertelement <4 x float> undef, float [[TMP13]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x float> [[TMP10]], i32 1 +; CHECK-NEXT: [[RD:%.*]] = insertelement <4 x float> [[RC]], float [[TMP14]], i32 3 ; CHECK-NEXT: ret <4 x float> [[RD]] ; ; ZEROTHRESH-LABEL: @simple_select_no_users( Index: test/Transforms/SLPVectorizer/X86/load-merge.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/load-merge.ll +++ test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -54,15 +54,16 @@ ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X:%.*]], i64 0, i64 0 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 1 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, <4 x float>* [[X]], i64 0, i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <2 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[X2:%.*]] = load float, float* [[GEP2]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[GEP0]] to <4 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP1]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 0 ; CHECK-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 1 ; CHECK-NEXT: [[I1:%.*]] = insertelement <4 x float> [[I0]], float [[TMP4]], i32 1 -; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[X2]], i32 2 -; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[SHUFFLE]], i32 2 +; CHECK-NEXT: [[I2:%.*]] = insertelement <4 x float> [[I1]], float [[TMP5]], i32 2 +; CHECK-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[TMP5]], i32 3 ; CHECK-NEXT: ret <4 x float> [[I3]] ; %gep0 = getelementptr inbounds <4 x float>, <4 x float>* %x, i64 0, i64 0 Index: test/Transforms/SLPVectorizer/X86/operandorder.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/operandorder.ll +++ test/Transforms/SLPVectorizer/X86/operandorder.ll @@ -266,32 +266,24 @@ ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* getelementptr inbounds ([32000 x float], [32000 x float]* @a, i32 0, i32 0), align 16 ; CHECK-NEXT: br label [[FOR_BODY3:%.*]] ; CHECK: for.body3: -; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP14:%.*]], [[FOR_BODY3]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_COND1_PREHEADER]] ], [ [[TMP12:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY3]] ] ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 1 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV]] to i32 -; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 4 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = bitcast float* [[ARRAYIDX]] to <4 x float>* -; CHECK-NEXT: [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP9]], <4 x float> [[TMP8]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <4 x float> [[TMP8]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[ARRAYIDX5]] to <4 x float>* -; CHECK-NEXT: store <4 x float> [[TMP11]], <4 x float>* [[TMP12]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5 -; CHECK-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], [32000 x float]* @a, i32 0, i32 [[TMP13]] -; CHECK-NEXT: [[TMP14]] = load float, float* [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x float> [[TMP8]], i32 3 -; CHECK-NEXT: [[MUL45:%.*]] = fmul float [[TMP14]], [[TMP15]] -; CHECK-NEXT: store float [[MUL45]], float* [[ARRAYIDX31]], align 4 -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP16]], 31995 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast float* [[ARRAYIDX]] to <8 x float>* +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* nonnull [[TMP5]], i32 4, <8 x i1> , <8 x float> undef) +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = fmul <8 x float> [[TMP6]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast float* [[ARRAYIDX5]] to <8 x float>* +; CHECK-NEXT: call void @llvm.masked.store.v8f32.p0v8f32(<8 x float> [[TMP9]], <8 x float>* [[TMP10]], i32 4, <8 x i1> ) +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[TMP11]], 31995 +; CHECK-NEXT: [[TMP12]] = extractelement <8 x float> [[TMP6]], i32 4 ; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_BODY3]], label [[FOR_END:%.*]] ; CHECK: for.end: ; CHECK-NEXT: ret void Index: test/Transforms/SLPVectorizer/X86/partail.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/partail.ll +++ test/Transforms/SLPVectorizer/X86/partail.ll @@ -18,26 +18,23 @@ ; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = icmp sgt <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> undef, i32 [[SHR15]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 undef, i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 undef, i32 2 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 undef, i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = icmp slt <4 x i32> [[TMP7]], undef -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP8]], <4 x i32> [[TMP7]], <4 x i32> undef -; CHECK-NEXT: [[TMP10:%.*]] = sext <4 x i32> [[TMP9]] to <4 x i64> -; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i64> [[TMP10]] to <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i32> [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = sext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP13]] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i32> [[TMP11]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = sext i32 [[TMP14]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i32> [[TMP11]], i32 2 -; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[TMP16]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x i32> [[TMP11]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = sext i32 [[TMP18]] to i64 -; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP19]] +; CHECK-NEXT: [[TMP4:%.*]] = select <4 x i1> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = icmp slt <4 x i32> [[TMP4]], undef +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP4]], <4 x i32> undef +; CHECK-NEXT: [[TMP7:%.*]] = sext <4 x i32> [[TMP6]] to <4 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = sext i32 [[TMP9]] to i64 +; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i32> [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_1:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i32> [[TMP8]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_2:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64 +; CHECK-NEXT: [[ARRAYIDX31_3:%.*]] = getelementptr inbounds i16*, i16** undef, i64 [[TMP16]] ; CHECK-NEXT: unreachable ; entry: Index: test/Transforms/SLPVectorizer/X86/phi.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/phi.ll +++ test/Transforms/SLPVectorizer/X86/phi.ll @@ -140,49 +140,52 @@ define float @foo3(float* nocapture readonly %A) #0 { ; CHECK-LABEL: @foo3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <4 x float>* -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE]], i32 3 +; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[A:%.*]] to <8 x float>* +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* [[TMP0]], i32 4, <8 x i1> , <8 x float> undef) +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x float> [[REORDER_SHUFFLE]], <8 x float> undef, <8 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[R_052:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi float [ [[TMP3]], [[ENTRY]] ], [ [[TMP11:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[TMP13:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP18:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP5]], 7.000000e+00 -; CHECK-NEXT: [[ADD6]] = fadd float [[R_052]], [[MUL]] -; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = load float, float* [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = phi <8 x float> [ [[SHUFFLE]], [[ENTRY]] ], [ [[TMP30:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x float> [[TMP2]], i32 6 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x float> [[TMP2]], i32 5 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 2 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP9:%.*]] = bitcast float* [[ARRAYIDX19]] to <2 x float>* -; CHECK-NEXT: [[TMP10:%.*]] = load <2 x float>, <2 x float>* [[TMP9]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP11]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> undef, float [[TMP11]], i32 0 -; CHECK-NEXT: [[TMP13]] = extractelement <2 x float> [[REORDER_SHUFFLE1]], i32 1 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x float> [[TMP12]], float [[TMP13]], i32 1 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x float> [[TMP14]], float [[TMP8]], i32 2 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <4 x float> [[TMP15]], float [[TMP4]], i32 3 -; CHECK-NEXT: [[TMP17:%.*]] = fmul <4 x float> [[TMP16]], -; CHECK-NEXT: [[TMP18]] = fadd <4 x float> [[TMP6]], [[TMP17]] -; CHECK-NEXT: [[TMP19:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP19]], 121 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX14]] to <4 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP6]], i32 4, <4 x i1> , <4 x float> undef) +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <4 x float> [[TMP7]], <4 x float> undef, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x float> undef, float [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE1]], i32 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x float> [[TMP9]], float [[TMP10]], i32 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[REORDER_SHUFFLE1]], i32 2 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x float> [[TMP11]], float [[TMP12]], i32 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x float> [[TMP13]], float [[TMP4]], i32 3 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x float> [[TMP14]], float [[TMP3]], i32 4 +; CHECK-NEXT: [[TMP16:%.*]] = fmul <8 x float> , [[TMP15]] +; CHECK-NEXT: [[TMP17:%.*]] = fadd <8 x float> [[TMP2]], [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP18]], 121 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x float> [[TMP17]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <8 x float> undef, float [[TMP19]], i32 0 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x float> [[TMP17]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <8 x float> [[TMP20]], float [[TMP21]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x float> [[TMP17]], i32 2 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <8 x float> [[TMP22]], float [[TMP23]], i32 2 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x float> [[TMP17]], i32 3 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <8 x float> [[TMP24]], float [[TMP25]], i32 3 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x float> [[TMP17]], i32 4 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <8 x float> [[TMP26]], float [[TMP27]], i32 4 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <8 x float> [[TMP28]], float [[TMP8]], i32 5 +; CHECK-NEXT: [[TMP30]] = insertelement <8 x float> [[TMP29]], float [[TMP10]], i32 6 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x float> [[TMP18]], i32 3 -; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[ADD6]], [[TMP20]] -; CHECK-NEXT: [[TMP21:%.*]] = extractelement <4 x float> [[TMP18]], i32 2 -; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x float> [[TMP18]], i32 1 -; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP22]] -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <4 x float> [[TMP18]], i32 0 -; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP23]] +; CHECK-NEXT: [[ADD28:%.*]] = fadd float [[TMP27]], [[TMP25]] +; CHECK-NEXT: [[ADD29:%.*]] = fadd float [[ADD28]], [[TMP23]] +; CHECK-NEXT: [[ADD30:%.*]] = fadd float [[ADD29]], [[TMP21]] +; CHECK-NEXT: [[ADD31:%.*]] = fadd float [[ADD30]], [[TMP19]] ; CHECK-NEXT: ret float [[ADD31]] ; entry: Index: test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll +++ test/Transforms/SLPVectorizer/X86/reorder_repeated_ops.ll @@ -11,33 +11,31 @@ ; CHECK-NEXT: ret void ; CHECK: bb2: ; CHECK-NEXT: [[TMP:%.*]] = select i1 undef, i16 undef, i16 15 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i16> undef, i16 [[TMP]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> [[TMP0]], i16 undef, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = sext <2 x i16> [[TMP1]] to <2 x i32> -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <2 x i32> , [[REORDER_SHUFFLE]] -; CHECK-NEXT: [[TMP4:%.*]] = sub <2 x i32> [[TMP3]], undef -; CHECK-NEXT: [[SHUFFLE8:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[SHUFFLE8]], +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> undef, i16 [[TMP]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = sext <4 x i16> [[TMP0]] to <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sub nsw <4 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP2]], undef +; CHECK-NEXT: [[SHUFFLE8:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[SHUFFLE8]], ; CHECK-NEXT: [[TMP11:%.*]] = icmp sgt i32 undef, undef ; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i32 undef, i32 undef ; CHECK-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[TMP12]], undef ; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i32 [[TMP12]], i32 undef ; CHECK-NEXT: [[TMP17:%.*]] = icmp sgt i32 [[TMP15]], undef -; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt <4 x i32> [[TMP5]], [[RDX_SHUF9]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP10]], <4 x i32> [[TMP5]], <4 x i32> [[RDX_SHUF9]] +; CHECK-NEXT: [[RDX_SHUF9:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP10:%.*]] = icmp sgt <4 x i32> [[TMP4]], [[RDX_SHUF9]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT11:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP10]], <4 x i32> [[TMP4]], <4 x i32> [[RDX_SHUF9]] ; CHECK-NEXT: [[RDX_SHUF12:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT11]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP13:%.*]] = icmp sgt <4 x i32> [[RDX_MINMAX_SELECT11]], [[RDX_SHUF12]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT14:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP13]], <4 x i32> [[RDX_MINMAX_SELECT11]], <4 x i32> [[RDX_SHUF12]] -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT14]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT14]], i32 0 ; CHECK-NEXT: [[TMP18:%.*]] = select i1 [[TMP17]], i32 [[TMP15]], i32 undef -; CHECK-NEXT: [[TMP19:%.*]] = select i1 undef, i32 [[TMP6]], i32 undef +; CHECK-NEXT: [[TMP19:%.*]] = select i1 undef, i32 [[TMP5]], i32 undef ; CHECK-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP19]], 63 -; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <2 x i32> undef, [[TMP2]] -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP7]], undef -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> undef, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP6:%.*]] = sub nsw <4 x i32> undef, [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i32> [[TMP6]], undef +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[SHUFFLE]], ; CHECK-NEXT: [[TMP26:%.*]] = icmp sgt i32 undef, undef ; CHECK-NEXT: [[TMP27:%.*]] = select i1 [[TMP26]], i32 undef, i32 undef ; CHECK-NEXT: [[TMP28:%.*]] = icmp sgt i32 [[TMP27]], undef @@ -53,23 +51,23 @@ ; CHECK-NEXT: [[TMP41:%.*]] = icmp sgt i32 undef, undef ; CHECK-NEXT: [[TMP42:%.*]] = select i1 [[TMP41]], i32 undef, i32 undef ; CHECK-NEXT: [[TMP43:%.*]] = icmp sgt i32 [[TMP42]], [[TMP39]] -; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP9]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i32> [[TMP9]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP9]], <4 x i32> [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i32> [[TMP8]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP8]], <4 x i32> [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> undef, <4 x i32> ; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = icmp slt <4 x i32> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] ; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x i32> [[RDX_MINMAX_SELECT]], <4 x i32> [[RDX_SHUF1]] -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[TMP10]], undef -; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP11]], i32 [[TMP10]], i32 undef -; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA]], undef -; CHECK-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA]], i32 undef -; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[OP_EXTRA4]], undef -; CHECK-NEXT: [[OP_EXTRA5:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA4]], i32 undef -; CHECK-NEXT: [[TMP14:%.*]] = icmp slt i32 [[OP_EXTRA5]], undef -; CHECK-NEXT: [[OP_EXTRA6:%.*]] = select i1 [[TMP14]], i32 [[OP_EXTRA5]], i32 undef -; CHECK-NEXT: [[TMP15:%.*]] = icmp slt i32 [[OP_EXTRA6]], undef -; CHECK-NEXT: [[OP_EXTRA7:%.*]] = select i1 [[TMP15]], i32 [[OP_EXTRA6]], i32 undef +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i32> [[RDX_MINMAX_SELECT3]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = icmp slt i32 [[TMP9]], undef +; CHECK-NEXT: [[OP_EXTRA:%.*]] = select i1 [[TMP10]], i32 [[TMP9]], i32 undef +; CHECK-NEXT: [[TMP11:%.*]] = icmp slt i32 [[OP_EXTRA]], undef +; CHECK-NEXT: [[OP_EXTRA4:%.*]] = select i1 [[TMP11]], i32 [[OP_EXTRA]], i32 undef +; CHECK-NEXT: [[TMP12:%.*]] = icmp slt i32 [[OP_EXTRA4]], undef +; CHECK-NEXT: [[OP_EXTRA5:%.*]] = select i1 [[TMP12]], i32 [[OP_EXTRA4]], i32 undef +; CHECK-NEXT: [[TMP13:%.*]] = icmp slt i32 [[OP_EXTRA5]], undef +; CHECK-NEXT: [[OP_EXTRA6:%.*]] = select i1 [[TMP13]], i32 [[OP_EXTRA5]], i32 undef +; CHECK-NEXT: [[TMP14:%.*]] = icmp slt i32 [[OP_EXTRA6]], undef +; CHECK-NEXT: [[OP_EXTRA7:%.*]] = select i1 [[TMP14]], i32 [[OP_EXTRA6]], i32 undef ; CHECK-NEXT: [[TMP44:%.*]] = select i1 [[TMP43]], i32 [[TMP39]], i32 [[TMP42]] ; CHECK-NEXT: [[TMP45:%.*]] = icmp sgt i32 undef, [[OP_EXTRA7]] ; CHECK-NEXT: unreachable Index: test/Transforms/SLPVectorizer/X86/resched.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/resched.ll +++ test/Transforms/SLPVectorizer/X86/resched.ll @@ -32,45 +32,43 @@ ; CHECK-NEXT: [[ARRAYIDX_I_I7_9_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 9 ; CHECK-NEXT: [[ARRAYIDX_I_I7_10_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 10 ; CHECK-NEXT: [[ARRAYIDX_I_I7_11_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 11 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> undef, i32 [[CONV31_I]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[CONV31_I]], i32 1 -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[CONV31_I]], i32 2 -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[CONV31_I]], i32 3 -; CHECK-NEXT: [[TMP14:%.*]] = lshr <4 x i32> [[TMP13]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_12_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 12 -; CHECK-NEXT: [[SHR_12_I_I:%.*]] = lshr i32 [[CONV31_I]], 13 ; CHECK-NEXT: [[ARRAYIDX_I_I7_13_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 13 -; CHECK-NEXT: [[SHR_13_I_I:%.*]] = lshr i32 [[CONV31_I]], 14 ; CHECK-NEXT: [[ARRAYIDX_I_I7_14_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 14 -; CHECK-NEXT: [[SHR_14_I_I:%.*]] = lshr i32 [[CONV31_I]], 15 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i32> [[TMP15]], i32 [[TMP16]], i32 1 -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1 -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x i32> [[TMP17]], i32 [[TMP18]], i32 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2 -; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x i32> [[TMP19]], i32 [[TMP20]], i32 3 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3 -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x i32> [[TMP21]], i32 [[TMP22]], i32 4 -; CHECK-NEXT: [[TMP24:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4 -; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x i32> [[TMP23]], i32 [[TMP24]], i32 5 -; CHECK-NEXT: [[TMP26:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5 -; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i32> [[TMP25]], i32 [[TMP26]], i32 6 -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6 -; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x i32> [[TMP27]], i32 [[TMP28]], i32 7 -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7 -; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x i32> [[TMP29]], i32 [[TMP30]], i32 8 -; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP14]], i32 0 -; CHECK-NEXT: [[TMP33:%.*]] = insertelement <16 x i32> [[TMP31]], i32 [[TMP32]], i32 9 -; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x i32> [[TMP14]], i32 1 -; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i32> [[TMP33]], i32 [[TMP34]], i32 10 -; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x i32> [[TMP14]], i32 2 -; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i32> [[TMP35]], i32 [[TMP36]], i32 11 -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x i32> [[TMP14]], i32 3 -; CHECK-NEXT: [[TMP39:%.*]] = insertelement <16 x i32> [[TMP37]], i32 [[TMP38]], i32 12 -; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP39]], i32 [[SHR_12_I_I]], i32 13 -; CHECK-NEXT: [[TMP41:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[SHR_13_I_I]], i32 14 -; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP41]], i32 [[SHR_14_I_I]], i32 15 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> undef, i32 [[CONV31_I]], i32 0 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = lshr <8 x i32> [[SHUFFLE]], +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i32> undef, i32 [[SUB_I]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <8 x i32> [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i32> [[TMP12]], i32 [[TMP13]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i32> [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x i32> [[TMP14]], i32 [[TMP15]], i32 2 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i32> [[TMP9]], i32 2 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x i32> [[TMP16]], i32 [[TMP17]], i32 3 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i32> [[TMP9]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x i32> [[TMP18]], i32 [[TMP19]], i32 4 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i32> [[TMP9]], i32 4 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i32> [[TMP20]], i32 [[TMP21]], i32 5 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <8 x i32> [[TMP9]], i32 5 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x i32> [[TMP22]], i32 [[TMP23]], i32 6 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <8 x i32> [[TMP9]], i32 6 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x i32> [[TMP24]], i32 [[TMP25]], i32 7 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <8 x i32> [[TMP9]], i32 7 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x i32> [[TMP26]], i32 [[TMP27]], i32 8 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <8 x i32> [[TMP11]], i32 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x i32> [[TMP28]], i32 [[TMP29]], i32 9 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <8 x i32> [[TMP11]], i32 1 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i32> [[TMP30]], i32 [[TMP31]], i32 10 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <8 x i32> [[TMP11]], i32 2 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <16 x i32> [[TMP32]], i32 [[TMP33]], i32 11 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <8 x i32> [[TMP11]], i32 3 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i32> [[TMP34]], i32 [[TMP35]], i32 12 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <8 x i32> [[TMP11]], i32 4 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <16 x i32> [[TMP36]], i32 [[TMP37]], i32 13 +; CHECK-NEXT: [[TMP39:%.*]] = extractelement <8 x i32> [[TMP11]], i32 5 +; CHECK-NEXT: [[TMP40:%.*]] = insertelement <16 x i32> [[TMP38]], i32 [[TMP39]], i32 14 +; CHECK-NEXT: [[TMP41:%.*]] = extractelement <8 x i32> [[TMP11]], i32 6 +; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i32> [[TMP40]], i32 [[TMP41]], i32 15 ; CHECK-NEXT: [[TMP43:%.*]] = trunc <16 x i32> [[TMP42]] to <16 x i8> ; CHECK-NEXT: [[TMP44:%.*]] = and <16 x i8> [[TMP43]], ; CHECK-NEXT: [[ARRAYIDX_I_I7_15_I_I:%.*]] = getelementptr inbounds %"struct.std::array", %"struct.std::array"* undef, i64 0, i32 0, i64 15 Index: test/Transforms/SLPVectorizer/X86/rgb_phi.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/rgb_phi.ll +++ test/Transforms/SLPVectorizer/X86/rgb_phi.ll @@ -25,39 +25,37 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load float, float* [[A:%.*]], align 4 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, float* [[A]], i64 1 -; CHECK-NEXT: [[TMP1:%.*]] = load float, float* [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 2 -; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[ARRAYIDX1]] to <2 x float>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4 +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> undef, <2 x i32> ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[TMP3:%.*]] = phi float [ [[TMP0]], [[ENTRY:%.*]] ], [ [[DOTPRE:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE:%.*]] ] ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[B_032:%.*]] = phi float [ [[TMP2]], [[ENTRY]] ], [ [[ADD14:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] -; CHECK-NEXT: [[G_031:%.*]] = phi float [ [[TMP1]], [[ENTRY]] ], [ [[ADD9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[R_030:%.*]] = phi float [ [[TMP0]], [[ENTRY]] ], [ [[ADD4:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x float> [ [[REORDER_SHUFFLE]], [[ENTRY]] ], [ [[TMP9:%.*]], [[FOR_BODY_FOR_BODY_CRIT_EDGE]] ] ; CHECK-NEXT: [[MUL:%.*]] = fmul float [[TMP3]], 7.000000e+00 ; CHECK-NEXT: [[ADD4]] = fadd float [[R_030]], [[MUL]] -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = load float, float* [[ARRAYIDX7]], align 4 -; CHECK-NEXT: [[MUL8:%.*]] = fmul float [[TMP5]], 8.000000e+00 -; CHECK-NEXT: [[ADD9]] = fadd float [[G_031]], [[MUL8]] -; CHECK-NEXT: [[TMP6:%.*]] = add nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = load float, float* [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[MUL13:%.*]] = fmul float [[TMP7]], 9.000000e+00 -; CHECK-NEXT: [[ADD14]] = fadd float [[B_032]], [[MUL13]] +; CHECK-NEXT: [[TMP5:%.*]] = add nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = bitcast float* [[ARRAYIDX7]] to <2 x float>* +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, <2 x float>* [[TMP6]], align 4 +; CHECK-NEXT: [[REORDER_SHUFFLE1:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> undef, <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[REORDER_SHUFFLE1]], +; CHECK-NEXT: [[TMP9]] = fadd <2 x float> [[TMP4]], [[TMP8]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 3 -; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP8]], 121 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP10]], 121 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_FOR_BODY_CRIT_EDGE]], label [[FOR_END:%.*]] ; CHECK: for.body.for.body_crit_edge: ; CHECK-NEXT: [[ARRAYIDX3_PHI_TRANS_INSERT:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV_NEXT]] ; CHECK-NEXT: [[DOTPRE]] = load float, float* [[ARRAYIDX3_PHI_TRANS_INSERT]], align 4 ; CHECK-NEXT: br label [[FOR_BODY]] ; CHECK: for.end: -; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[ADD9]] -; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[ADD14]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x float> [[TMP9]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd float [[ADD4]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x float> [[TMP9]], i32 0 +; CHECK-NEXT: [[ADD17:%.*]] = fadd float [[ADD16]], [[TMP12]] ; CHECK-NEXT: ret float [[ADD17]] ; entry: Index: test/Transforms/SLPVectorizer/X86/schedule-bundle.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/schedule-bundle.ll +++ test/Transforms/SLPVectorizer/X86/schedule-bundle.ll @@ -10,18 +10,10 @@ define i32 @slp_schedule_bundle() local_unnamed_addr #0 { ; CHECK-LABEL: @slp_schedule_bundle( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([1 x i32]* @b to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP1:%.*]] = lshr <4 x i32> [[TMP0]], -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> [[TMP1]], -; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([1 x i32]* @a to <4 x i32>*), align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 4, i64 0), align 4 -; CHECK-NEXT: [[DOTLOBIT_4:%.*]] = lshr i32 [[TMP3]], 31 -; CHECK-NEXT: [[DOTLOBIT_NOT_4:%.*]] = xor i32 [[DOTLOBIT_4]], 1 -; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_4]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 4, i64 0), align 4 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* getelementptr ([1 x i32], [1 x i32]* @b, i64 5, i64 0), align 4 -; CHECK-NEXT: [[DOTLOBIT_5:%.*]] = lshr i32 [[TMP4]], 31 -; CHECK-NEXT: [[DOTLOBIT_NOT_5:%.*]] = xor i32 [[DOTLOBIT_5]], 1 -; CHECK-NEXT: store i32 [[DOTLOBIT_NOT_5]], i32* getelementptr ([1 x i32], [1 x i32]* @a, i64 5, i64 0), align 4 +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* bitcast ([1 x i32]* @b to <8 x i32>*), i32 4, <8 x i1> , <8 x i32> undef) +; CHECK-NEXT: [[TMP1:%.*]] = lshr <8 x i32> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = xor <8 x i32> , [[TMP1]] +; CHECK-NEXT: call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP2]], <8 x i32>* bitcast ([1 x i32]* @a to <8 x i32>*), i32 4, <8 x i1> ) ; CHECK-NEXT: ret i32 undef ; entry: Index: test/Transforms/SLPVectorizer/X86/sext.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/sext.ll +++ test/Transforms/SLPVectorizer/X86/sext.ll @@ -166,17 +166,18 @@ ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 -; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[X2:%.*]] = sext i8 [[I2]] to i64 -; AVX1-NEXT: [[X3:%.*]] = sext i8 [[I3]] to i64 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2]] to <2 x i8>* +; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* [[TMP3]], align 1 +; AVX1-NEXT: [[TMP5:%.*]] = sext <2 x i8> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[TMP6:%.*]] = sext <2 x i8> [[TMP4]] to <2 x i64> +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 ; AVX1-NEXT: ret <4 x i64> [[V3]] ; ; AVX2-LABEL: @loadext_4i8_to_4i64( @@ -604,17 +605,18 @@ ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 -; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[X2:%.*]] = sext i16 [[I2]] to i64 -; AVX1-NEXT: [[X3:%.*]] = sext i16 [[I3]] to i64 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i16* [[P2]] to <2 x i16>* +; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* [[TMP3]], align 1 +; AVX1-NEXT: [[TMP5:%.*]] = sext <2 x i16> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[TMP6:%.*]] = sext <2 x i16> [[TMP4]] to <2 x i64> +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 ; AVX1-NEXT: ret <4 x i64> [[V3]] ; ; AVX2-LABEL: @loadext_4i16_to_4i64( @@ -822,17 +824,18 @@ ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 -; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[X2:%.*]] = sext i32 [[I2]] to i64 -; AVX1-NEXT: [[X3:%.*]] = sext i32 [[I3]] to i64 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2]] to <2 x i32>* +; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 1 +; AVX1-NEXT: [[TMP5:%.*]] = sext <2 x i32> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[TMP6:%.*]] = sext <2 x i32> [[TMP4]] to <2 x i64> +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 ; AVX1-NEXT: ret <4 x i64> [[V3]] ; ; AVX2-LABEL: @loadext_4i32_to_4i64( Index: test/Transforms/SLPVectorizer/X86/slp-throttle.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/slp-throttle.ll +++ test/Transforms/SLPVectorizer/X86/slp-throttle.ll @@ -5,18 +5,20 @@ ; CHECK-LABEL: @rftbsub( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, double* [[A:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load double, double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = or i64 2, 1 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP2:%.*]] = load double, double* [[ARRAYIDX12]], align 8 -; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP2]], undef +; CHECK-NEXT: [[TMP0:%.*]] = or i64 2, 1 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, <2 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; CHECK-NEXT: [[ADD16:%.*]] = fadd double [[TMP3]], undef ; CHECK-NEXT: [[MUL18:%.*]] = fmul double undef, [[ADD16]] -; CHECK-NEXT: [[ADD19:%.*]] = fadd double undef, [[MUL18]] -; CHECK-NEXT: [[SUB22:%.*]] = fsub double undef, undef -; CHECK-NEXT: [[SUB25:%.*]] = fsub double [[TMP0]], [[ADD19]] -; CHECK-NEXT: store double [[SUB25]], double* [[ARRAYIDX6]], align 8 -; CHECK-NEXT: [[SUB29:%.*]] = fsub double [[TMP2]], [[SUB22]] -; CHECK-NEXT: store double [[SUB29]], double* [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> undef, double [[MUL18]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> undef, [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = fsub <2 x double> undef, [[TMP4]] +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x double> [[TMP2]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = bitcast double* [[ARRAYIDX6]] to <2 x double>* +; CHECK-NEXT: store <2 x double> [[TMP8]], <2 x double>* [[TMP9]], align 8 ; CHECK-NEXT: unreachable ; entry: Index: test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll +++ test/Transforms/SLPVectorizer/X86/vectorize-reorder-reuse.ll @@ -7,10 +7,11 @@ ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <2 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <2 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[REORDER_SHUFFLE]], <2 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> undef, <8 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[LOAD_EXTEND]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[REORDER_SHUFFLE]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A2:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 @@ -81,12 +82,13 @@ ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 3 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[LOAD_EXTEND]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[REORDER_SHUFFLE]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A4:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A2:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A3:%.*]], i32 3 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 @@ -159,12 +161,13 @@ ; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[ARR]], i64 1 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[ARR]] to <4 x i32>* ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 -; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <4 x i32> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[REORDER_SHUFFLE]], <4 x i32> undef, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A1:%.*]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A2:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A3:%.*]], i32 2 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A4:%.*]], i32 3 +; CHECK-NEXT: [[LOAD_EXTEND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> undef, <8 x i32> +; CHECK-NEXT: [[REORDER_SHUFFLE:%.*]] = shufflevector <8 x i32> [[LOAD_EXTEND]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[REORDER_SHUFFLE]], <8 x i32> undef, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[A3:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[A4:%.*]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A2:%.*]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[A1:%.*]], i32 3 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[A5:%.*]], i32 4 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[A6:%.*]], i32 5 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[A7:%.*]], i32 6 Index: test/Transforms/SLPVectorizer/X86/zext.ll =================================================================== --- test/Transforms/SLPVectorizer/X86/zext.ll +++ test/Transforms/SLPVectorizer/X86/zext.ll @@ -131,17 +131,18 @@ ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i8, i8* [[P0]], i64 3 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i8* [[P0]] to <2 x i8>* ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i8>, <2 x i8>* [[TMP1]], align 1 -; AVX1-NEXT: [[I2:%.*]] = load i8, i8* [[P2]], align 1 -; AVX1-NEXT: [[I3:%.*]] = load i8, i8* [[P3]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[X2:%.*]] = zext i8 [[I2]] to i64 -; AVX1-NEXT: [[X3:%.*]] = zext i8 [[I3]] to i64 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2]] to <2 x i8>* +; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i8>, <2 x i8>* [[TMP3]], align 1 +; AVX1-NEXT: [[TMP5:%.*]] = zext <2 x i8> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[TMP6:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i64> +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 ; AVX1-NEXT: ret <4 x i64> [[V3]] ; ; AVX2-LABEL: @loadext_4i8_to_4i64( @@ -569,17 +570,18 @@ ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i16, i16* [[P0]], i64 3 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i16* [[P0]] to <2 x i16>* ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i16>, <2 x i16>* [[TMP1]], align 1 -; AVX1-NEXT: [[I2:%.*]] = load i16, i16* [[P2]], align 1 -; AVX1-NEXT: [[I3:%.*]] = load i16, i16* [[P3]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[X2:%.*]] = zext i16 [[I2]] to i64 -; AVX1-NEXT: [[X3:%.*]] = zext i16 [[I3]] to i64 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i16* [[P2]] to <2 x i16>* +; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i16>, <2 x i16>* [[TMP3]], align 1 +; AVX1-NEXT: [[TMP5:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[TMP6:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64> +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 ; AVX1-NEXT: ret <4 x i64> [[V3]] ; ; AVX2-LABEL: @loadext_4i16_to_4i64( @@ -787,17 +789,18 @@ ; AVX1-NEXT: [[P3:%.*]] = getelementptr inbounds i32, i32* [[P0]], i64 3 ; AVX1-NEXT: [[TMP1:%.*]] = bitcast i32* [[P0]] to <2 x i32>* ; AVX1-NEXT: [[TMP2:%.*]] = load <2 x i32>, <2 x i32>* [[TMP1]], align 1 -; AVX1-NEXT: [[I2:%.*]] = load i32, i32* [[P2]], align 1 -; AVX1-NEXT: [[I3:%.*]] = load i32, i32* [[P3]], align 1 -; AVX1-NEXT: [[TMP3:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> -; AVX1-NEXT: [[X2:%.*]] = zext i32 [[I2]] to i64 -; AVX1-NEXT: [[X3:%.*]] = zext i32 [[I3]] to i64 -; AVX1-NEXT: [[TMP4:%.*]] = extractelement <2 x i64> [[TMP3]], i32 0 -; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP4]], i32 0 -; AVX1-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP3]], i32 1 -; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP5]], i32 1 -; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[X2]], i32 2 -; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[X3]], i32 3 +; AVX1-NEXT: [[TMP3:%.*]] = bitcast i32* [[P2]] to <2 x i32>* +; AVX1-NEXT: [[TMP4:%.*]] = load <2 x i32>, <2 x i32>* [[TMP3]], align 1 +; AVX1-NEXT: [[TMP5:%.*]] = zext <2 x i32> [[TMP2]] to <2 x i64> +; AVX1-NEXT: [[TMP6:%.*]] = zext <2 x i32> [[TMP4]] to <2 x i64> +; AVX1-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[TMP5]], i32 0 +; AVX1-NEXT: [[V0:%.*]] = insertelement <4 x i64> undef, i64 [[TMP7]], i32 0 +; AVX1-NEXT: [[TMP8:%.*]] = extractelement <2 x i64> [[TMP5]], i32 1 +; AVX1-NEXT: [[V1:%.*]] = insertelement <4 x i64> [[V0]], i64 [[TMP8]], i32 1 +; AVX1-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; AVX1-NEXT: [[V2:%.*]] = insertelement <4 x i64> [[V1]], i64 [[TMP9]], i32 2 +; AVX1-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 +; AVX1-NEXT: [[V3:%.*]] = insertelement <4 x i64> [[V2]], i64 [[TMP10]], i32 3 ; AVX1-NEXT: ret <4 x i64> [[V3]] ; ; AVX2-LABEL: @loadext_4i32_to_4i64(